1 
2 #include <NTL/ZZ.h>
3 #include <NTL/vec_ZZ.h>
4 #include <NTL/Lazy.h>
5 #include <NTL/fileio.h>
6 #include <NTL/SmartPtr.h>
7 
8 #include <NTL/BasicThreadPool.h>
9 
10 
11 
12 #if defined(NTL_HAVE_AVX2)
13 #include <immintrin.h>
14 #elif defined(NTL_HAVE_SSSE3)
15 #include <emmintrin.h>
16 #include <tmmintrin.h>
17 #endif
18 
19 #if defined(NTL_HAVE_KMA)
20 #include <NTL/linux_s390x.h>
21 #endif
22 
23 
24 
25 NTL_START_IMPL
26 
27 
28 
29 
30 
zero()31 const ZZ& ZZ::zero()
32 {
33 
34    static const ZZ z; // GLOBAL (relies on C++11 thread-safe init)
35    return z;
36 }
37 
38 
ZZ_expo(long e)39 const ZZ& ZZ_expo(long e)
40 {
41    NTL_TLS_LOCAL(ZZ, expo_helper);
42 
43    conv(expo_helper, e);
44    return expo_helper;
45 }
46 
47 
48 
AddMod(ZZ & x,const ZZ & a,long b,const ZZ & n)49 void AddMod(ZZ& x, const ZZ& a, long b, const ZZ& n)
50 {
51    NTL_ZZRegister(B);
52    conv(B, b);
53    AddMod(x, a, B, n);
54 }
55 
56 
SubMod(ZZ & x,const ZZ & a,long b,const ZZ & n)57 void SubMod(ZZ& x, const ZZ& a, long b, const ZZ& n)
58 {
59    NTL_ZZRegister(B);
60    conv(B, b);
61    SubMod(x, a, B, n);
62 }
63 
SubMod(ZZ & x,long a,const ZZ & b,const ZZ & n)64 void SubMod(ZZ& x, long a, const ZZ& b, const ZZ& n)
65 {
66    NTL_ZZRegister(A);
67    conv(A, a);
68    SubMod(x, A, b, n);
69 }
70 
71 
72 
73 // ****** input and output
74 
75 
76 static NTL_CHEAP_THREAD_LOCAL long iodigits = 0;
77 static NTL_CHEAP_THREAD_LOCAL long ioradix = 0;
78 // iodigits is the greatest integer such that 10^{iodigits} < NTL_WSP_BOUND
79 // ioradix = 10^{iodigits}
80 
InitZZIO()81 static void InitZZIO()
82 {
83    long x;
84 
85    x = (NTL_WSP_BOUND-1)/10;
86    iodigits = 0;
87    ioradix = 1;
88 
89    while (x) {
90       x = x / 10;
91       iodigits++;
92       ioradix = ioradix * 10;
93    }
94 
95    if (iodigits <= 0) TerminalError("problem with I/O");
96 }
97 
98 
operator >>(istream & s,ZZ & x)99 istream& operator>>(istream& s, ZZ& x)
100 {
101    long c;
102    long cval;
103    long sign;
104    long ndigits;
105    long acc;
106    NTL_ZZRegister(a);
107 
108    if (!s) NTL_INPUT_ERROR(s, "bad ZZ input");
109 
110    if (!iodigits) InitZZIO();
111 
112    a = 0;
113 
114    SkipWhiteSpace(s);
115    c = s.peek();
116 
117    if (c == '-') {
118       sign = -1;
119       s.get();
120       c = s.peek();
121    }
122    else
123       sign = 1;
124 
125    cval = CharToIntVal(c);
126 
127    if (cval < 0 || cval > 9) NTL_INPUT_ERROR(s, "bad ZZ input");
128 
129    ndigits = 0;
130    acc = 0;
131    while (cval >= 0 && cval <= 9) {
132       acc = acc*10 + cval;
133       ndigits++;
134 
135       if (ndigits == iodigits) {
136          mul(a, a, ioradix);
137          add(a, a, acc);
138          ndigits = 0;
139          acc = 0;
140       }
141 
142       s.get();
143       c = s.peek();
144       cval = CharToIntVal(c);
145    }
146 
147    if (ndigits != 0) {
148       long mpy = 1;
149       while (ndigits > 0) {
150          mpy = mpy * 10;
151          ndigits--;
152       }
153 
154       mul(a, a, mpy);
155       add(a, a, acc);
156    }
157 
158    if (sign == -1)
159       negate(a, a);
160 
161    x = a;
162    return s;
163 }
164 
165 
166 // The class _ZZ_local_stack should be defined in an empty namespace,
167 // but since I don't want to rely on namespaces, we just give it a funny
168 // name to avoid accidental name clashes.
169 
170 struct _ZZ_local_stack {
171    long top;
172    Vec<long> data;
173 
_ZZ_local_stack_ZZ_local_stack174    _ZZ_local_stack() { top = -1; }
175 
pop_ZZ_local_stack176    long pop() { return data[top--]; }
empty_ZZ_local_stack177    long empty() { return (top == -1); }
178    void push(long x);
179 };
180 
push(long x)181 void _ZZ_local_stack::push(long x)
182 {
183    if (top+1 >= data.length())
184       data.SetLength(max(32, long(1.414*data.length())));
185 
186    top++;
187    data[top] = x;
188 }
189 
190 
191 static
PrintDigits(ostream & s,long d,long justify)192 void PrintDigits(ostream& s, long d, long justify)
193 {
194    NTL_TLS_LOCAL_INIT(Vec<char>, buf, (INIT_SIZE, iodigits));
195 
196    long i = 0;
197 
198    while (d) {
199       buf[i] = IntValToChar(d % 10);
200       d = d / 10;
201       i++;
202    }
203 
204    if (justify) {
205       long j = iodigits - i;
206       while (j > 0) {
207          s << "0";
208          j--;
209       }
210    }
211 
212    while (i > 0) {
213       i--;
214       s << buf[i];
215    }
216 }
217 
218 
219 
220 
operator <<(ostream & s,const ZZ & a)221 ostream& operator<<(ostream& s, const ZZ& a)
222 {
223    ZZ b;
224    _ZZ_local_stack S;
225    long r;
226    long k;
227 
228    if (!iodigits) InitZZIO();
229 
230    b = a;
231 
232    k = sign(b);
233 
234    if (k == 0) {
235       s << "0";
236       return s;
237    }
238 
239    if (k < 0) {
240       s << "-";
241       negate(b, b);
242    }
243 
244    do {
245       r = DivRem(b, b, ioradix);
246       S.push(r);
247    } while (!IsZero(b));
248 
249    r = S.pop();
250    PrintDigits(s, r, 0);
251 
252    while (!S.empty()) {
253       r = S.pop();
254       PrintDigits(s, r, 1);
255    }
256 
257    return s;
258 }
259 
260 
261 
GCD(long a,long b)262 long GCD(long a, long b)
263 {
264    long u, v, t, x;
265 
266    if (a < 0) {
267       if (a < -NTL_MAX_LONG) ResourceError("GCD: integer overflow");
268       a = -a;
269    }
270 
271    if (b < 0) {
272       if (b < -NTL_MAX_LONG) ResourceError("GCD: integer overflow");
273       b = -b;
274    }
275 
276 
277    if (b==0)
278       x = a;
279    else {
280       u = a;
281       v = b;
282       do {
283          t = u % v;
284          u = v;
285          v = t;
286       } while (v != 0);
287 
288       x = u;
289    }
290 
291    return x;
292 }
293 
294 
295 
XGCD(long & d,long & s,long & t,long a,long b)296 void XGCD(long& d, long& s, long& t, long a, long b)
297 {
298    long  u, v, u0, v0, u1, v1, u2, v2, q, r;
299 
300    long aneg = 0, bneg = 0;
301 
302    if (a < 0) {
303       if (a < -NTL_MAX_LONG) ResourceError("XGCD: integer overflow");
304       a = -a;
305       aneg = 1;
306    }
307 
308    if (b < 0) {
309       if (b < -NTL_MAX_LONG) ResourceError("XGCD: integer overflow");
310       b = -b;
311       bneg = 1;
312    }
313 
314    u1=1; v1=0;
315    u2=0; v2=1;
316    u = a; v = b;
317 
318    while (v != 0) {
319       q = u / v;
320       r = u % v;
321       u = v;
322       v = r;
323       u0 = u2;
324       v0 = v2;
325       u2 =  u1 - q*u2;
326       v2 = v1- q*v2;
327       u1 = u0;
328       v1 = v0;
329    }
330 
331    if (aneg)
332       u1 = -u1;
333 
334    if (bneg)
335       v1 = -v1;
336 
337    d = u;
338    s = u1;
339    t = v1;
340 }
341 
InvModStatus(long & x,long a,long n)342 long InvModStatus(long& x, long a, long n)
343 {
344    long d, s, t;
345 
346    XGCD(d, s, t, a, n);
347    if (d != 1) {
348       x = d;
349       return 1;
350    }
351    else {
352       if (s < 0)
353          x = s + n;
354       else
355          x = s;
356 
357       return 0;
358    }
359 }
360 
InvMod(long a,long n)361 long InvMod(long a, long n)
362 {
363    long d, s, t;
364 
365    XGCD(d, s, t, a, n);
366    if (d != 1) {
367       InvModError("InvMod: inverse undefined");
368    }
369    if (s < 0)
370       return s + n;
371    else
372       return s;
373 }
374 
375 
PowerMod(long a,long ee,long n)376 long PowerMod(long a, long ee, long n)
377 {
378    long x, y;
379 
380    unsigned long e;
381 
382    if (ee < 0)
383       e = - ((unsigned long) ee);
384    else
385       e = ee;
386 
387    x = 1;
388    y = a;
389    while (e) {
390       if (e & 1) x = MulMod(x, y, n);
391       y = MulMod(y, y, n);
392       e = e >> 1;
393    }
394 
395    if (ee < 0) x = InvMod(x, n);
396 
397    return x;
398 }
399 
400 static
MillerWitness_sp(long n,long x)401 long MillerWitness_sp(long n, long x)
402 {
403    long m, y, z;
404    long j, k;
405 
406    if (x == 0) return 0;
407 
408    m = n - 1;
409    k = 0;
410    while((m & 1) == 0) {
411       m = m >> 1;
412       k++;
413    }
414    // n - 1 == 2^k * m, m odd
415 
416    z = PowerMod(x, m, n);
417    if (z == 1) return 0;
418 
419    j = 0;
420    do {
421       y = z;
422       z = MulMod(y, y, n);
423       j++;
424    } while (j != k && z != 1);
425 
426    if (z != 1 || y != n-1) return 1;
427    return 0;
428 }
429 
ProbPrime(long n,long NumTrials)430 long ProbPrime(long n, long NumTrials)
431 {
432    if (NumTrials < 0) NumTrials = 0;
433 
434    long m, x, y, z;
435    long i, j, k;
436 
437    if (n <= 1) return 0;
438 
439 
440    if (n == 2) return 1;
441    if (n % 2 == 0) return 0;
442 
443    if (n == 3) return 1;
444    if (n % 3 == 0) return 0;
445 
446    if (n == 5) return 1;
447    if (n % 5 == 0) return 0;
448 
449    if (n == 7) return 1;
450    if (n % 7 == 0) return 0;
451 
452    if (n == 11) return 1;
453    if (n % 11 == 0) return 0;
454 
455    if (n == 13) return 1;
456    if (n % 13 == 0) return 0;
457 
458    if (n >= NTL_SP_BOUND) {
459       return ProbPrime(to_ZZ(n), NumTrials);
460    }
461 
462    m = n - 1;
463    k = 0;
464    while((m & 1) == 0) {
465       m = m >> 1;
466       k++;
467    }
468 
469    // n - 1 == 2^k * m, m odd
470 
471    for (i = 0; i < NumTrials+1; i++) {
472       // for consistency with the multi-precision version,
473       // we first see if 2 is a witness, so we really do
474       // NumTrials+1 tests
475 
476       if (i == 0)
477          x = 2;
478       else {
479 	 do {
480 	    x = RandomBnd(n);
481 	 } while (x == 0);
482          // x == 0 is not a useful candidate for a witness!
483       }
484 
485       z = PowerMod(x, m, n);
486       if (z == 1) continue;
487 
488       j = 0;
489       do {
490          y = z;
491          z = MulMod(y, y, n);
492          j++;
493       } while (j != k && z != 1);
494 
495       if (z != 1 || y !=  n-1) return 0;
496    }
497 
498    return 1;
499 }
500 
501 
MillerWitness(const ZZ & n,const ZZ & x)502 long MillerWitness(const ZZ& n, const ZZ& x)
503 {
504    if (n.SinglePrecision()) {
505       return MillerWitness_sp(to_long(n), to_long(x));
506    }
507 
508    ZZ m, y, z;
509 
510    long j, k;
511 
512    if (x == 0) return 0;
513 
514    add(m, n, -1);
515    k = MakeOdd(m);
516    // n - 1 == 2^k * m, m odd
517 
518    PowerMod(z, x, m, n);
519    if (z == 1) return 0;
520 
521    j = 0;
522    do {
523       y = z;
524       SqrMod(z, y, n);
525       j++;
526    } while (j != k && z != 1);
527 
528    if (z != 1) return 1;
529    add(y, y, 1);
530    if (y != n) return 1;
531    return 0;
532 }
533 
534 
535 // ComputePrimeBound computes a reasonable bound for trial
536 // division in the Miller-Rabin test.
537 // It is computed a bit on the "low" side, since being a bit
538 // low doesn't hurt much, but being too high can hurt a lot.
539 
540 // See the paper "Fast generation of prime numbers and secure
541 // public-key cryptographic parameters" by Ueli Maurer.
542 // In that paper, it is calculated that the optimal bound in
543 // roughly T_exp/T_div, where T_exp is the time for an exponentiation
544 // and T_div is is the time for a single precision division.
545 // Of course, estimating these times is a bit tricky, and
546 // the values we use are based on experimentation, assuming
547 // GMP is being used.  I've tested this on various bit lengths
548 // up to 16,000, and they seem to be pretty close to optimal.
549 
550 static
ComputePrimeBound(long bn)551 long ComputePrimeBound(long bn)
552 {
553    long wn = (bn+NTL_ZZ_NBITS-1)/NTL_ZZ_NBITS;
554 
555    long fn;
556 
557    if (wn <= 36)
558       fn = wn/4 + 1;
559    else
560       fn = long(1.67*sqrt(double(wn)));
561 
562    long prime_bnd;
563 
564    if (NumBits(bn) + NumBits(fn) > NTL_SP_NBITS)
565       prime_bnd = NTL_SP_BOUND;
566    else
567       prime_bnd = bn*fn;
568 
569    return prime_bnd;
570 
571 
572 }
573 
574 
ProbPrime(const ZZ & n,long NumTrials)575 long ProbPrime(const ZZ& n, long NumTrials)
576 {
577    if (NumTrials < 0) NumTrials = 0;
578 
579    if (n <= 1) return 0;
580 
581    if (n.SinglePrecision()) {
582       return ProbPrime(to_long(n), NumTrials);
583    }
584 
585 
586    long prime_bnd = ComputePrimeBound(NumBits(n));
587 
588 
589    PrimeSeq s;
590    long p;
591 
592    p = s.next();
593    while (p && p < prime_bnd) {
594       if (rem(n, p) == 0)
595          return 0;
596 
597       p = s.next();
598    }
599 
600    ZZ W;
601    W = 2;
602 
603    // first try W == 2....the exponentiation
604    // algorithm runs slightly faster in this case
605 
606    if (MillerWitness(n, W))
607       return 0;
608 
609 
610    long i;
611 
612    for (i = 0; i < NumTrials; i++) {
613       do {
614          RandomBnd(W, n);
615       } while (W == 0);
616       // W == 0 is not a useful candidate for a witness!
617 
618       if (MillerWitness(n, W))
619          return 0;
620    }
621 
622    return 1;
623 }
624 
625 
626 static
MultiThreadedRandomPrime(ZZ & n,long l,long NumTrials)627 void MultiThreadedRandomPrime(ZZ& n, long l, long NumTrials)
628 {
629 
630    long nt = AvailableThreads();
631 
632 
633 
634    const long LOCAL_ITER_BOUND = 8;
635    // since resetting the PRG comes at a certain cost,
636    // we perform a few iterations with each reset to
637    // amortize the reset cost.
638 
639    unsigned long initial_counter = 0;
640    ZZ seed;
641    RandomBits(seed, 256);
642 
643    for (;;) {
644 
645       AtomicLowWaterMark low_water_mark(-1UL);
646       AtomicCounter counter(initial_counter);
647 
648       Vec< UniquePtr<ZZ> > result(INIT_SIZE, nt);
649       Vec<unsigned long> result_ctr(INIT_SIZE, nt, -1UL);
650 
651       NTL_EXEC_INDEX(nt, index)
652 
653          RandomStreamPush push;
654 
655 	 SetSeed(seed);
656 	 RandomStream& stream = GetCurrentRandomStream();
657 
658 	 ZZ cand;
659 
660 	 while (low_water_mark == -1UL) {
661 
662 	    unsigned long local_ctr = counter.inc();
663             if (local_ctr >> (NTL_BITS_PER_NONCE-1)) {
664                // counter overflow...rather academic
665                break;
666             }
667 
668 	    stream.set_nonce(local_ctr);
669 
670 	    for (long iter = 0; iter < LOCAL_ITER_BOUND &&
671 				local_ctr <= low_water_mark; iter++) {
672 
673 	       RandomLen(cand, l);
674 	       if (!IsOdd(cand)) add(cand, cand, 1);
675 
676 	       if (ProbPrime(cand, 0)) {
677 		  result[index].make(cand);
678 		  result_ctr[index] = local_ctr;
679 		  low_water_mark.UpdateMin(local_ctr);
680 		  break;
681 	       }
682 	    }
683 	 }
684 
685       NTL_EXEC_INDEX_END
686 
687       // find index of low_water_mark
688 
689       unsigned long low_water_mark1 = low_water_mark;
690       long low_water_index = -1;
691 
692       for (long index = 0; index < nt; index++) {
693 	 if (result_ctr[index] == low_water_mark1) {
694 	    low_water_index = index;
695 	    break;
696 	 }
697       }
698 
699       if (low_water_index == -1) {
700          // counter overflow...rather academic
701          initial_counter = 0;
702          RandomBits(seed, 256);
703          continue;
704       }
705 
706       ZZ N;
707       N = *result[low_water_index];
708 
709       Vec<ZZ> W(INIT_SIZE, NumTrials);
710 
711       for (long i = 0; i < NumTrials; i++) {
712          do {
713             RandomBnd(W[i], N);
714          } while (W[i] == 0);
715       }
716 
717       AtomicBool tests_pass(true);
718 
719       NTL_EXEC_RANGE(NumTrials, first, last)
720 
721          for (long i = first; i < last && tests_pass; i++) {
722             if (MillerWitness(N, W[i])) tests_pass = false;
723          }
724 
725       NTL_EXEC_RANGE_END
726 
727       if (tests_pass) {
728          n = N;
729          return;
730       }
731 
732       // very unlikey to get here
733       initial_counter = low_water_mark1 + 1;
734    }
735 }
736 
737 
RandomPrime(ZZ & n,long l,long NumTrials)738 void RandomPrime(ZZ& n, long l, long NumTrials)
739 {
740    if (NumTrials < 0) NumTrials = 0;
741 
742    if (l >= 256) {
743       MultiThreadedRandomPrime(n, l, NumTrials);
744       return;
745    }
746 
747    if (l <= 1)
748       LogicError("RandomPrime: l out of range");
749 
750    if (l == 2) {
751       if (RandomBnd(2))
752          n = 3;
753       else
754          n = 2;
755 
756       return;
757    }
758 
759    do {
760       RandomLen(n, l);
761       if (!IsOdd(n)) add(n, n, 1);
762    } while (!ProbPrime(n, NumTrials));
763 }
764 
OldRandomPrime(ZZ & n,long l,long NumTrials)765 void OldRandomPrime(ZZ& n, long l, long NumTrials)
766 {
767    if (l <= 1)
768       LogicError("RandomPrime: l out of range");
769 
770    if (l == 2) {
771       if (RandomBnd(2))
772          n = 3;
773       else
774          n = 2;
775 
776       return;
777    }
778 
779    do {
780       RandomLen(n, l);
781       if (!IsOdd(n)) add(n, n, 1);
782    } while (!ProbPrime(n, NumTrials));
783 }
784 
NextPrime(ZZ & n,const ZZ & m,long NumTrials)785 void NextPrime(ZZ& n, const ZZ& m, long NumTrials)
786 {
787    ZZ x;
788 
789    if (m <= 2) {
790       n = 2;
791       return;
792    }
793 
794    x = m;
795 
796    while (!ProbPrime(x, NumTrials))
797       add(x, x, 1);
798 
799    n = x;
800 }
801 
NextPrime(long m,long NumTrials)802 long NextPrime(long m, long NumTrials)
803 {
804    long x;
805 
806    if (m <= 2)
807       return 2;
808 
809    x = m;
810 
811    while (x < NTL_SP_BOUND && !ProbPrime(x, NumTrials))
812       x++;
813 
814    if (x >= NTL_SP_BOUND)
815       ResourceError("NextPrime: no more primes");
816 
817    return x;
818 }
819 
820 
821 
NextPowerOfTwo(long m)822 long NextPowerOfTwo(long m)
823 {
824    long k;
825    unsigned long n, um;
826 
827    if (m < 0) return 0;
828 
829    um = m;
830    n = 1;
831    k = 0;
832 
833    while (n < um) {
834       n = n << 1;
835       k++;
836    }
837 
838    if (k >= NTL_BITS_PER_LONG-1)
839       ResourceError("NextPowerOfTwo: overflow");
840 
841    return k;
842 }
843 
844 
845 
846 
bit(long a,long k)847 long bit(long a, long k)
848 {
849    unsigned long aa;
850    if (a < 0)
851       aa = - ((unsigned long) a);
852    else
853       aa = a;
854 
855    if (k < 0 || k >= NTL_BITS_PER_LONG)
856       return 0;
857    else
858       return long((aa >> k) & 1);
859 }
860 
861 
862 
divide(ZZ & q,const ZZ & a,const ZZ & b)863 long divide(ZZ& q, const ZZ& a, const ZZ& b)
864 {
865    NTL_ZZRegister(qq);
866    NTL_ZZRegister(r);
867 
868    if (IsZero(b)) {
869       if (IsZero(a)) {
870          clear(q);
871          return 1;
872       }
873       else
874          return 0;
875    }
876 
877 
878    if (IsOne(b)) {
879       q = a;
880       return 1;
881    }
882 
883    DivRem(qq, r, a, b);
884    if (!IsZero(r)) return 0;
885    q = qq;
886    return 1;
887 }
888 
divide(const ZZ & a,const ZZ & b)889 long divide(const ZZ& a, const ZZ& b)
890 {
891    NTL_ZZRegister(r);
892 
893    if (IsZero(b)) return IsZero(a);
894    if (IsOne(b)) return 1;
895 
896    rem(r, a, b);
897    return IsZero(r);
898 }
899 
divide(ZZ & q,const ZZ & a,long b)900 long divide(ZZ& q, const ZZ& a, long b)
901 {
902    NTL_ZZRegister(qq);
903 
904    if (!b) {
905       if (IsZero(a)) {
906          clear(q);
907          return 1;
908       }
909       else
910          return 0;
911    }
912 
913    if (b == 1) {
914       q = a;
915       return 1;
916    }
917 
918    long r = DivRem(qq, a, b);
919    if (r) return 0;
920    q = qq;
921    return 1;
922 }
923 
divide(const ZZ & a,long b)924 long divide(const ZZ& a, long b)
925 {
926    if (!b) return IsZero(a);
927    if (b == 1) {
928       return 1;
929    }
930 
931    long r = rem(a,  b);
932    return (r == 0);
933 }
934 
935 
InvMod(ZZ & x,const ZZ & a,const ZZ & n)936 void InvMod(ZZ& x, const ZZ& a, const ZZ& n)
937 {
938    // NOTE: the underlying LIP routines write to the first argument,
939    // even if inverse is undefined
940 
941    NTL_ZZRegister(xx);
942    if (InvModStatus(xx, a, n))
943       InvModError("InvMod: inverse undefined", a, n);
944    x = xx;
945 }
946 
PowerMod(ZZ & x,const ZZ & a,const ZZ & e,const ZZ & n)947 void PowerMod(ZZ& x, const ZZ& a, const ZZ& e, const ZZ& n)
948 {
949    // NOTE: this ensures that all modular inverses are computed
950    // in the routine InvMod above, rather than the LIP-internal
951    // modular inverse routine
952    if (e < 0) {
953       ZZ a_inv;
954       ZZ e_neg;
955 
956       InvMod(a_inv, a, n);
957       negate(e_neg, e);
958       LowLevelPowerMod(x, a_inv, e_neg, n);
959    }
960    else
961       LowLevelPowerMod(x, a, e, n);
962 }
963 
964 #ifdef NTL_EXCEPTIONS
965 
InvModError(const char * s,const ZZ & a,const ZZ & n)966 void InvModError(const char *s, const ZZ& a, const ZZ& n)
967 {
968    throw InvModErrorObject(s, a, n);
969 }
970 
971 #else
972 
InvModError(const char * s,const ZZ & a,const ZZ & n)973 void InvModError(const char *s, const ZZ& a, const ZZ& n)
974 {
975    TerminalError(s);
976 }
977 
978 
979 #endif
980 
RandomPrime_long(long l,long NumTrials)981 long RandomPrime_long(long l, long NumTrials)
982 {
983    if (NumTrials < 0) NumTrials = 0;
984 
985    if (l <= 1 || l >= NTL_BITS_PER_LONG)
986       ResourceError("RandomPrime: length out of range");
987 
988    long n;
989    do {
990       n = RandomLen_long(l);
991    } while (!ProbPrime(n, NumTrials));
992 
993    return n;
994 }
995 
996 
997 static Lazy< Vec<char> > lowsieve_storage;
998 // This is a GLOBAL VARIABLE
999 
1000 
PrimeSeq()1001 PrimeSeq::PrimeSeq()
1002 {
1003    movesieve = 0;
1004    pshift = -1;
1005    pindex = -1;
1006    exhausted = 0;
1007 }
1008 
1009 
next()1010 long PrimeSeq::next()
1011 {
1012    if (exhausted) {
1013       return 0;
1014    }
1015 
1016    if (pshift < 0) {
1017       shift(0);
1018       return 2;
1019    }
1020 
1021    for (;;) {
1022       const char *p = movesieve;
1023       long i = pindex;
1024 
1025       while ((++i) < NTL_PRIME_BND) {
1026          if (p[i]) {
1027             pindex = i;
1028             return pshift + 2 * i + 3;
1029          }
1030       }
1031 
1032       long newshift = pshift + 2*NTL_PRIME_BND;
1033 
1034       if (newshift > 2 * NTL_PRIME_BND * (2 * NTL_PRIME_BND + 1)) {
1035          /* end of the road */
1036          exhausted = 1;
1037          return 0;
1038       }
1039 
1040       shift(newshift);
1041    }
1042 }
1043 
shift(long newshift)1044 void PrimeSeq::shift(long newshift)
1045 {
1046    long i;
1047    long j;
1048    long jstep;
1049    long jstart;
1050    long ibound;
1051    char *p;
1052 
1053    if (!lowsieve_storage.built())
1054       start();
1055 
1056    const char *lowsieve = lowsieve_storage->elts();
1057 
1058 
1059    if (newshift < 0) {
1060       pshift = -1;
1061    }
1062    else if (newshift == 0) {
1063       pshift = 0;
1064       movesieve = lowsieve;
1065    }
1066    else if (newshift != pshift) {
1067       if (movesieve_mem.length() == 0) {
1068          movesieve_mem.SetLength(NTL_PRIME_BND);
1069       }
1070 
1071       pshift = newshift;
1072       movesieve = p = movesieve_mem.elts();
1073       for (i = 0; i < NTL_PRIME_BND; i++)
1074          p[i] = 1;
1075 
1076       jstep = 3;
1077       ibound = pshift + 2 * NTL_PRIME_BND + 1;
1078       for (i = 0; jstep * jstep <= ibound; i++) {
1079          if (lowsieve[i]) {
1080             if (!((jstart = (pshift + 2) / jstep + 1) & 1))
1081                jstart++;
1082             if (jstart <= jstep)
1083                jstart = jstep;
1084             jstart = (jstart * jstep - pshift - 3) / 2;
1085             for (j = jstart; j < NTL_PRIME_BND; j += jstep)
1086                p[j] = 0;
1087          }
1088          jstep += 2;
1089       }
1090    }
1091 
1092    pindex = -1;
1093    exhausted = 0;
1094 }
1095 
1096 
start()1097 void PrimeSeq::start()
1098 {
1099    long i;
1100    long j;
1101    long jstep;
1102    long jstart;
1103    long ibnd;
1104    char *p;
1105 
1106    do {
1107       Lazy< Vec<char> >::Builder builder(lowsieve_storage);
1108       if (!builder()) break;
1109 
1110       UniquePtr< Vec<char> > ptr;
1111       ptr.make();
1112       ptr->SetLength(NTL_PRIME_BND);
1113 
1114       p = ptr->elts();
1115 
1116       for (i = 0; i < NTL_PRIME_BND; i++)
1117          p[i] = 1;
1118 
1119       jstep = 1;
1120       jstart = -1;
1121       ibnd = (SqrRoot(2 * NTL_PRIME_BND + 1) - 3) / 2;
1122       for (i = 0; i <= ibnd; i++) {
1123          jstart += 2 * ((jstep += 2) - 1);
1124          if (p[i])
1125             for (j = jstart; j < NTL_PRIME_BND; j += jstep)
1126                p[j] = 0;
1127       }
1128 
1129       builder.move(ptr);
1130    } while (0);
1131 
1132 }
1133 
reset(long b)1134 void PrimeSeq::reset(long b)
1135 {
1136    if (b > (2*NTL_PRIME_BND+1)*(2*NTL_PRIME_BND+1)) {
1137       exhausted = 1;
1138       return;
1139    }
1140 
1141    if (b <= 2) {
1142       shift(-1);
1143       return;
1144    }
1145 
1146    if ((b & 1) == 0) b++;
1147 
1148    shift(((b-3) / (2*NTL_PRIME_BND))* (2*NTL_PRIME_BND));
1149    pindex = (b - pshift - 3)/2 - 1;
1150 }
1151 
Jacobi(const ZZ & aa,const ZZ & nn)1152 long Jacobi(const ZZ& aa, const ZZ& nn)
1153 {
1154    ZZ a, n;
1155    long t, k;
1156    long d;
1157 
1158    a = aa;
1159    n = nn;
1160    t = 1;
1161 
1162    while (a != 0) {
1163       k = MakeOdd(a);
1164       d = trunc_long(n, 3);
1165       if ((k & 1) && (d == 3 || d == 5)) t = -t;
1166 
1167       if (trunc_long(a, 2) == 3 && (d & 3) == 3) t = -t;
1168       swap(a, n);
1169       rem(a, a, n);
1170    }
1171 
1172    if (n == 1)
1173       return t;
1174    else
1175       return 0;
1176 }
1177 
1178 
SqrRootMod(ZZ & x,const ZZ & aa,const ZZ & nn)1179 void SqrRootMod(ZZ& x, const ZZ& aa, const ZZ& nn)
1180 {
1181    if (aa == 0 || aa == 1) {
1182       x = aa;
1183       return;
1184    }
1185 
1186    // at this point, we must have nn >= 5
1187 
1188    if (trunc_long(nn, 2) == 3) {  // special case, n = 3 (mod 4)
1189       ZZ n, a, e, z;
1190 
1191       n = nn;
1192       a  = aa;
1193 
1194       add(e, n, 1);
1195       RightShift(e, e, 2);
1196 
1197       PowerMod(z, a, e, n);
1198       x = z;
1199 
1200       return;
1201    }
1202 
1203    ZZ n, m;
1204    int h, nlen;
1205 
1206    n = nn;
1207    nlen = NumBits(n);
1208 
1209    sub(m, n, 1);
1210    h = MakeOdd(m);  // h >= 2
1211 
1212 
1213    if (nlen > 50 && h < SqrRoot(nlen)) {
1214       long i, j;
1215       ZZ a, b, a_inv, c, r, m1, d;
1216 
1217       a = aa;
1218       InvMod(a_inv, a, n);
1219 
1220       if (h == 2)
1221          b = 2;
1222       else {
1223          do {
1224             RandomBnd(b, n);
1225          } while (Jacobi(b, n) != -1);
1226       }
1227 
1228 
1229       PowerMod(c, b, m, n);
1230 
1231       add(m1, m, 1);
1232       RightShift(m1, m1, 1);
1233       PowerMod(r, a, m1, n);
1234 
1235       for (i = h-2; i >= 0; i--) {
1236          SqrMod(d, r, n);
1237          MulMod(d, d, a_inv, n);
1238          for (j = 0; j < i; j++)
1239             SqrMod(d, d, n);
1240          if (!IsOne(d))
1241             MulMod(r, r, c, n);
1242          SqrMod(c, c, n);
1243       }
1244 
1245       x = r;
1246       return;
1247    }
1248 
1249 
1250 
1251 
1252 
1253    long i, k;
1254    ZZ ma, t, u, v, e;
1255    ZZ t1, t2, t3, t4;
1256 
1257    n = nn;
1258    NegateMod(ma, aa, n);
1259 
1260    // find t such that t^2 - 4*a is not a square
1261 
1262    MulMod(t1, ma, 4, n);
1263    do {
1264       RandomBnd(t, n);
1265       SqrMod(t2, t, n);
1266       AddMod(t2, t2, t1, n);
1267    } while (Jacobi(t2, n) != -1);
1268 
1269    // compute u*X + v = X^{(n+1)/2} mod f, where f = X^2 - t*X + a
1270 
1271    add(e, n, 1);
1272    RightShift(e, e, 1);
1273 
1274    u = 0;
1275    v = 1;
1276 
1277    k = NumBits(e);
1278 
1279    for (i = k - 1; i >= 0; i--) {
1280       add(t2, u, v);
1281       sqr(t3, t2);  // t3 = (u+v)^2
1282       sqr(t1, u);
1283       sqr(t2, v);
1284       sub(t3, t3, t1);
1285       sub(t3, t3, t2); // t1 = u^2, t2 = v^2, t3 = 2*u*v
1286       rem(t1, t1, n);
1287       mul(t4, t1, t);
1288       add(t4, t4, t3);
1289       rem(u, t4, n);
1290 
1291       mul(t4, t1, ma);
1292       add(t4, t4, t2);
1293       rem(v, t4, n);
1294 
1295       if (bit(e, i)) {
1296          MulMod(t1, u, t, n);
1297          AddMod(t1, t1, v, n);
1298          MulMod(v, u, ma, n);
1299          u = t1;
1300       }
1301 
1302    }
1303 
1304    x = v;
1305 }
1306 
1307 
1308 
1309 // Chinese Remaindering.
1310 //
1311 // This version in new to v3.7, and is significantly
1312 // simpler and faster than the previous version.
1313 //
1314 // This function takes as input g, a, G, p,
1315 // such that a > 0, 0 <= G < p, and gcd(a, p) = 1.
1316 // It computes a' = a*p and g' such that
1317 //   * g' = g (mod a);
1318 //   * g' = G (mod p);
1319 //   * -a'/2 < g' <= a'/2.
1320 // It then sets g := g' and a := a', and returns 1 iff g has changed.
1321 //
1322 // Under normal use, the input value g satisfies -a/2 < g <= a/2;
1323 // however, this was not documented or enforced in earlier versions,
1324 // so to maintain backward compatability, no restrictions are placed
1325 // on g.  This routine runs faster, though, if -a/2 < g <= a/2,
1326 // and the first thing the routine does is to make this condition
1327 // hold.
1328 //
1329 // Also, under normal use, both a and p are odd;  however, the routine
1330 // will still work even if this is not so.
1331 //
1332 // The routine is based on the following simple fact.
1333 //
1334 // Let -a/2 < g <= a/2, and let h satisfy
1335 //   * g + a h = G (mod p);
1336 //   * -p/2 < h <= p/2.
1337 // Further, if p = 2*h and g > 0, set
1338 //   g' := g - a h;
1339 // otherwise, set
1340 //   g' := g + a h.
1341 // Then g' so defined satisfies the above requirements.
1342 //
1343 // It is trivial to see that g's satisfies the congruence conditions.
1344 // The only thing is to check that the "balancing" condition
1345 // -a'/2 < g' <= a'/2 also holds.
1346 
1347 
CRT(ZZ & gg,ZZ & a,long G,long p)1348 long CRT(ZZ& gg, ZZ& a, long G, long p)
1349 {
1350    if (p >= NTL_SP_BOUND) {
1351       ZZ GG, pp;
1352       conv(GG, G);
1353       conv(pp, p);
1354       return CRT(gg, a, GG, pp);
1355    }
1356 
1357    long modified = 0;
1358 
1359    NTL_ZZRegister(g);
1360 
1361    if (!CRTInRange(gg, a)) {
1362       modified = 1;
1363       ZZ a1;
1364       rem(g, gg, a);
1365       RightShift(a1, a, 1);
1366       if (g > a1) sub(g, g, a);
1367    }
1368    else
1369       g = gg;
1370 
1371 
1372    long p1;
1373    p1 = p >> 1;
1374 
1375    long a_inv;
1376    a_inv = rem(a, p);
1377    a_inv = InvMod(a_inv, p);
1378 
1379    long h;
1380    h = rem(g, p);
1381    h = SubMod(G, h, p);
1382    h = MulMod(h, a_inv, p);
1383    if (h > p1)
1384       h = h - p;
1385 
1386    if (h != 0) {
1387       modified = 1;
1388 
1389       if (!(p & 1) && g > 0 && (h == p1))
1390          MulSubFrom(g, a, h);
1391       else
1392          MulAddTo(g, a, h);
1393    }
1394 
1395    mul(a, a, p);
1396    gg = g;
1397 
1398    return modified;
1399 }
1400 
CRT(ZZ & gg,ZZ & a,const ZZ & G,const ZZ & p)1401 long CRT(ZZ& gg, ZZ& a, const ZZ& G, const ZZ& p)
1402 {
1403    long modified = 0;
1404 
1405    ZZ g;
1406 
1407    if (!CRTInRange(gg, a)) {
1408       modified = 1;
1409       ZZ a1;
1410       rem(g, gg, a);
1411       RightShift(a1, a, 1);
1412       if (g > a1) sub(g, g, a);
1413    }
1414    else
1415       g = gg;
1416 
1417 
1418    ZZ p1;
1419    RightShift(p1, p, 1);
1420 
1421    ZZ a_inv;
1422    rem(a_inv, a, p);
1423    InvMod(a_inv, a_inv, p);
1424 
1425    ZZ h;
1426    rem(h, g, p);
1427    SubMod(h, G, h, p);
1428    MulMod(h, h, a_inv, p);
1429    if (h > p1)
1430       sub(h, h, p);
1431 
1432    if (h != 0) {
1433       modified = 1;
1434       ZZ ah;
1435       mul(ah, a, h);
1436 
1437       if (!IsOdd(p) && g > 0 &&  (h == p1))
1438          sub(g, g, ah);
1439       else
1440          add(g, g, ah);
1441    }
1442 
1443    mul(a, a, p);
1444    gg = g;
1445 
1446    return modified;
1447 }
1448 
1449 
1450 
sub(ZZ & x,long a,const ZZ & b)1451 void sub(ZZ& x, long a, const ZZ& b)
1452 {
1453    NTL_ZZRegister(A);
1454    conv(A, a);
1455    sub(x, A, b);
1456 }
1457 
1458 
power2(ZZ & x,long e)1459 void power2(ZZ& x, long e)
1460 {
1461    if (e < 0) ArithmeticError("power2: negative exponent");
1462    set(x);
1463    LeftShift(x, x, e);
1464 }
1465 
1466 
1467 
bit_and(ZZ & x,const ZZ & a,long b)1468 void bit_and(ZZ& x, const ZZ& a, long b)
1469 {
1470    NTL_ZZRegister(B);
1471    conv(B, b);
1472    bit_and(x, a, B);
1473 }
1474 
bit_or(ZZ & x,const ZZ & a,long b)1475 void bit_or(ZZ& x, const ZZ& a, long b)
1476 {
1477    NTL_ZZRegister(B);
1478    conv(B, b);
1479    bit_or(x, a, B);
1480 }
1481 
bit_xor(ZZ & x,const ZZ & a,long b)1482 void bit_xor(ZZ& x, const ZZ& a, long b)
1483 {
1484    NTL_ZZRegister(B);
1485    conv(B, b);
1486    bit_xor(x, a, B);
1487 }
1488 
1489 
power_long(long a,long e)1490 long power_long(long a, long e)
1491 {
1492    if (e < 0) ArithmeticError("power_long: negative exponent");
1493 
1494    if (e == 0) return 1;
1495 
1496    if (a == 1) return 1;
1497    if (a == -1) {
1498       if (e & 1)
1499          return -1;
1500       else
1501          return 1;
1502    }
1503 
1504    // no overflow check --- result is computed correctly
1505    // modulo word size
1506 
1507    unsigned long res = 1;
1508    unsigned long aa = a;
1509    long i;
1510 
1511    for (i = 0; i < e; i++)
1512       res *= aa;
1513 
1514    return to_long(res);
1515 }
1516 
1517 
1518 
1519 // ======================= new PRG stuff ======================
1520 
1521 
1522 
1523 
1524 #if (NTL_BITS_PER_INT32 == 32)
1525 #define INT32MASK(x) (x)
1526 #else
1527 #define INT32MASK(x) ((x) & _ntl_uint32(0xffffffff))
1528 #endif
1529 
1530 
1531 
1532 // SHA256 code adapted from an implementauin by Brad Conte.
1533 // The following is from his original source files.
1534 /*********************************************************************
1535 * Filename:   sha256.c
1536 * Author:     Brad Conte (brad AT bradconte.com)
1537 * Copyright:
1538 * Disclaimer: This code is presented "as is" without any guarantees.
1539 * Details:    Implementation of the SHA-256 hashing algorithm.
1540               SHA-256 is one of the three algorithms in the SHA2
1541               specification. The others, SHA-384 and SHA-512, are not
1542               offered in this implementation.
1543               Algorithm specification can be found here:
1544                * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf
1545               This implementation uses little endian byte order.
1546 *********************************************************************/
1547 
1548 // And the following is from the description at
1549 // https://github.com/B-Con/crypto-algorithms
1550 
1551 /*********************************************************************
1552 
1553 These are basic implementations of standard cryptography algorithms, written by
1554 Brad Conte (brad@bradconte.com) from scratch and without any cross-licensing.
1555 They exist to provide publically accessible, restriction-free implementations
1556 of popular cryptographic algorithms, like AES and SHA-1. These are primarily
1557 intended for educational and pragmatic purposes (such as comparing a
1558 specification to actual implementation code, or for building an internal
1559 application that computes test vectors for a product). The algorithms have been
1560 tested against standard test vectors.
1561 
1562 This code is released into the public domain free of any restrictions. The
1563 author requests acknowledgement if the code is used, but does not require it.
1564 This code is provided free of any liability and without any quality claims by
1565 the author.
1566 
1567 Note that these are not cryptographically secure implementations. They have no
1568 resistence to side-channel attacks and should not be used in contexts that need
1569 cryptographically secure implementations.
1570 
1571 These algorithms are not optimized for speed or space. They are primarily
1572 designed to be easy to read, although some basic optimization techniques have
1573 been employed.
1574 
1575 *********************************************************************/
1576 
1577 
1578 
1579 
1580 
1581 
1582 #define SHA256_BLOCKSIZE (64)
1583 #define SHA256_HASHSIZE  (32)
1584 
1585 // DBL_INT_ADD treats two unsigned ints a and b as one 64-bit integer and adds c to it
1586 static inline
DBL_INT_ADD(_ntl_uint32 & a,_ntl_uint32 & b,_ntl_uint32 c)1587 void DBL_INT_ADD(_ntl_uint32& a, _ntl_uint32& b, _ntl_uint32 c)
1588 {
1589    _ntl_uint32 aa = INT32MASK(a);
1590    if (aa > INT32MASK(_ntl_uint32(0xffffffff) - c)) b++;
1591    a = aa + c;
1592 }
1593 
1594 #define ROTLEFT(a,b) (((a) << (b)) | (INT32MASK(a) >> (32-(b))))
1595 #define ROTRIGHT(a,b) ((INT32MASK(a) >> (b)) | ((a) << (32-(b))))
1596 
1597 #define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
1598 #define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
1599 #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
1600 #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
1601 #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ (INT32MASK(x) >> 3))
1602 #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ (INT32MASK(x) >> 10))
1603 
1604 struct SHA256_CTX {
1605    unsigned char data[64];
1606    _ntl_uint32 datalen;
1607    _ntl_uint32 bitlen[2];
1608    _ntl_uint32 state[8];
1609 };
1610 
1611 static const _ntl_uint32 sha256_const[64] = {
1612    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1613    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1614    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1615    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1616    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1617    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1618    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1619    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1620 };
1621 
1622 
1623 static
sha256_transform(SHA256_CTX & ctx,unsigned char * data)1624 void sha256_transform(SHA256_CTX& ctx, unsigned char *data)
1625 {
1626    _ntl_uint32 a,b,c,d,e,f,g,h,i,j,t1,t2,m[64];
1627 
1628    for (i=0,j=0; i < 16; ++i, j += 4)
1629       m[i] = (data[j] << 24) | (data[j+1] << 16) | (data[j+2] << 8) | (data[j+3]);
1630    for ( ; i < 64; ++i)
1631       m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
1632 
1633    a = ctx.state[0];
1634    b = ctx.state[1];
1635    c = ctx.state[2];
1636    d = ctx.state[3];
1637    e = ctx.state[4];
1638    f = ctx.state[5];
1639    g = ctx.state[6];
1640    h = ctx.state[7];
1641 
1642    for (i = 0; i < 64; ++i) {
1643       t1 = h + EP1(e) + CH(e,f,g) + sha256_const[i] + m[i];
1644       t2 = EP0(a) + MAJ(a,b,c);
1645       h = g;
1646       g = f;
1647       f = e;
1648       e = d + t1;
1649       d = c;
1650       c = b;
1651       b = a;
1652       a = t1 + t2;
1653    }
1654 
1655    ctx.state[0] += a;
1656    ctx.state[1] += b;
1657    ctx.state[2] += c;
1658    ctx.state[3] += d;
1659    ctx.state[4] += e;
1660    ctx.state[5] += f;
1661    ctx.state[6] += g;
1662    ctx.state[7] += h;
1663 }
1664 
1665 static
sha256_init(SHA256_CTX & ctx)1666 void sha256_init(SHA256_CTX& ctx)
1667 {
1668    ctx.datalen = 0;
1669    ctx.bitlen[0] = 0;
1670    ctx.bitlen[1] = 0;
1671    ctx.state[0] = 0x6a09e667;
1672    ctx.state[1] = 0xbb67ae85;
1673    ctx.state[2] = 0x3c6ef372;
1674    ctx.state[3] = 0xa54ff53a;
1675    ctx.state[4] = 0x510e527f;
1676    ctx.state[5] = 0x9b05688c;
1677    ctx.state[6] = 0x1f83d9ab;
1678    ctx.state[7] = 0x5be0cd19;
1679 }
1680 
1681 static
sha256_update(SHA256_CTX & ctx,const unsigned char * data,_ntl_uint32 len)1682 void sha256_update(SHA256_CTX& ctx, const unsigned char *data, _ntl_uint32 len)
1683 {
1684    _ntl_uint32 i;
1685 
1686    for (i=0; i < len; ++i) {
1687       ctx.data[ctx.datalen] = data[i];
1688       ctx.datalen++;
1689       if (ctx.datalen == 64) {
1690          sha256_transform(ctx,ctx.data);
1691          DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],512);
1692          ctx.datalen = 0;
1693       }
1694    }
1695 }
1696 
1697 static
sha256_final(SHA256_CTX & ctx,unsigned char * hash,long hlen=SHA256_HASHSIZE)1698 void sha256_final(SHA256_CTX& ctx, unsigned char *hash,
1699                   long hlen=SHA256_HASHSIZE)
1700 {
1701    _ntl_uint32 i, j;
1702 
1703    i = ctx.datalen;
1704 
1705    // Pad whatever data is left in the buffer.
1706    if (ctx.datalen < 56) {
1707       ctx.data[i++] = 0x80;
1708       while (i < 56)
1709          ctx.data[i++] = 0x00;
1710    }
1711    else {
1712       ctx.data[i++] = 0x80;
1713       while (i < 64)
1714          ctx.data[i++] = 0x00;
1715       sha256_transform(ctx,ctx.data);
1716       memset(ctx.data,0,56);
1717    }
1718 
1719    // Append to the padding the total message's length in bits and transform.
1720    DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],ctx.datalen * 8);
1721 
1722    ctx.data[63] = ctx.bitlen[0];
1723    ctx.data[62] = ctx.bitlen[0] >> 8;
1724    ctx.data[61] = ctx.bitlen[0] >> 16;
1725    ctx.data[60] = ctx.bitlen[0] >> 24;
1726    ctx.data[59] = ctx.bitlen[1];
1727    ctx.data[58] = ctx.bitlen[1] >> 8;
1728    ctx.data[57] = ctx.bitlen[1] >> 16;
1729    ctx.data[56] = ctx.bitlen[1] >> 24;
1730    sha256_transform(ctx,ctx.data);
1731 
1732    for (i = 0; i < 8; i++) {
1733       _ntl_uint32 w = ctx.state[i];
1734       for (j = 0; j < 4; j++) {
1735          if (hlen <= 0) break;
1736          hash[4*i + j] = w >> (24-j*8);
1737          hlen--;
1738       }
1739    }
1740 
1741 }
1742 
1743 
1744 
1745 static
sha256(const unsigned char * data,long dlen,unsigned char * hash,long hlen=SHA256_HASHSIZE)1746 void sha256(const unsigned char *data, long dlen, unsigned char *hash,
1747             long hlen=SHA256_HASHSIZE)
1748 {
1749    if (dlen < 0) dlen = 0;
1750    if (hlen < 0) hlen = 0;
1751 
1752    SHA256_CTX ctx;
1753    sha256_init(ctx);
1754 
1755    const long BLKSIZE = 4096;
1756 
1757    long i;
1758    for (i = 0; i <= dlen-BLKSIZE; i += BLKSIZE)
1759       sha256_update(ctx, data + i, BLKSIZE);
1760 
1761    if (i < dlen)
1762       sha256_update(ctx, data + i, dlen - i);
1763 
1764    sha256_final(ctx, hash, hlen);
1765 }
1766 
1767 
1768 static
hmac_sha256(const unsigned char * key,long klen,const unsigned char * data,long dlen,unsigned char * hash,long hlen=SHA256_HASHSIZE)1769 void hmac_sha256(const unsigned char *key, long klen,
1770                  const unsigned char *data, long dlen,
1771                  unsigned char *hash, long hlen=SHA256_HASHSIZE)
1772 {
1773    if (klen < 0) klen = 0;
1774    if (dlen < 0) dlen = 0;
1775    if (hlen < 0) hlen = 0;
1776 
1777    unsigned char K[SHA256_BLOCKSIZE];
1778    unsigned char tmp[SHA256_HASHSIZE];
1779 
1780    long i;
1781 
1782    if (klen <= SHA256_BLOCKSIZE) {
1783       for (i = 0; i < klen; i++)
1784          K[i] = key[i];
1785       for (i = klen; i < SHA256_BLOCKSIZE; i++)
1786          K[i] = 0;
1787    }
1788    else {
1789       sha256(key, klen, K, SHA256_BLOCKSIZE);
1790       for (i = SHA256_HASHSIZE; i < SHA256_BLOCKSIZE; i++)
1791          K[i] = 0;
1792    }
1793 
1794    for (i = 0; i < SHA256_BLOCKSIZE; i++)
1795       K[i] ^= 0x36;
1796 
1797    SHA256_CTX ctx;
1798    sha256_init(ctx);
1799    sha256_update(ctx, K, SHA256_BLOCKSIZE);
1800    sha256_update(ctx, data, dlen);
1801    sha256_final(ctx, tmp);
1802 
1803    for (i = 0; i < SHA256_BLOCKSIZE; i++)
1804       K[i] ^= (0x36 ^ 0x5C);
1805 
1806    sha256_init(ctx);
1807    sha256_update(ctx, K, SHA256_BLOCKSIZE);
1808    sha256_update(ctx, tmp, SHA256_HASHSIZE);
1809    sha256_final(ctx, hash, hlen);
1810 }
1811 
1812 
1813 // This key derivation uses HMAC with a zero key to derive
1814 // an intermediate key K from the data, and then uses HMAC
1815 // as a PRF in counter mode with key K to derive the final key
1816 
DeriveKey(unsigned char * key,long klen,const unsigned char * data,long dlen)1817 void DeriveKey(unsigned char *key, long klen,
1818                const unsigned char *data, long dlen)
1819 {
1820    if (dlen < 0) LogicError("DeriveKey: bad args");
1821    if (klen < 0) LogicError("DeriveKey: bad args");
1822 
1823    long i, j;
1824 
1825 
1826    unsigned char K[SHA256_HASHSIZE];
1827    hmac_sha256(0, 0, data, dlen, K);
1828 
1829    // initialize 64-bit counter to zero
1830    unsigned char counter[8];
1831    for (j = 0; j < 8; j++) counter[j] = 0;
1832 
1833    for (i = 0; i <= klen-SHA256_HASHSIZE; i += SHA256_HASHSIZE) {
1834       hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i);
1835 
1836       // increment counter
1837       for (j = 0; j < 8; j++) {
1838          counter[j]++;
1839          if (counter[j] != 0) break;
1840       }
1841    }
1842 
1843    if (i < klen)
1844       hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i, klen-i);
1845 }
1846 
1847 
1848 
1849 
1850 // ******************** ChaCha20 stuff ***********************
1851 
1852 // ============= old stuff
1853 
1854 #define LE(p) (((_ntl_uint32)((p)[0])) + ((_ntl_uint32)((p)[1]) << 8) + \
1855     ((_ntl_uint32)((p)[2]) << 16) + ((_ntl_uint32)((p)[3]) << 24))
1856 
1857 #define FROMLE(p, x) (p)[0] = (x), (p)[1] = ((x) >> 8), \
1858    (p)[2] = ((x) >> 16), (p)[3] = ((x) >> 24)
1859 
1860 
1861 #define QUARTERROUND(x, a, b, c, d) \
1862     x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 16), \
1863     x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 12), \
1864     x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 8), \
1865     x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 7)
1866 
1867 
1868 static
salsa20_core(_ntl_uint32 * data)1869 void salsa20_core(_ntl_uint32* data)
1870 {
1871    long i;
1872 
1873    for (i = 0; i < 10; i++) {
1874       QUARTERROUND(data, 0, 4, 8, 12);
1875       QUARTERROUND(data, 1, 5, 9, 13);
1876       QUARTERROUND(data, 2, 6, 10, 14);
1877       QUARTERROUND(data, 3, 7, 11, 15);
1878       QUARTERROUND(data, 0, 5, 10, 15);
1879       QUARTERROUND(data, 1, 6, 11, 12);
1880       QUARTERROUND(data, 2, 7, 8, 13);
1881       QUARTERROUND(data, 3, 4, 9, 14);
1882    }
1883 }
1884 
1885 
1886 // key K must be exactly 32 bytes
1887 static
salsa20_init(_ntl_uint32 * state,const unsigned char * K)1888 void salsa20_init(_ntl_uint32 *state, const unsigned char *K)
1889 {
1890    static const _ntl_uint32 chacha_const[4] =
1891       { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
1892 
1893    long i;
1894 
1895    for (i = 0; i < 4; i++)
1896       state[i] = chacha_const[i];
1897 
1898    for (i = 4; i < 12; i++)
1899       state[i] = LE(K + 4*(i-4));
1900 
1901    for (i = 12; i < 16; i++)
1902       state[i] = 0;
1903 }
1904 
1905 
1906 
1907 // state and data are of length 16
1908 static
salsa20_apply(_ntl_uint32 * state,_ntl_uint32 * data)1909 void salsa20_apply(_ntl_uint32 *state, _ntl_uint32 *data)
1910 {
1911    long i;
1912 
1913    for (i = 0; i < 16; i++) data[i] = state[i];
1914 
1915    salsa20_core(data);
1916 
1917    for (i = 0; i < 16; i++) data[i] += state[i];
1918 
1919    for (i = 12; i < 14; i++) {
1920       state[i]++;
1921       state[i] = INT32MASK(state[i]);
1922       if (state[i] != 0) break;
1923    }
1924 }
1925 
1926 
1927 
old_RandomStream(const unsigned char * key)1928 old_RandomStream::old_RandomStream(const unsigned char *key)
1929 {
1930    salsa20_init(state, key);
1931    pos = 64;
1932 }
1933 
1934 
do_get(unsigned char * res,long n)1935 void old_RandomStream::do_get(unsigned char *res, long n)
1936 {
1937    if (n < 0) LogicError("RandomStream::get: bad args");
1938 
1939    long i, j;
1940 
1941    if (n <= 64-pos) {
1942       for (i = 0; i < n; i++) res[i] = buf[pos+i];
1943       pos += n;
1944       return;
1945    }
1946 
1947    // read remainder of buffer
1948    for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
1949    n -= 64-pos;
1950    res += 64-pos;
1951    pos = 64;
1952 
1953    _ntl_uint32 wdata[16];
1954 
1955    // read 64-byte chunks
1956    for (i = 0; i <= n-64; i += 64) {
1957       salsa20_apply(state, wdata);
1958       for (j = 0; j < 16; j++)
1959          FROMLE(res + i + 4*j, wdata[j]);
1960    }
1961 
1962    if (i < n) {
1963       salsa20_apply(state, wdata);
1964 
1965       for (j = 0; j < 16; j++)
1966          FROMLE(buf + 4*j, wdata[j]);
1967 
1968       pos = n-i;
1969       for (j = 0; j < pos; j++)
1970          res[i+j] = buf[j];
1971    }
1972 }
1973 
1974 #if defined(NTL_RANDOM_AES256CTR)
1975 
1976 /* Size must be a multiple of AES block-size (16 bytes). */
1977 #define BUFSIZE                 4096
1978 //#define BUFSIZE                 8192
1979 
1980 static void
inc32(unsigned char ctr[16])1981 inc32(unsigned char ctr[16])
1982 {
1983    int i, c = 1;
1984 
1985    for (i = 0; i < 4; i++) {
1986       c += ctr[15 - i];
1987       ctr[15 - i] = (unsigned char)c;
1988       c >>= 8;
1989    }
1990 }
1991 
1992 #if defined(NTL_HAVE_AES_NI) && defined(NTL_HAVE_AVX2)
1993 
1994 /*****************************************************************
1995 This optimized AES-256 implementation is derived from public
1996 domain code.
1997 
1998 Authors:
1999 Romain Dolbeau
2000 
2001 Obtained from:
2002 https://github.com/floodyberry/supercop/blob/master/crypto_stream/aes256ctr/dolbeau/aesenc-int/aesenc-int.c
2003 */
2004 
2005 #ifdef __INTEL_COMPILER
2006 #define ALIGN16 __declspec(align(16))
2007 #define ALIGN32 __declspec(align(32))
2008 #define ALIGN64 __declspec(align(64))
2009 #else // assume GCC
2010 #define ALIGN16  __attribute__((aligned(16)))
2011 #define ALIGN32  __attribute__((aligned(32)))
2012 #define ALIGN64  __attribute__((aligned(64)))
2013 #ifndef _bswap64
2014 #define _bswap64(a) __builtin_bswap64(a)
2015 #endif
2016 #ifndef  _bswap
2017 #define _bswap(a) __builtin_bswap(a)
2018 #endif
2019 #endif
2020 
aesni_key256_expand(const unsigned char * key,__m128i rkeys[16])2021 static inline void aesni_key256_expand(const unsigned char* key, __m128i rkeys[16]) {
2022    __m128i key0 = _mm_loadu_si128((const __m128i *)(key+0));
2023    __m128i key1 = _mm_loadu_si128((const __m128i *)(key+16));
2024    __m128i temp0, temp1, temp2, temp4;
2025    int idx = 0;
2026 
2027    rkeys[idx++] = key0;
2028    temp0 = key0;
2029    temp2 = key1;
2030 
2031    /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
2032 #define BLOCK1(IMM)                              \
2033    temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
2034    rkeys[idx++] = temp2;                          \
2035    temp4 = _mm_slli_si128(temp0,4);               \
2036    temp0 = _mm_xor_si128(temp0,temp4);            \
2037    temp4 = _mm_slli_si128(temp0,8);               \
2038    temp0 = _mm_xor_si128(temp0,temp4);            \
2039    temp1 = _mm_shuffle_epi32(temp1,0xff);         \
2040    temp0 = _mm_xor_si128(temp0,temp1)
2041 
2042 #define BLOCK2(IMM)                              \
2043    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
2044    rkeys[idx++] = temp0;                          \
2045    temp4 = _mm_slli_si128(temp2,4);               \
2046    temp2 = _mm_xor_si128(temp2,temp4);            \
2047    temp4 = _mm_slli_si128(temp2,8);               \
2048    temp2 = _mm_xor_si128(temp2,temp4);            \
2049    temp1 = _mm_shuffle_epi32(temp1,0xaa);         \
2050    temp2 = _mm_xor_si128(temp2,temp1)
2051 
2052    BLOCK1(0x01);
2053    BLOCK2(0x01);
2054 
2055    BLOCK1(0x02);
2056    BLOCK2(0x02);
2057 
2058    BLOCK1(0x04);
2059    BLOCK2(0x04);
2060 
2061    BLOCK1(0x08);
2062    BLOCK2(0x08);
2063 
2064    BLOCK1(0x10);
2065    BLOCK2(0x10);
2066 
2067    BLOCK1(0x20);
2068    BLOCK2(0x20);
2069 
2070    BLOCK1(0x40);
2071    rkeys[idx++] = temp0;
2072 }
2073 
2074 /** single, by-the-book AES encryption with AES-NI */
aesni_encrypt1(unsigned char * out,unsigned char * n,__m128i rkeys[16])2075 static inline void aesni_encrypt1(unsigned char *out, unsigned char *n, __m128i rkeys[16]) {
2076    __m128i nv = _mm_load_si128((const __m128i *)n);
2077    int i;
2078    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
2079 #if 0
2080 // This pragma is not recognized by GCC < 8
2081 #pragma unroll(13)
2082    for (i = 1 ; i < 14 ; i++) {
2083       temp = _mm_aesenc_si128(temp, rkeys[i]);
2084    }
2085 #else
2086    temp = _mm_aesenc_si128(temp, rkeys[ 1]);
2087    temp = _mm_aesenc_si128(temp, rkeys[ 2]);
2088    temp = _mm_aesenc_si128(temp, rkeys[ 3]);
2089    temp = _mm_aesenc_si128(temp, rkeys[ 4]);
2090    temp = _mm_aesenc_si128(temp, rkeys[ 5]);
2091    temp = _mm_aesenc_si128(temp, rkeys[ 6]);
2092    temp = _mm_aesenc_si128(temp, rkeys[ 7]);
2093    temp = _mm_aesenc_si128(temp, rkeys[ 8]);
2094    temp = _mm_aesenc_si128(temp, rkeys[ 9]);
2095    temp = _mm_aesenc_si128(temp, rkeys[10]);
2096    temp = _mm_aesenc_si128(temp, rkeys[11]);
2097    temp = _mm_aesenc_si128(temp, rkeys[12]);
2098    temp = _mm_aesenc_si128(temp, rkeys[13]);
2099 #endif
2100    temp = _mm_aesenclast_si128(temp, rkeys[14]);
2101    _mm_store_si128((__m128i*)(out), temp);
2102 }
2103 
2104 /** increment the 16-bytes nonce ;
2105     this really should be improved somehow...
2106     but it's not yet time-critical, because we
2107     use the vector variant anyway  */
incle(unsigned char n[16])2108 static inline void incle(unsigned char n[16]) {
2109 /*   unsigned long long out; */
2110 /*   unsigned char carry; */
2111    unsigned long long *n_ = (unsigned long long*)n;
2112    n_[1]++;
2113    if (n_[1] == 0)
2114       n_[0] ++;
2115   /* perhaps this will be efficient on broadwell ? */
2116   /*   carry = _addcarry_u64(0, n_[1], 1ULL, &out); */
2117   /*   carry = _addcarry_u64(carry, n_[0], 0ULL, &out); */
2118 }
2119 
2120 /** multiple-blocks-at-once AES encryption with AES-NI ;
2121     on Haswell, aesenc as a latency of 7 and a througput of 1
2122     so the sequence of aesenc should be bubble-free, if you
2123     have at least 8 blocks. Let's build an arbitratry-sized
2124     function */
2125 /* Step 1 : loading the nonce */
2126 /* load & increment the n vector (non-vectorized, unused for now) */
2127 #define NVx(a)                                                  \
2128   __m128i nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)n), _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); incle(n)
2129 /* load the incremented n vector (vectorized, probably buggy) */
2130 #define NVxV_DEC(a)                                                     \
2131   __m128i nv##a;
2132 #define NVxV_NOWRAP(a)                                                  \
2133   nv##a = _mm_shuffle_epi8(_mm_add_epi64(nv0i, _mm_set_epi64x(a,0)), _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7))
2134 #define NVxV_WRAP(a)                                                    \
2135   __m128i ad##a = _mm_add_epi64(nv0i, _mm_set_epi64x(a,a>=wrapnumber?1:0)); \
2136   nv##a = _mm_shuffle_epi8(ad##a, _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7))
2137 
2138 /* Step 2 : define value in round one (xor with subkey #0, aka key) */
2139 #define TEMPx(a)                                        \
2140   __m128i temp##a = _mm_xor_si128(nv##a, rkeys[0])
2141 
2142 /* Step 3: one round of AES */
2143 #define AESENCx(a)                                      \
2144   temp##a =  _mm_aesenc_si128(temp##a, rkeys[i]);
2145 
2146 /* Step 4: last round of AES */
2147 #define AESENCLASTx(a)                                  \
2148   temp##a = _mm_aesenclast_si128(temp##a, rkeys[14]);
2149 
2150 /* Step 5: store result */
2151 #define STOREx(a)                                       \
2152   _mm_store_si128((__m128i*)(out+(a*16)), temp##a);
2153 
2154 /* all the MAKE* macros are for automatic explicit unrolling */
2155 #define MAKE4(X)                                \
2156   X(0);X(1);X(2);X(3)
2157 
2158 #define MAKE6(X)                                \
2159   X(0);X(1);X(2);X(3);                          \
2160   X(4);X(5)
2161 
2162 #define MAKE7(X)                                \
2163   X(0);X(1);X(2);X(3);                          \
2164   X(4);X(5);X(6)
2165 
2166 #define MAKE8(X)                                \
2167   X(0);X(1);X(2);X(3);                          \
2168   X(4);X(5);X(6);X(7)
2169 
2170 #define MAKE10(X)                               \
2171   X(0);X(1);X(2);X(3);                          \
2172   X(4);X(5);X(6);X(7);                          \
2173   X(8);X(9)
2174 
2175 #define MAKE12(X)                               \
2176   X(0);X(1);X(2);X(3);                          \
2177   X(4);X(5);X(6);X(7);                          \
2178   X(8);X(9);X(10);X(11)
2179 
2180 #define MAKE16(X)                               \
2181   X(0);X(1);X(2);X(3);                          \
2182   X(4);X(5);X(6);X(7);                          \
2183   X(8);X(9);X(10);X(11);                        \
2184   X(12);X(13);X(14);X(15)
2185 
2186 /* create a function of unrolling N ; the MAKEN is the unrolling
2187    macro, defined above. The N in MAKEN must match N, obviously. */
2188 #define FUNC(N, MAKEN)                          \
2189   static inline void aesni_encrypt##N(unsigned char *out, unsigned char *n, __m128i rkeys[16]) { \
2190     __m128i nv0i = _mm_load_si128((const __m128i *)n);                  \
2191     long long nl = *(long long*)&n[8];                                  \
2192     MAKEN(NVxV_DEC);                                                    \
2193     /* check for nonce wraparound */                                    \
2194     if ((nl < 0) && (nl + N) >= 0) {                                \
2195       int wrapnumber = (int)(N - (nl+N));                               \
2196       MAKEN(NVxV_WRAP);                                                 \
2197       _mm_storeu_si128((__m128i*)n, _mm_add_epi64(nv0i, _mm_set_epi64x(N,1))); \
2198     } else {                                                            \
2199       MAKEN(NVxV_NOWRAP);                                               \
2200       _mm_storeu_si128((__m128i*)n, _mm_add_epi64(nv0i, _mm_set_epi64x(N,0))); \
2201     }                                                                   \
2202     int i;                                                              \
2203     MAKEN(TEMPx);                                                       \
2204     for (i = 1 ; i < 14 ; i++) {                                        \
2205       MAKEN(AESENCx);                                                   \
2206     }                                                                   \
2207     MAKEN(AESENCLASTx);                                                 \
2208     MAKEN(STOREx);                                                      \
2209   }
2210 
2211 /* and now building our unrolled function is trivial */
2212 FUNC(4, MAKE4)
2213 FUNC(6, MAKE6)
2214 FUNC(7, MAKE7)
2215 FUNC(8, MAKE8)
2216 FUNC(10, MAKE10)
2217 FUNC(12, MAKE12)
2218 FUNC(16, MAKE16)
2219 
crypto_stream(unsigned char * out,unsigned long long outlen,unsigned char * n,const unsigned char * k)2220 void crypto_stream(
2221 unsigned char *out,
2222 unsigned long long outlen,
2223 unsigned char *n,
2224 const unsigned char *k
2225 )
2226 {
2227    __m128i rkeys[16];
2228    ALIGN16 unsigned char n2[16];
2229    unsigned long long i, j;
2230    aesni_key256_expand(k, rkeys);
2231    /* n2 is in byte-reversed (i.e., native little endian)
2232       order to make increment/testing easier */
2233    (*(unsigned long long*)&n2[8]) = _bswap64((*(unsigned long long*)&n[8]));
2234    (*(unsigned long long*)&n2[0]) = _bswap64((*(unsigned long long*)&n[0]));
2235 
2236 #define LOOP(iter)                                          \
2237    int lb = iter * 16;                                      \
2238    for (i = 0 ; i < outlen ; i+= lb) {                      \
2239       ALIGN16 unsigned char outni[lb];                      \
2240       aesni_encrypt##iter(outni, n2, rkeys);                \
2241       unsigned long long mj = lb;                           \
2242       if ((i+mj)>=outlen)                                   \
2243          mj = outlen-i;                                     \
2244       for (j = 0 ; j < mj ; j++)                            \
2245          out[i+j] = outni[j];                               \
2246    }
2247 
2248    LOOP(8);
2249 
2250    (*(unsigned long long*)&n[8]) = _bswap64((*(unsigned long long*)&n2[8]));
2251    (*(unsigned long long*)&n[0]) = _bswap64((*(unsigned long long*)&n2[0]));
2252 }
2253 
2254 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2255 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2256 {
2257    crypto_stream(out, BUFSIZE, iv, key);
2258 }
2259 
2260 /*****************************************************************/
2261 
2262 #elif defined(NTL_HAVE_KMA)
2263 
2264 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2265 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2266 {
2267    static const unsigned char zerobuf[BUFSIZE] = {0};
2268    unsigned long fc = CPACF_KMA_GCM_AES_256 | CPACF_KMA_HS | CPACF_KMA_LAAD;
2269    struct {
2270       unsigned char reserved[12];
2271       unsigned int cv;
2272       unsigned char _[48];
2273       unsigned char j0[16];
2274       unsigned char k[32];
2275    } param;
2276 
2277    memcpy(&param.cv, &iv[12], sizeof(param.cv));
2278    param.cv--;
2279    memcpy(&param.j0[0], &iv[0], sizeof(param.j0) - sizeof(param.cv));
2280    memcpy(&param.j0[12], &param.cv, sizeof(param.cv));
2281    memcpy(param.k, key, sizeof(param.k));
2282 
2283    cpacf_kma(fc, &param, out, NULL, 0, zerobuf, sizeof(zerobuf));
2284 
2285    param.cv++;
2286    memcpy(&iv[12], &param.cv, sizeof(param.cv));
2287 }
2288 
2289 #else
2290 
2291 /*****************************************************************
2292 This AES-256 reference implementation is derived from
2293 public domain code.
2294 
2295 Authors:
2296 Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
2297 Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
2298 Paulo Barreto <paulo.barreto@terra.com.br>
2299 
2300 Obtained from:
2301 https://github.com/zakird/zdlibc/blob/master/rijndael-alg-fst.c
2302 */
2303 
2304 typedef uint8_t u8;
2305 typedef uint32_t u32;
2306 
2307 static const u32 Te0[256] = {
2308    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
2309    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
2310    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
2311    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
2312    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
2313    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
2314    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
2315    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
2316    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
2317    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
2318    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
2319    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
2320    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
2321    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
2322    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
2323    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
2324    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
2325    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
2326    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
2327    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
2328    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
2329    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
2330    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
2331    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
2332    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
2333    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
2334    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
2335    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
2336    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
2337    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
2338    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
2339    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
2340    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
2341    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
2342    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
2343    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
2344    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
2345    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
2346    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
2347    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
2348    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
2349    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
2350    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
2351    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
2352    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
2353    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
2354    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
2355    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
2356    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
2357    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
2358    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
2359    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
2360    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
2361    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
2362    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
2363    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
2364    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
2365    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
2366    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
2367    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
2368    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
2369    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
2370    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
2371    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
2372 };
2373 static const u32 Te1[256] = {
2374    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
2375    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
2376    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
2377    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
2378    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
2379    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
2380    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
2381    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
2382    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
2383    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
2384    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
2385    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
2386    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
2387    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
2388    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
2389    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
2390    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
2391    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
2392    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
2393    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
2394    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
2395    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
2396    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
2397    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
2398    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
2399    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
2400    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
2401    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
2402    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
2403    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
2404    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
2405    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
2406    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
2407    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
2408    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
2409    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
2410    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
2411    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
2412    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
2413    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
2414    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
2415    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
2416    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
2417    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
2418    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
2419    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
2420    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
2421    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
2422    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
2423    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
2424    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
2425    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
2426    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
2427    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
2428    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
2429    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
2430    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
2431    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
2432    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
2433    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
2434    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
2435    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
2436    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
2437    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
2438 };
2439 static const u32 Te2[256] = {
2440    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
2441    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
2442    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
2443    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
2444    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
2445    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
2446    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
2447    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
2448    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
2449    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
2450    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
2451    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
2452    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
2453    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
2454    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
2455    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
2456    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
2457    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
2458    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
2459    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
2460    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
2461    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
2462    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
2463    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
2464    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
2465    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
2466    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
2467    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
2468    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
2469    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
2470    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
2471    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
2472    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
2473    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
2474    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
2475    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
2476    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
2477    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
2478    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
2479    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
2480    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
2481    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
2482    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
2483    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
2484    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
2485    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
2486    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
2487    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
2488    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
2489    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
2490    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
2491    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
2492    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
2493    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
2494    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
2495    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
2496    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
2497    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
2498    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
2499    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
2500    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
2501    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
2502    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
2503    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
2504 };
2505 static const u32 Te3[256] = {
2506    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
2507    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
2508    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
2509    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
2510    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
2511    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
2512    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
2513    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
2514    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
2515    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
2516    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
2517    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
2518    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
2519    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
2520    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
2521    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
2522    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
2523    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
2524    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
2525    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
2526    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
2527    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
2528    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
2529    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
2530    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
2531    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
2532    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
2533    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
2534    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
2535    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
2536    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
2537    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
2538    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
2539    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
2540    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
2541    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
2542    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
2543    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
2544    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
2545    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
2546    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
2547    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
2548    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
2549    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
2550    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
2551    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
2552    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
2553    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
2554    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
2555    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
2556    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
2557    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
2558    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
2559    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
2560    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
2561    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
2562    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
2563    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
2564    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
2565    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
2566    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
2567    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
2568    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
2569    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
2570 };
2571 static const u32 Te4[256] = {
2572    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
2573    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
2574    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
2575    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
2576    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
2577    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
2578    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
2579    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
2580    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
2581    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
2582    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
2583    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
2584    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
2585    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
2586    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
2587    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
2588    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
2589    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
2590    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
2591    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
2592    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
2593    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
2594    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
2595    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
2596    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
2597    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
2598    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
2599    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
2600    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
2601    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
2602    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
2603    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
2604    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
2605    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
2606    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
2607    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
2608    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
2609    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
2610    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
2611    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
2612    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
2613    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
2614    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
2615    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
2616    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
2617    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
2618    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
2619    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
2620    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
2621    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
2622    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
2623    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
2624    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
2625    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
2626    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
2627    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
2628    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
2629    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
2630    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
2631    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
2632    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
2633    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
2634    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
2635    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
2636 };
2637 static const u32 rcon[] = {
2638    0x01000000, 0x02000000, 0x04000000, 0x08000000,
2639    0x10000000, 0x20000000, 0x40000000, 0x80000000,
2640    0x1B000000, 0x36000000,
2641 };
2642 
2643 #define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
2644 
2645 #ifdef _MSC_VER
2646 #define GETU32(p) SWAP(*((u32 *)(p)))
2647 #define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
2648 #else
2649 #define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))
2650 #define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); }
2651 #endif
2652 
2653 /**
2654  * Expand the cipher key into the encryption key schedule.
2655  */
AES256KeySetupEnc(u32 rk[60],const u8 cipherKey[32])2656 void AES256KeySetupEnc(u32 rk[60], const u8 cipherKey[32]) {
2657    int i = 0;
2658    u32 temp;
2659 
2660    rk[0] = GETU32(cipherKey     );
2661    rk[1] = GETU32(cipherKey +  4);
2662    rk[2] = GETU32(cipherKey +  8);
2663    rk[3] = GETU32(cipherKey + 12);
2664    rk[4] = GETU32(cipherKey + 16);
2665    rk[5] = GETU32(cipherKey + 20);
2666    rk[6] = GETU32(cipherKey + 24);
2667    rk[7] = GETU32(cipherKey + 28);
2668 
2669    for (;;) {
2670       temp = rk[ 7];
2671 
2672       rk[ 8] = rk[ 0] ^
2673                (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
2674                (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
2675                (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
2676                (Te4[(temp >> 24)       ] & 0x000000ff) ^
2677                rcon[i];
2678 
2679       rk[ 9] = rk[ 1] ^ rk[ 8];
2680       rk[10] = rk[ 2] ^ rk[ 9];
2681       rk[11] = rk[ 3] ^ rk[10];
2682 
2683       if (++i == 7)
2684          return;
2685 
2686       temp = rk[11];
2687 
2688       rk[12] = rk[ 4] ^
2689                (Te4[(temp >> 24)       ] & 0xff000000) ^
2690                (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
2691                (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^
2692                (Te4[(temp      ) & 0xff] & 0x000000ff);
2693 
2694       rk[13] = rk[ 5] ^ rk[12];
2695       rk[14] = rk[ 6] ^ rk[13];
2696       rk[15] = rk[ 7] ^ rk[14];
2697 
2698       rk += 8;
2699    }
2700 }
2701 
AES256Encrypt(const u32 rk[60],const u8 pt[16],u8 ct[16])2702 void AES256Encrypt(const u32 rk[60], const u8 pt[16], u8 ct[16]) {
2703    u32 s0, s1, s2, s3, t0, t1, t2, t3;
2704    int r, Nr = 14;
2705 
2706    /*
2707     * map byte array block to cipher state
2708     * and add initial round key:
2709     */
2710    s0 = GETU32(pt     ) ^ rk[0];
2711    s1 = GETU32(pt +  4) ^ rk[1];
2712    s2 = GETU32(pt +  8) ^ rk[2];
2713    s3 = GETU32(pt + 12) ^ rk[3];
2714 
2715    /*
2716     * Nr - 1 full rounds:
2717     */
2718    r = Nr >> 1;
2719 
2720    for (;;) {
2721       t0 = Te0[(s0 >> 24)       ] ^
2722            Te1[(s1 >> 16) & 0xff] ^
2723            Te2[(s2 >>  8) & 0xff] ^
2724            Te3[(s3      ) & 0xff] ^
2725            rk[4];
2726       t1 = Te0[(s1 >> 24)       ] ^
2727            Te1[(s2 >> 16) & 0xff] ^
2728            Te2[(s3 >>  8) & 0xff] ^
2729            Te3[(s0      ) & 0xff] ^
2730            rk[5];
2731       t2 = Te0[(s2 >> 24)       ] ^
2732            Te1[(s3 >> 16) & 0xff] ^
2733            Te2[(s0 >>  8) & 0xff] ^
2734            Te3[(s1      ) & 0xff] ^
2735            rk[6];
2736       t3 = Te0[(s3 >> 24)       ] ^
2737            Te1[(s0 >> 16) & 0xff] ^
2738            Te2[(s1 >>  8) & 0xff] ^
2739            Te3[(s2      ) & 0xff] ^
2740            rk[7];
2741 
2742       rk += 8;
2743 
2744       if (--r == 0)
2745          break;
2746 
2747       s0 = Te0[(t0 >> 24)       ] ^
2748            Te1[(t1 >> 16) & 0xff] ^
2749            Te2[(t2 >>  8) & 0xff] ^
2750            Te3[(t3      ) & 0xff] ^
2751            rk[0];
2752       s1 = Te0[(t1 >> 24)       ] ^
2753            Te1[(t2 >> 16) & 0xff] ^
2754            Te2[(t3 >>  8) & 0xff] ^
2755            Te3[(t0      ) & 0xff] ^
2756            rk[1];
2757       s2 = Te0[(t2 >> 24)       ] ^
2758            Te1[(t3 >> 16) & 0xff] ^
2759            Te2[(t0 >>  8) & 0xff] ^
2760            Te3[(t1      ) & 0xff] ^
2761            rk[2];
2762       s3 = Te0[(t3 >> 24)       ] ^
2763            Te1[(t0 >> 16) & 0xff] ^
2764            Te2[(t1 >>  8) & 0xff] ^
2765            Te3[(t2      ) & 0xff] ^
2766            rk[3];
2767    }
2768    /*
2769     * apply last round and
2770     * map cipher state to byte array block:
2771     */
2772    s0 = (Te4[(t0 >> 24)       ] & 0xff000000) ^
2773         (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
2774         (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
2775         (Te4[(t3      ) & 0xff] & 0x000000ff) ^
2776         rk[0];
2777 
2778    PUTU32(ct     , s0);
2779 
2780    s1 = (Te4[(t1 >> 24)       ] & 0xff000000) ^
2781         (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
2782         (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
2783         (Te4[(t0      ) & 0xff] & 0x000000ff) ^
2784         rk[1];
2785 
2786    PUTU32(ct +  4, s1);
2787 
2788    s2 = (Te4[(t2 >> 24)       ] & 0xff000000) ^
2789         (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
2790         (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
2791         (Te4[(t1      ) & 0xff] & 0x000000ff) ^
2792         rk[2];
2793 
2794    PUTU32(ct +  8, s2);
2795 
2796    s3 = (Te4[(t3 >> 24)       ] & 0xff000000) ^
2797         (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
2798         (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
2799         (Te4[(t2      ) & 0xff] & 0x000000ff) ^
2800         rk[3];
2801 
2802    PUTU32(ct + 12, s3);
2803 }
2804 
2805 /*****************************************************************/
2806 
2807 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2808 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2809 {
2810    u32 rk[60];
2811    int i;
2812 
2813    AES256KeySetupEnc(rk, key);
2814 
2815    for (i = 0; i < BUFSIZE; i += 16) {
2816       AES256Encrypt(rk, iv, out + i);
2817       inc32(iv);
2818    }
2819 }
2820 
2821 #endif
2822 
2823 struct RandomStream_impl {
2824    unsigned char key[32];
2825    unsigned char iv[16];
2826    unsigned char buf[BUFSIZE];
2827 
2828    explicit
RandomStream_implRandomStream_impl2829    RandomStream_impl(const unsigned char *k)
2830    {
2831       memcpy(key, k, sizeof(key));
2832       memset(iv, 0, sizeof(iv));
2833       iv[15] = 1; // nonce = 1
2834    }
2835 
2836    const unsigned char *
get_bufRandomStream_impl2837    get_buf() const
2838    {
2839       return buf + sizeof(key);
2840    }
2841 
2842    long
get_buf_lenRandomStream_impl2843    get_buf_len() const
2844    {
2845       return sizeof(buf) - sizeof(key);
2846    }
2847 
get_bytesRandomStream_impl2848    long get_bytes(unsigned char *res, long n, long pos)
2849    {
2850       size_t len;
2851 
2852       if (n < 0)
2853          LogicError("RandomStream::get: bad args");
2854 
2855       if (n > 0 && sizeof(buf) - sizeof(key) - pos > 0) {
2856          len = min((size_t)n, sizeof(buf) - sizeof(key) - pos);
2857          memcpy(res, buf + sizeof(key) + pos, len);
2858 
2859          n -= len;
2860          res += len;
2861          pos += len;
2862       }
2863 
2864       while (n > 0) {
2865          aes256ctr_stream(buf, iv, key);
2866          memcpy(key, buf, sizeof(key));
2867 
2868          len = min((size_t)n, sizeof(buf) - sizeof(key));
2869          memcpy(res, buf + sizeof(key), len);
2870 
2871 	 n -= len;
2872          res += len;
2873          pos = len;
2874       }
2875 
2876       return pos;
2877    }
2878 
set_nonceRandomStream_impl2879    void set_nonce(unsigned long nonce)
2880    {
2881       // low-order  8 bytes of iv set to zero
2882       // high-order 8 bytes of iv set to nonce
2883       memset(iv, 0, sizeof(iv));
2884       iv[ 8] = (unsigned char) nonce; nonce >>= 8;
2885       iv[ 9] = (unsigned char) nonce; nonce >>= 8;
2886       iv[10] = (unsigned char) nonce; nonce >>= 8;
2887       iv[11] = (unsigned char) nonce; nonce >>= 8;
2888       iv[12] = (unsigned char) nonce; nonce >>= 8;
2889       iv[13] = (unsigned char) nonce; nonce >>= 8;
2890       iv[14] = (unsigned char) nonce; nonce >>= 8;
2891       iv[15] = (unsigned char) nonce; nonce >>= 8;
2892    }
2893 };
2894 
2895 #else // defined(NTL_RANDOM_AES256CTR)
2896 
2897 #if (defined(NTL_HAVE_AVX2) || defined(NTL_HAVE_SSSE3))
2898 
2899 
2900 /*****************************************************************
2901 
2902 This AVX2 implementation is derived from public domain code
2903 originally developed by Martin Goll Shay Gueron, and obtained from
2904 here:
2905 
2906 https://github.com/floodyberry/supercop/tree/master/crypto_stream/chacha20/goll_gueron
2907 
2908 On a Haswell machine, ths code is about 4.x faster than the vanilla
2909 C code.
2910 
2911 The following is the README from that page
2912 
2913 ==================================================================
2914 
2915 This code implements Daniel J. Bernstein's ChaCha stream cipher in C,
2916 targeting architectures with AVX2 and future AVX512 vector extensions.
2917 
2918 The implementation improves the slightly modified implementations of Ted Krovetz in the Chromium Project
2919 (http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/nss/nss/lib/freebl/chacha20/chacha20_vec.c and
2920 http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/openssl/openssl/crypto/chacha/chacha_vec.c)
2921 by using the Advanced Vector Extensions AVX2 and, if available in future, AVX512 to widen the vectorization
2922 to 256-bit, respectively 512-bit.
2923 
2924 On Intel's Haswell architecture this implementation (using AVX2) is almost ~2x faster than the fastest
2925 implementation here, when encrypting (decrypting) 2 blocks and more. Also, this implementation is expected
2926 to double the speed again, when encrypting (decrypting) 4 blocks and more, running on a future architecture
2927 with support for AVX512.
2928 
2929 Further details and our measurement results are provided in:
2930 Goll, M., and Gueron,S.: Vectorization of ChaCha Stream Cipher. Cryptology ePrint Archive,
2931 Report 2013/759, November, 2013, http://eprint.iacr.org/2013/759.pdf
2932 
2933 Developers and authors:
2934 *********************************************************
2935 Martin Goll (1) and Shay Gueron (2, 3),
2936 (1) Ruhr-University Bochum, Germany
2937 (2) University of Haifa, Israel
2938 (3) Intel Corporation, Israel Development Center, Haifa, Israel
2939 *********************************************************
2940 
2941 Intellectual Property Notices
2942 -----------------------------
2943 
2944 There are no known present or future claims by a copyright holder that the
2945 distribution of this software infringes the copyright. In particular, the author
2946 of the software is not making such claims and does not intend to make such
2947 claims.
2948 
2949 There are no known present or future claims by a patent holder that the use of
2950 this software infringes the patent. In particular, the author of the software is
2951 not making such claims and does not intend to make such claims.
2952 
2953 Our implementation is in public domain.
2954 
2955 *****************************************************************/
2956 
2957 
2958 // round selector, specified values:
2959 //  8:  low security - high speed
2960 // 12:  mid security -  mid speed
2961 // 20: high security -  low speed
2962 #ifndef CHACHA_RNDS
2963 #define CHACHA_RNDS 20
2964 #endif
2965 
2966 
2967 #if (defined(NTL_HAVE_AVX2))
2968 
2969 typedef __m256i ivec_t;
2970 
2971 #define DELTA	_mm256_set_epi64x(0,2,0,2)
2972 #define START   _mm256_set_epi64x(0,1,0,0)
2973 #define NONCE(nonce) _mm256_set_epi64x(nonce, 1, nonce, 0)
2974 
2975 #define STOREU_VEC(m,r)	_mm256_storeu_si256((__m256i*)(m), r)
2976 #define STORE_VEC(m,r)	_mm256_store_si256((__m256i*)(m), r)
2977 
2978 #define LOAD_VEC(r,m) r = _mm256_load_si256((const __m256i *)(m))
2979 #define LOADU_VEC(r,m) r = _mm256_loadu_si256((const __m256i *)(m))
2980 
2981 #define LOADU_VEC_128(r, m) r = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)(m)))
2982 
2983 
2984 
2985 #define ADD_VEC_32(a,b)	_mm256_add_epi32(a, b)
2986 #define ADD_VEC_64(a,b)	_mm256_add_epi64(a, b)
2987 #define XOR_VEC(a,b)	_mm256_xor_si256(a, b)
2988 
2989 
2990 #define ROR_VEC_V1(x)	_mm256_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
2991 #define ROR_VEC_V2(x)	_mm256_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
2992 #define ROR_VEC_V3(x)	_mm256_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
2993 #define ROL_VEC_7(x)	XOR_VEC(_mm256_slli_epi32(x, 7), _mm256_srli_epi32(x,25))
2994 #define ROL_VEC_12(x)	XOR_VEC(_mm256_slli_epi32(x,12), _mm256_srli_epi32(x,20))
2995 
2996 #define ROL_VEC_8(x)	_mm256_shuffle_epi8(x,_mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
2997 
2998 
2999 #define ROL_VEC_16(x)	_mm256_shuffle_epi8(x,_mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
3000 
3001 
3002 
3003 #define WRITEU_VEC(op, d, v0, v1, v2, v3)						\
3004     STOREU_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20));	\
3005     STOREU_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20));	\
3006     STOREU_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31));	\
3007     STOREU_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
3008 
3009 #define WRITE_VEC(op, d, v0, v1, v2, v3)						\
3010     STORE_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20));	\
3011     STORE_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20));	\
3012     STORE_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31));	\
3013     STORE_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
3014 
3015 #define SZ_VEC (32)
3016 
3017 #define RANSTREAM_NCHUNKS (2)
3018 // leads to a BUFSZ of 512
3019 
3020 
3021 #elif defined(NTL_HAVE_SSSE3)
3022 
3023 typedef __m128i ivec_t;
3024 
3025 #define DELTA	_mm_set_epi32(0,0,0,1)
3026 #define START   _mm_setzero_si128()
3027 #define NONCE(nonce) _mm_set_epi64x(nonce,0)
3028 
3029 #define STOREU_VEC(m,r)	_mm_storeu_si128((__m128i*)(m), r)
3030 #define STORE_VEC(m,r)	_mm_store_si128((__m128i*)(m), r)
3031 
3032 #define LOAD_VEC(r,m) r = _mm_load_si128((const __m128i *)(m))
3033 #define LOADU_VEC(r,m) r = _mm_loadu_si128((const __m128i *)(m))
3034 
3035 #define LOADU_VEC_128(r, m) r = _mm_loadu_si128((const __m128i*)(m))
3036 
3037 #define ADD_VEC_32(a,b)	_mm_add_epi32(a, b)
3038 #define ADD_VEC_64(a,b)	_mm_add_epi64(a, b)
3039 #define XOR_VEC(a,b)	_mm_xor_si128(a, b)
3040 
3041 
3042 #define ROR_VEC_V1(x)	_mm_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
3043 #define ROR_VEC_V2(x)	_mm_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
3044 #define ROR_VEC_V3(x)	_mm_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
3045 #define ROL_VEC_7(x)	XOR_VEC(_mm_slli_epi32(x, 7), _mm_srli_epi32(x,25))
3046 #define ROL_VEC_12(x)	XOR_VEC(_mm_slli_epi32(x,12), _mm_srli_epi32(x,20))
3047 
3048 #define ROL_VEC_8(x)	_mm_shuffle_epi8(x,_mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
3049 
3050 
3051 #define ROL_VEC_16(x)	_mm_shuffle_epi8(x,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
3052 
3053 
3054 #define WRITEU_VEC(op, d, v0, v1, v2, v3)	\
3055     STOREU_VEC(op + (d + 0*4), v0);	\
3056     STOREU_VEC(op + (d + 4*4), v1);	\
3057     STOREU_VEC(op + (d + 8*4), v2);	\
3058     STOREU_VEC(op + (d +12*4), v3);
3059 
3060 #define WRITE_VEC(op, d, v0, v1, v2, v3)	\
3061     STORE_VEC(op + (d + 0*4), v0);	\
3062     STORE_VEC(op + (d + 4*4), v1);	\
3063     STORE_VEC(op + (d + 8*4), v2);	\
3064     STORE_VEC(op + (d +12*4), v3);
3065 
3066 #define SZ_VEC (16)
3067 
3068 #define RANSTREAM_NCHUNKS (4)
3069 // leads to a BUFSZ of 512
3070 
3071 #else
3072 
3073 #error "unsupported architecture"
3074 
3075 #endif
3076 
3077 
3078 #define DQROUND_VECTORS_VEC(a,b,c,d)				\
3079     a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_16(d);	\
3080     c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_12(b);	\
3081     a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_8(d);	\
3082     c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_7(b);	\
3083     b = ROR_VEC_V1(b); c = ROR_VEC_V2(c); d = ROR_VEC_V3(d);	\
3084     a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_16(d);	\
3085     c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_12(b);	\
3086     a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_8(d);	\
3087     c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_7(b);	\
3088     b = ROR_VEC_V3(b); c = ROR_VEC_V2(c); d = ROR_VEC_V1(d);
3089 
3090 
3091 
3092 #define RANSTREAM_STATESZ (4*SZ_VEC)
3093 
3094 #define RANSTREAM_CHUNKSZ (2*RANSTREAM_STATESZ)
3095 #define RANSTREAM_BUFSZ   (RANSTREAM_NCHUNKS*RANSTREAM_CHUNKSZ)
3096 
3097 
3098 struct RandomStream_impl {
3099 
3100    AlignedArray<unsigned char> state_store;
3101    AlignedArray<unsigned char> buf_store;
3102    long chunk_count;
3103 
allocate_spaceRandomStream_impl3104    void allocate_space()
3105    {
3106       state_store.SetLength(RANSTREAM_STATESZ);
3107       buf_store.SetLength(RANSTREAM_BUFSZ);
3108    }
3109 
3110 
3111    explicit
RandomStream_implRandomStream_impl3112    RandomStream_impl(const unsigned char *key)
3113    {
3114       allocate_space();
3115 
3116       unsigned char *state = state_store.elts();
3117 
3118       unsigned int chacha_const[] = {
3119 	      0x61707865,0x3320646E,0x79622D32,0x6B206574
3120       };
3121 
3122 
3123       ivec_t d0, d1, d2, d3;
3124       LOADU_VEC_128(d0, chacha_const);
3125       LOADU_VEC_128(d1, key);
3126       LOADU_VEC_128(d2, key+16);
3127 
3128       d3 = START;
3129 
3130 
3131       STORE_VEC(state + 0*SZ_VEC, d0);
3132       STORE_VEC(state + 1*SZ_VEC, d1);
3133       STORE_VEC(state + 2*SZ_VEC, d2);
3134       STORE_VEC(state + 3*SZ_VEC, d3);
3135 
3136       chunk_count = 0;
3137    }
3138 
RandomStream_implRandomStream_impl3139    RandomStream_impl(const RandomStream_impl& other)
3140    {
3141       allocate_space();
3142       *this = other;
3143    }
3144 
operator =RandomStream_impl3145    RandomStream_impl& operator=(const RandomStream_impl& other)
3146    {
3147       std::memcpy(state_store.elts(), other.state_store.elts(), RANSTREAM_STATESZ);
3148       std::memcpy(buf_store.elts(), other.buf_store.elts(), RANSTREAM_BUFSZ);
3149       chunk_count = other.chunk_count;
3150       return *this;
3151    }
3152 
3153    const unsigned char *
get_bufRandomStream_impl3154    get_buf() const
3155    {
3156       return buf_store.elts();
3157    }
3158 
3159    long
get_buf_lenRandomStream_impl3160    get_buf_len() const
3161    {
3162       return RANSTREAM_BUFSZ;
3163    }
3164 
3165    // bytes are generated in chunks of RANSTREAM_BUFSZ bytes, except that
3166    // initially, we may generate a few chunks of RANSTREAM_CHUNKSZ
3167    // bytes.  This optimizes a bit for short bursts following a reset.
3168 
3169    long
get_bytesRandomStream_impl3170    get_bytes(unsigned char *NTL_RESTRICT res,
3171              long n, long pos)
3172    {
3173       if (n < 0) LogicError("RandomStream::get: bad args");
3174       if (n == 0) return pos;
3175 
3176       unsigned char *NTL_RESTRICT buf = buf_store.elts();
3177 
3178       if (n <= RANSTREAM_BUFSZ-pos) {
3179 	 std::memcpy(&res[0], &buf[pos], n);
3180 	 pos += n;
3181 	 return pos;
3182       }
3183 
3184       unsigned char *NTL_RESTRICT state = state_store.elts();
3185 
3186       ivec_t d0, d1, d2, d3;
3187       LOAD_VEC(d0, state + 0*SZ_VEC);
3188       LOAD_VEC(d1, state + 1*SZ_VEC);
3189       LOAD_VEC(d2, state + 2*SZ_VEC);
3190       LOAD_VEC(d3, state + 3*SZ_VEC);
3191 
3192 
3193       // read remainder of buffer
3194       std::memcpy(&res[0], &buf[pos], RANSTREAM_BUFSZ-pos);
3195       n -= RANSTREAM_BUFSZ-pos;
3196       res += RANSTREAM_BUFSZ-pos;
3197       pos = RANSTREAM_BUFSZ;
3198 
3199       long i = 0;
3200       for (;  i <= n-RANSTREAM_BUFSZ; i += RANSTREAM_BUFSZ) {
3201 
3202          chunk_count |= RANSTREAM_NCHUNKS;  // disable small buffer strategy
3203 
3204 	 for (long j = 0; j < RANSTREAM_NCHUNKS; j++) {
3205 	    ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
3206 	    ivec_t v4=d0, v5=d1, v6=d2, v7=ADD_VEC_64(d3, DELTA);
3207 
3208 	    for (long k = 0; k < CHACHA_RNDS/2; k++) {
3209 		    DQROUND_VECTORS_VEC(v0,v1,v2,v3)
3210 		    DQROUND_VECTORS_VEC(v4,v5,v6,v7)
3211 	    }
3212 
3213 	    WRITEU_VEC(res+i+j*(8*SZ_VEC), 0, ADD_VEC_32(v0,d0), ADD_VEC_32(v1,d1), ADD_VEC_32(v2,d2), ADD_VEC_32(v3,d3))
3214 	    d3 = ADD_VEC_64(d3, DELTA);
3215 	    WRITEU_VEC(res+i+j*(8*SZ_VEC), 4*SZ_VEC, ADD_VEC_32(v4,d0), ADD_VEC_32(v5,d1), ADD_VEC_32(v6,d2), ADD_VEC_32(v7,d3))
3216 	    d3 = ADD_VEC_64(d3, DELTA);
3217 
3218 	 }
3219 
3220       }
3221 
3222       if (i < n) {
3223 
3224          long nchunks;
3225 
3226          if (chunk_count < RANSTREAM_NCHUNKS) {
3227             nchunks = long(cast_unsigned((n-i)+RANSTREAM_CHUNKSZ-1)/RANSTREAM_CHUNKSZ);
3228             chunk_count += nchunks;
3229          }
3230          else
3231             nchunks = RANSTREAM_NCHUNKS;
3232 
3233          long pos_offset = RANSTREAM_BUFSZ - nchunks*RANSTREAM_CHUNKSZ;
3234          buf += pos_offset;
3235 
3236 	 for (long j = 0; j < nchunks; j++) {
3237 	    ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
3238 	    ivec_t v4=d0, v5=d1, v6=d2, v7=ADD_VEC_64(d3, DELTA);
3239 
3240 	    for (long k = 0; k < CHACHA_RNDS/2; k++) {
3241                DQROUND_VECTORS_VEC(v0,v1,v2,v3)
3242                DQROUND_VECTORS_VEC(v4,v5,v6,v7)
3243 	    }
3244 
3245 	    WRITE_VEC(buf+j*(8*SZ_VEC), 0, ADD_VEC_32(v0,d0), ADD_VEC_32(v1,d1), ADD_VEC_32(v2,d2), ADD_VEC_32(v3,d3))
3246 	    d3 = ADD_VEC_64(d3, DELTA);
3247 	    WRITE_VEC(buf+j*(8*SZ_VEC), 4*SZ_VEC, ADD_VEC_32(v4,d0), ADD_VEC_32(v5,d1), ADD_VEC_32(v6,d2), ADD_VEC_32(v7,d3))
3248 	    d3 = ADD_VEC_64(d3, DELTA);
3249 	 }
3250 
3251 	 pos = n-i+pos_offset;
3252 	 std::memcpy(&res[i], &buf[0], n-i);
3253       }
3254 
3255       STORE_VEC(state + 3*SZ_VEC, d3);
3256 
3257       return pos;
3258    }
3259 
set_nonceRandomStream_impl3260    void set_nonce(unsigned long nonce)
3261    {
3262       unsigned char *state = state_store.elts();
3263       ivec_t d3;
3264       d3 = NONCE(nonce);
3265       STORE_VEC(state + 3*SZ_VEC, d3);
3266       chunk_count = 0;
3267    }
3268 
3269 };
3270 
3271 
3272 #else
3273 
3274 struct RandomStream_impl {
3275    _ntl_uint32 state[16];
3276    unsigned char buf[64];
3277 
3278    explicit
RandomStream_implRandomStream_impl3279    RandomStream_impl(const unsigned char *key)
3280    {
3281       salsa20_init(state, key);
3282    }
3283 
3284    const unsigned char *
get_bufRandomStream_impl3285    get_buf() const
3286    {
3287       return &buf[0];
3288    }
3289 
3290    long
get_buf_lenRandomStream_impl3291    get_buf_len() const
3292    {
3293       return 64;
3294    }
3295 
get_bytesRandomStream_impl3296    long get_bytes(unsigned char *res, long n, long pos)
3297    {
3298       if (n < 0) LogicError("RandomStream::get: bad args");
3299 
3300       long i, j;
3301 
3302       if (n <= 64-pos) {
3303 	 for (i = 0; i < n; i++) res[i] = buf[pos+i];
3304 	 pos += n;
3305 	 return pos;
3306       }
3307 
3308       // read remainder of buffer
3309       for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
3310       n -= 64-pos;
3311       res += 64-pos;
3312       pos = 64;
3313 
3314       _ntl_uint32 wdata[16];
3315 
3316       // read 64-byte chunks
3317       for (i = 0; i <= n-64; i += 64) {
3318 	 salsa20_apply(state, wdata);
3319 	 for (j = 0; j < 16; j++)
3320 	    FROMLE(res + i + 4*j, wdata[j]);
3321       }
3322 
3323       if (i < n) {
3324 	 salsa20_apply(state, wdata);
3325 
3326 	 for (j = 0; j < 16; j++)
3327 	    FROMLE(buf + 4*j, wdata[j]);
3328 
3329 	 pos = n-i;
3330 	 for (j = 0; j < pos; j++)
3331 	    res[i+j] = buf[j];
3332       }
3333 
3334       return pos;
3335    }
3336 
set_nonceRandomStream_impl3337    void set_nonce(unsigned long nonce)
3338    {
3339       _ntl_uint32 nonce0, nonce1;
3340 
3341       nonce0 = nonce;
3342       nonce0 = INT32MASK(nonce0);
3343 
3344       nonce1 = 0;
3345 
3346 #if (NTL_BITS_PER_LONG > 32)
3347       nonce1 = nonce >> 32;
3348       nonce1 = INT32MASK(nonce1);
3349 #endif
3350 
3351       state[12] = 0;
3352       state[13] = 0;
3353       state[14] = nonce0;
3354       state[15] = nonce1;
3355    }
3356 };
3357 
3358 
3359 #endif
3360 #endif // defined(NTL_RANDOMSTREAM_AES256CTR)
3361 
3362 
3363 
3364 // Boilerplate PIMPL code
3365 
3366 RandomStream_impl *
RandomStream_impl_build(const unsigned char * key)3367 RandomStream_impl_build(const unsigned char *key)
3368 {
3369    UniquePtr<RandomStream_impl> p;
3370    p.make(key);
3371    return p.release();
3372 }
3373 
3374 RandomStream_impl *
RandomStream_impl_build(const RandomStream_impl & other)3375 RandomStream_impl_build(const RandomStream_impl& other)
3376 {
3377    UniquePtr<RandomStream_impl> p;
3378    p.make(other);
3379    return p.release();
3380 }
3381 
3382 void
RandomStream_impl_copy(RandomStream_impl & x,const RandomStream_impl & y)3383 RandomStream_impl_copy(RandomStream_impl& x, const RandomStream_impl& y)
3384 {
3385    x = y;
3386 }
3387 
3388 void
RandomStream_impl_delete(RandomStream_impl * p)3389 RandomStream_impl_delete(RandomStream_impl* p)
3390 {
3391    delete p;
3392 }
3393 
3394 const unsigned char *
RandomStream_impl_get_buf(const RandomStream_impl & x)3395 RandomStream_impl_get_buf(const RandomStream_impl& x)
3396 {
3397    return x.get_buf();
3398 }
3399 
3400 long
RandomStream_impl_get_buf_len(const RandomStream_impl & x)3401 RandomStream_impl_get_buf_len(const RandomStream_impl& x)
3402 {
3403    return x.get_buf_len();
3404 }
3405 
3406 long
RandomStream_impl_get_bytes(RandomStream_impl & impl,unsigned char * res,long n,long pos)3407 RandomStream_impl_get_bytes(RandomStream_impl& impl,
3408    unsigned char *res, long n, long pos)
3409 {
3410    return impl.get_bytes(res, n, pos);
3411 }
3412 
3413 
3414 void
RandomStream_impl_set_nonce(RandomStream_impl & impl,unsigned long nonce)3415 RandomStream_impl_set_nonce(RandomStream_impl& impl, unsigned long nonce)
3416 {
3417    impl.set_nonce(nonce);
3418 }
3419 
3420 
3421 
3422 
3423 
3424 NTL_TLS_GLOBAL_DECL(UniquePtr<RandomStream>,  CurrentRandomStream);
3425 
3426 
SetSeed(const RandomStream & s)3427 void SetSeed(const RandomStream& s)
3428 {
3429    NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
3430 
3431    if (!CurrentRandomStream)
3432       CurrentRandomStream.make(s);
3433    else
3434       *CurrentRandomStream = s;
3435 }
3436 
3437 
SetSeed(const unsigned char * data,long dlen)3438 void SetSeed(const unsigned char *data, long dlen)
3439 {
3440    if (dlen < 0) LogicError("SetSeed: bad args");
3441 
3442    Vec<unsigned char> key;
3443    key.SetLength(NTL_PRG_KEYLEN);
3444    DeriveKey(key.elts(), NTL_PRG_KEYLEN, data, dlen);
3445 
3446    SetSeed(RandomStream(key.elts()));
3447 }
3448 
SetSeed(const ZZ & seed)3449 void SetSeed(const ZZ& seed)
3450 {
3451    long nb = NumBytes(seed);
3452 
3453    Vec<unsigned char> buf;
3454    buf.SetLength(nb);
3455 
3456    BytesFromZZ(buf.elts(), seed, nb);
3457 
3458    SetSeed(buf.elts(), nb);
3459 }
3460 
3461 
3462 static
InitRandomStream()3463 void InitRandomStream()
3464 {
3465    const std::string& id = UniqueID();
3466    SetSeed((const unsigned char *) id.c_str(), id.length());
3467 }
3468 
3469 static inline
LocalGetCurrentRandomStream()3470 RandomStream& LocalGetCurrentRandomStream()
3471 {
3472    NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
3473 
3474    if (!CurrentRandomStream) InitRandomStream();
3475    return *CurrentRandomStream;
3476 }
3477 
GetCurrentRandomStream()3478 RandomStream& GetCurrentRandomStream()
3479 {
3480    return LocalGetCurrentRandomStream();
3481 }
3482 
3483 
3484 
3485 
3486 
3487 
3488 
3489 static inline
WordFromBytes(const unsigned char * buf,long n)3490 unsigned long WordFromBytes(const unsigned char *buf, long n)
3491 {
3492    unsigned long res = 0;
3493    long i;
3494 
3495    for (i = n-1; i >= 0; i--)
3496       res = (res << 8) | buf[i];
3497 
3498    return res;
3499 }
3500 
3501 
RandomWord()3502 unsigned long RandomWord()
3503 {
3504    RandomStream& stream = LocalGetCurrentRandomStream();
3505    unsigned char buf[NTL_BITS_PER_LONG/8];
3506 
3507    stream.get(buf, NTL_BITS_PER_LONG/8);
3508    return WordFromBytes(buf, NTL_BITS_PER_LONG/8);
3509 }
3510 
3511 
VectorRandomWord(long k,unsigned long * x)3512 void VectorRandomWord(long k, unsigned long* x)
3513 {
3514    RandomStream& stream = LocalGetCurrentRandomStream();
3515    unsigned char buf[NTL_BITS_PER_LONG/8];
3516 
3517    for (long i = 0; i < k; i++) {
3518       stream.get(buf, NTL_BITS_PER_LONG/8);
3519       x[i] = WordFromBytes(buf, NTL_BITS_PER_LONG/8);
3520    }
3521 }
3522 
RandomBits_long(long l)3523 long RandomBits_long(long l)
3524 {
3525    if (l <= 0) return 0;
3526    if (l >= NTL_BITS_PER_LONG)
3527       ResourceError("RandomBits: length too big");
3528 
3529    RandomStream& stream = LocalGetCurrentRandomStream();
3530    unsigned char buf[NTL_BITS_PER_LONG/8];
3531    long nb = (l+7)/8;
3532    stream.get(buf, nb);
3533 
3534    return long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
3535 }
3536 
RandomBits_ulong(long l)3537 unsigned long RandomBits_ulong(long l)
3538 {
3539    if (l <= 0) return 0;
3540    if (l > NTL_BITS_PER_LONG)
3541       ResourceError("RandomBits: length too big");
3542 
3543    RandomStream& stream = LocalGetCurrentRandomStream();
3544    unsigned char buf[NTL_BITS_PER_LONG/8];
3545    long nb = (l+7)/8;
3546    stream.get(buf, nb);
3547    unsigned long res = WordFromBytes(buf, nb);
3548    if (l < NTL_BITS_PER_LONG)
3549       res = res & ((1UL << l)-1UL);
3550    return res;
3551 }
3552 
RandomLen_long(long l)3553 long RandomLen_long(long l)
3554 {
3555    if (l <= 0) return 0;
3556    if (l == 1) return 1;
3557    if (l >= NTL_BITS_PER_LONG)
3558       ResourceError("RandomLen: length too big");
3559 
3560    RandomStream& stream = LocalGetCurrentRandomStream();
3561    unsigned char buf[NTL_BITS_PER_LONG/8];
3562    long nb = ((l-1)+7)/8;
3563    stream.get(buf, nb);
3564    unsigned long res = WordFromBytes(buf, nb);
3565    unsigned long mask = (1UL << (l-1)) - 1UL;
3566    return long((res & mask) | (mask+1UL));
3567 }
3568 
3569 
RandomBnd(long bnd)3570 long RandomBnd(long bnd)
3571 {
3572    if (bnd <= 1) return 0;
3573 
3574    RandomStream& stream = LocalGetCurrentRandomStream();
3575    unsigned char buf[NTL_BITS_PER_LONG/8];
3576    long l = NumBits(bnd-1);
3577    long nb = (l+7)/8;
3578 
3579    long tmp;
3580    do {
3581       stream.get(buf, nb);
3582       tmp = long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
3583    } while (tmp >= bnd);
3584 
3585    return tmp;
3586 }
3587 
3588 
3589 
RandomBits(ZZ & x,long l)3590 void RandomBits(ZZ& x, long l)
3591 {
3592    if (l <= 0) {
3593       x = 0;
3594       return;
3595    }
3596 
3597    if (NTL_OVERFLOW(l, 1, 0))
3598       ResourceError("RandomBits: length too big");
3599 
3600    RandomStream& stream = LocalGetCurrentRandomStream();
3601 
3602    long nb = (l+7)/8;
3603    unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
3604 
3605    NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3606    Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3607 
3608    buf_mem.SetLength(nb);
3609    unsigned char *buf = buf_mem.elts();
3610 
3611    x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3612    // pre-allocate to ensure strong ES
3613 
3614    stream.get(buf, nb);
3615    buf[nb-1] &= mask;
3616 
3617    ZZFromBytes(x, buf, nb);
3618 }
3619 
3620 
RandomLen(ZZ & x,long l)3621 void RandomLen(ZZ& x, long l)
3622 {
3623    if (l <= 0) {
3624       x = 0;
3625       return;
3626    }
3627 
3628    if (l == 1) {
3629       x = 1;
3630       return;
3631    }
3632 
3633    if (NTL_OVERFLOW(l, 1, 0))
3634       ResourceError("RandomLen: length too big");
3635 
3636    RandomStream& stream = LocalGetCurrentRandomStream();
3637 
3638    long nb = (l+7)/8;
3639    unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
3640 
3641    NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3642    Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3643 
3644    buf_mem.SetLength(nb);
3645    unsigned char *buf = buf_mem.elts();
3646 
3647    x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3648    // pre-allocate to ensure strong ES
3649 
3650    stream.get(buf, nb);
3651    buf[nb-1] &= mask;
3652    buf[nb-1] |= ((mask >> 1) + 1UL);
3653 
3654    ZZFromBytes(x, buf, nb);
3655 }
3656 
3657 
3658 
3659 
3660 
3661 /**********************************************************
3662 
3663 The following implementation of RandomBnd is designed
3664 for speed.  It certainly is not resilient against a
3665 timing side-channel attack (but then again, none of these
3666 PRG routines are designed to be).
3667 
3668 The naive strategy generates random candidates of the right
3669 bit length until the candidate < bnd.
3670 The idea in this implementation is to generate the high
3671 order two bytes of the candidate first, and compare this
3672 to the high order two bytes of tmp.  We can discard the
3673 candidate if this is already too large.
3674 
3675 ***********************************************************/
3676 
RandomBnd(ZZ & x,const ZZ & bnd)3677 void RandomBnd(ZZ& x, const ZZ& bnd)
3678 {
3679    if (bnd <= 1) {
3680       x = 0;
3681       return;
3682    }
3683 
3684    RandomStream& stream = LocalGetCurrentRandomStream();
3685 
3686    long l = NumBits(bnd);
3687    long nb = (l+7)/8;
3688 
3689    if (nb <= 3) {
3690       long lbnd = conv<long>(bnd);
3691       unsigned char lbuf[3];
3692       long ltmp;
3693 
3694       x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3695       // pre-allocate to ensure strong ES
3696       do {
3697          stream.get(lbuf, nb);
3698          ltmp = long(WordFromBytes(lbuf, nb) & ((1UL << l)-1UL));
3699       } while (ltmp >= lbnd);
3700 
3701      conv(x, ltmp);
3702      return;
3703    }
3704 
3705    // deal with possible alias
3706    NTL_ZZRegister(tmp_store);
3707    const ZZ& bnd_ref = ((&x == &bnd) ? (tmp_store = bnd) : bnd);
3708 
3709 
3710    NTL_ZZRegister(hbnd);
3711    RightShift(hbnd, bnd_ref, (nb-2)*8);
3712    long lhbnd = conv<long>(hbnd);
3713 
3714    unsigned long mask = (1UL << (16 - nb*8 + l)) - 1UL;
3715 
3716    NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3717    Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3718    buf_mem.SetLength(nb);
3719    unsigned char *buf = buf_mem.elts();
3720 
3721    unsigned char hbuf[2];
3722 
3723    x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3724    // pre-allocate to ensure strong ES
3725    for (;;) {
3726       stream.get(hbuf, 2);
3727       long hpart = long(WordFromBytes(hbuf, 2) & mask);
3728 
3729       if (hpart > lhbnd) continue;
3730 
3731       stream.get(buf, nb-2);
3732       buf[nb-2] = ((unsigned long) hpart);
3733       buf[nb-1] = ((unsigned long) hpart) >> 8;
3734 
3735       ZZFromBytes(x, buf, nb);
3736       if (hpart < lhbnd || x < bnd_ref) break;
3737    }
3738 }
3739 
3740 
3741 
3742 
3743 // More prime generation stuff...
3744 
3745 static
Log2(double x)3746 double Log2(double x)
3747 {
3748    static const double log2 = log(2.0); // GLOBAL (relies on C++11 thread-safe init)
3749    return log(x)/log2;
3750 }
3751 
3752 // Define p(k,t) to be the conditional probability that a random, odd, k-bit
3753 // number is composite, given that it passes t iterations of the
3754 // Miller-Rabin test.
3755 // If this routine returns a non-zero value, then
3756 //    p(k,t) <= 2^{-n}.
3757 // This basically encodes the estimates of Damgard, Landrock, and Pomerance;
3758 // it uses floating point arithmetic, but is coded in such a way
3759 // that its results should be correct, assuming that the log function
3760 // is computed with reasonable precision.
3761 //
3762 // It is assumed that k >= 3 and t >= 1; if this does not hold,
3763 // then 0 is returned.
3764 
3765 static
ErrBoundTest(long kk,long tt,long nn)3766 long ErrBoundTest(long kk, long tt, long nn)
3767 
3768 {
3769    const double fudge = (1.0 + 1024.0/NTL_FDOUBLE_PRECISION);
3770    const double log2_3 = Log2(3.0);
3771    const double log2_7 = Log2(7.0);
3772    const double log2_20 = Log2(20.0);
3773 
3774    double k = kk;
3775    double t = tt;
3776    double n = nn;
3777 
3778    if (k < 3 || t < 1) return 0;
3779    if (n < 1) return 1;
3780 
3781    // the following test is largely academic
3782    if (9*t > NTL_FDOUBLE_PRECISION) LogicError("ErrBoundTest: t too big");
3783 
3784    double log2_k = Log2(k);
3785 
3786    if ((n + log2_k)*fudge <= 2*t)
3787       return 1;
3788 
3789    if ((2*log2_k + 4.0 + n)*fudge <= 2*sqrt(k))
3790       return 2;
3791 
3792    if ((t == 2 && k >= 88) || (3 <= t && 9*t <= k && k >= 21)) {
3793       if ((1.5*log2_k + t + 4.0 + n)*fudge <= 0.5*Log2(t) + 2*(sqrt(t*k)))
3794          return 3;
3795    }
3796 
3797    if (k <= 9*t && 4*t <= k && k >= 21) {
3798       if ( ((log2_3 + log2_7 + log2_k + n)*fudge <= log2_20 + 5*t)  &&
3799            ((log2_3 + (15.0/4.0)*log2_k + n)*fudge <= log2_7 + k/2 + 2*t) &&
3800            ((2*log2_3 + 2 + log2_k + n)*fudge <= k/4 + 3*t) )
3801          return 4;
3802    }
3803 
3804    if (4*t >= k && k >= 21) {
3805       if (((15.0/4.0)*log2_k + n)*fudge <= log2_7 + k/2 + 2*t)
3806          return 5;
3807    }
3808 
3809    return 0;
3810 }
3811 
3812 
GenPrime(ZZ & n,long k,long err)3813 void GenPrime(ZZ& n, long k, long err)
3814 {
3815    if (k <= 1) LogicError("GenPrime: bad length");
3816 
3817    if (k > (1L << 20)) ResourceError("GenPrime: length too large");
3818 
3819    if (err < 1) err = 1;
3820    if (err > 512) err = 512;
3821 
3822    if (k == 2) {
3823       if (RandomBnd(2))
3824          n = 3;
3825       else
3826          n = 2;
3827 
3828       return;
3829    }
3830 
3831 
3832    long t;
3833 
3834    t = 1;
3835    while (!ErrBoundTest(k, t, err))
3836       t++;
3837 
3838    RandomPrime(n, k, t);
3839 }
3840 
3841 
GenPrime_long(long k,long err)3842 long GenPrime_long(long k, long err)
3843 {
3844    if (k <= 1) LogicError("GenPrime: bad length");
3845 
3846    if (k >= NTL_BITS_PER_LONG) ResourceError("GenPrime: length too large");
3847 
3848    if (err < 1) err = 1;
3849    if (err > 512) err = 512;
3850 
3851    if (k == 2) {
3852       if (RandomBnd(2))
3853          return 3;
3854       else
3855          return 2;
3856    }
3857 
3858    long t;
3859 
3860    t = 1;
3861    while (!ErrBoundTest(k, t, err))
3862       t++;
3863 
3864    return RandomPrime_long(k, t);
3865 }
3866 
MultiThreadedGenGermainPrime(ZZ & n,long k,long err)3867 void MultiThreadedGenGermainPrime(ZZ& n, long k, long err)
3868 {
3869    long nt = AvailableThreads();
3870 
3871 
3872    long prime_bnd = ComputePrimeBound(k);
3873 
3874    if (NumBits(prime_bnd) >= k/2)
3875       prime_bnd = (1L << (k/2-1));
3876 
3877    ZZ two;
3878    two = 2;
3879 
3880    const long LOCAL_ITER_BOUND = 8;
3881    // since resetting the PRG comes at a certain cost,
3882    // we perform a few iterations with each reset to
3883    // amortize the reset cost.
3884 
3885    unsigned long initial_counter = 0;
3886    ZZ seed;
3887    RandomBits(seed, 256);
3888 
3889 
3890    ZZ overflow_counter;
3891 
3892    for (;;) {
3893 
3894       AtomicLowWaterMark low_water_mark(-1UL);
3895       AtomicCounter counter(initial_counter);
3896 
3897       Vec< UniquePtr<ZZ> > result(INIT_SIZE, nt);
3898       Vec<unsigned long> result_ctr(INIT_SIZE, nt, -1UL);
3899 
3900       NTL_EXEC_INDEX(nt, index)
3901 
3902          RandomStreamPush push;
3903 
3904 	 SetSeed(seed);
3905 	 RandomStream& stream = GetCurrentRandomStream();
3906 
3907 	 ZZ cand, n1;
3908          PrimeSeq s;
3909 
3910 	 while (low_water_mark == -1UL) {
3911 
3912 	    unsigned long local_ctr = counter.inc();
3913             if (local_ctr >> (NTL_BITS_PER_NONCE-1)) {
3914                // counter overflow...rather academic
3915                break;
3916             }
3917 
3918 	    stream.set_nonce(local_ctr);
3919 
3920 	    for (long iter = 0; iter < LOCAL_ITER_BOUND &&
3921 				local_ctr <= low_water_mark; iter++) {
3922 
3923 
3924 	       RandomLen(cand, k);
3925 	       if (!IsOdd(cand)) add(cand, cand, 1);
3926 
3927 	       s.reset(3);
3928 	       long p;
3929 
3930 	       long sieve_passed = 1;
3931 
3932 	       p = s.next();
3933 	       while (p && p < prime_bnd) {
3934 		  long r = rem(cand, p);
3935 
3936 		  if (r == 0) {
3937 		     sieve_passed = 0;
3938 		     break;
3939 		  }
3940 
3941 		  // test if 2*r + 1 = 0 (mod p)
3942 		  if (r == p-r-1) {
3943 		     sieve_passed = 0;
3944 		     break;
3945 		  }
3946 
3947 		  p = s.next();
3948 	       }
3949 
3950                if (!sieve_passed) continue;
3951 
3952 
3953                if (MillerWitness(cand, two)) continue;
3954 
3955 	       // n1 = 2*cand+1
3956 	       mul(n1, cand, 2);
3957 	       add(n1, n1, 1);
3958 
3959 
3960                if (MillerWitness(n1, two)) continue;
3961 
3962 	       result[index].make(cand);
3963 	       result_ctr[index] = local_ctr;
3964 	       low_water_mark.UpdateMin(local_ctr);
3965 	       break;
3966             }
3967          }
3968 
3969       NTL_EXEC_INDEX_END
3970 
3971       // find index of low_water_mark
3972 
3973       unsigned long low_water_mark1 = low_water_mark;
3974       long low_water_index = -1;
3975 
3976       for (long index = 0; index < nt; index++) {
3977 	 if (result_ctr[index] == low_water_mark1) {
3978 	    low_water_index = index;
3979 	    break;
3980 	 }
3981       }
3982 
3983       if (low_water_index == -1) {
3984          // counter overflow...rather academic
3985          overflow_counter++;
3986          initial_counter = 0;
3987          RandomBits(seed, 256);
3988          continue;
3989       }
3990 
3991       ZZ N;
3992       N = *result[low_water_index];
3993 
3994       ZZ iter = ((overflow_counter << (NTL_BITS_PER_NONCE-1)) +
3995                  conv<ZZ>(low_water_mark1) + 1)*LOCAL_ITER_BOUND;
3996 
3997       // now do t M-R iterations...just to make sure
3998 
3999       // First compute the appropriate number of M-R iterations, t
4000       // The following computes t such that
4001       //       p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4002       // which suffices to get an overall error probability of 2^{-err}.
4003       // Note that this method has the advantage of not requiring
4004       // any assumptions on the density of Germain primes.
4005 
4006       long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4007       long t;
4008       t = 1;
4009       while (!ErrBoundTest(k, t, err1))
4010          t++;
4011 
4012       Vec<ZZ> W(INIT_SIZE, t);
4013 
4014       for (long i = 0; i < t; i++) {
4015          do {
4016             RandomBnd(W[i], N);
4017          } while (W[i] == 0);
4018       }
4019 
4020       AtomicBool tests_pass(true);
4021 
4022       NTL_EXEC_RANGE(t, first, last)
4023 
4024          for (long i = first; i < last && tests_pass; i++) {
4025             if (MillerWitness(N, W[i])) tests_pass = false;
4026          }
4027 
4028       NTL_EXEC_RANGE_END
4029 
4030       if (tests_pass) {
4031          n = N;
4032          return;
4033       }
4034 
4035       // very unlikey to get here
4036       initial_counter = low_water_mark1 + 1;
4037    }
4038 }
4039 
GenGermainPrime(ZZ & n,long k,long err)4040 void GenGermainPrime(ZZ& n, long k, long err)
4041 {
4042    if (k <= 1) LogicError("GenGermainPrime: bad length");
4043 
4044    if (k > (1L << 20)) ResourceError("GenGermainPrime: length too large");
4045 
4046    if (err < 1) err = 1;
4047    if (err > 512) err = 512;
4048 
4049    if (k == 2) {
4050       if (RandomBnd(2))
4051          n = 3;
4052       else
4053          n = 2;
4054 
4055       return;
4056    }
4057 
4058    if (k >= 192) {
4059       MultiThreadedGenGermainPrime(n, k, err);
4060       return;
4061    }
4062 
4063 
4064    long prime_bnd = ComputePrimeBound(k);
4065 
4066    if (NumBits(prime_bnd) >= k/2)
4067       prime_bnd = (1L << (k/2-1));
4068 
4069 
4070    ZZ two;
4071    two = 2;
4072 
4073    ZZ n1;
4074 
4075 
4076    PrimeSeq s;
4077 
4078    ZZ iter;
4079    iter = 0;
4080 
4081 
4082    for (;;) {
4083       iter++;
4084 
4085       RandomLen(n, k);
4086       if (!IsOdd(n)) add(n, n, 1);
4087 
4088       s.reset(3);
4089       long p;
4090 
4091       long sieve_passed = 1;
4092 
4093       p = s.next();
4094       while (p && p < prime_bnd) {
4095          long r = rem(n, p);
4096 
4097          if (r == 0) {
4098             sieve_passed = 0;
4099             break;
4100          }
4101 
4102          // test if 2*r + 1 = 0 (mod p)
4103          if (r == p-r-1) {
4104             sieve_passed = 0;
4105             break;
4106          }
4107 
4108          p = s.next();
4109       }
4110 
4111       if (!sieve_passed) continue;
4112 
4113 
4114       if (MillerWitness(n, two)) continue;
4115 
4116       // n1 = 2*n+1
4117       mul(n1, n, 2);
4118       add(n1, n1, 1);
4119 
4120 
4121       if (MillerWitness(n1, two)) continue;
4122 
4123       // now do t M-R iterations...just to make sure
4124 
4125       // First compute the appropriate number of M-R iterations, t
4126       // The following computes t such that
4127       //       p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4128       // which suffices to get an overall error probability of 2^{-err}.
4129       // Note that this method has the advantage of not requiring
4130       // any assumptions on the density of Germain primes.
4131 
4132       long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4133       long t;
4134       t = 1;
4135       while (!ErrBoundTest(k, t, err1))
4136          t++;
4137 
4138       ZZ W;
4139       long MR_passed = 1;
4140 
4141       long i;
4142       for (i = 1; i <= t; i++) {
4143          do {
4144             RandomBnd(W, n);
4145          } while (W == 0);
4146          // W == 0 is not a useful candidate witness!
4147 
4148          if (MillerWitness(n, W)) {
4149             MR_passed = 0;
4150             break;
4151          }
4152       }
4153 
4154       if (MR_passed) break;
4155    }
4156 }
4157 
OldGenGermainPrime(ZZ & n,long k,long err)4158 void OldGenGermainPrime(ZZ& n, long k, long err)
4159 {
4160    if (k <= 1) LogicError("GenGermainPrime: bad length");
4161 
4162    if (k > (1L << 20)) ResourceError("GenGermainPrime: length too large");
4163 
4164    if (err < 1) err = 1;
4165    if (err > 512) err = 512;
4166 
4167    if (k == 2) {
4168       if (RandomBnd(2))
4169          n = 3;
4170       else
4171          n = 2;
4172 
4173       return;
4174    }
4175 
4176 
4177    long prime_bnd = ComputePrimeBound(k);
4178 
4179    if (NumBits(prime_bnd) >= k/2)
4180       prime_bnd = (1L << (k/2-1));
4181 
4182 
4183    ZZ two;
4184    two = 2;
4185 
4186    ZZ n1;
4187 
4188 
4189    PrimeSeq s;
4190 
4191    ZZ iter;
4192    iter = 0;
4193 
4194 
4195    for (;;) {
4196       iter++;
4197 
4198       RandomLen(n, k);
4199       if (!IsOdd(n)) add(n, n, 1);
4200 
4201       s.reset(3);
4202       long p;
4203 
4204       long sieve_passed = 1;
4205 
4206       p = s.next();
4207       while (p && p < prime_bnd) {
4208          long r = rem(n, p);
4209 
4210          if (r == 0) {
4211             sieve_passed = 0;
4212             break;
4213          }
4214 
4215          // test if 2*r + 1 = 0 (mod p)
4216          if (r == p-r-1) {
4217             sieve_passed = 0;
4218             break;
4219          }
4220 
4221          p = s.next();
4222       }
4223 
4224       if (!sieve_passed) continue;
4225 
4226 
4227       if (MillerWitness(n, two)) continue;
4228 
4229       // n1 = 2*n+1
4230       mul(n1, n, 2);
4231       add(n1, n1, 1);
4232 
4233 
4234       if (MillerWitness(n1, two)) continue;
4235 
4236       // now do t M-R iterations...just to make sure
4237 
4238       // First compute the appropriate number of M-R iterations, t
4239       // The following computes t such that
4240       //       p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4241       // which suffices to get an overall error probability of 2^{-err}.
4242       // Note that this method has the advantage of not requiring
4243       // any assumptions on the density of Germain primes.
4244 
4245       long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4246       long t;
4247       t = 1;
4248       while (!ErrBoundTest(k, t, err1))
4249          t++;
4250 
4251       ZZ W;
4252       long MR_passed = 1;
4253 
4254       long i;
4255       for (i = 1; i <= t; i++) {
4256          do {
4257             RandomBnd(W, n);
4258          } while (W == 0);
4259          // W == 0 is not a useful candidate witness!
4260 
4261          if (MillerWitness(n, W)) {
4262             MR_passed = 0;
4263             break;
4264          }
4265       }
4266 
4267       if (MR_passed) break;
4268    }
4269 }
4270 
GenGermainPrime_long(long k,long err)4271 long GenGermainPrime_long(long k, long err)
4272 {
4273    if (k >= NTL_BITS_PER_LONG-1)
4274       ResourceError("GenGermainPrime_long: length too long");
4275 
4276    ZZ n;
4277    GenGermainPrime(n, k, err);
4278    return to_long(n);
4279 }
4280 
4281 
4282 NTL_END_IMPL
4283