1 /*
2    Copyright (c) 2005, 2012, Oracle and/or its affiliates
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; see the file COPYING. If not, write to the
15    Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
16    MA  02110-1335  USA.
17 */
18 
19 
20 
21 /* based on Wei Dai's integer.cpp from CryptoPP */
22 
23 #include "runtime.hpp"
24 #include "integer.hpp"
25 #include "modarith.hpp"
26 #include "asn.hpp"
27 
28 
29 
30 #ifdef __DECCXX
31     #include <c_asm.h>  // for asm overflow assembly
32 #endif
33 
34 #if defined(_M_X64) || defined(_M_IA64)
35     #include <intrin.h>
36 #pragma intrinsic(_umul128)
37 #endif
38 
39 
40 #ifdef __GNUC__
41     #include <signal.h>
42     #include <setjmp.h>
43 #endif
44 
45 
46 #ifdef SSE2_INTRINSICS_AVAILABLE
47     #ifdef __GNUC__
48         #include <xmmintrin.h>
49         #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
50             #include <malloc.h>
51         #else
52             #include <stdlib.h>
53         #endif
54     #else
55         #include <emmintrin.h>
56     #endif
57 #elif defined(_MSC_VER) && defined(_M_IX86)
58 /*    #pragma message("You do not seem to have the Visual C++ Processor Pack ")
59     #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
60 */
61 #elif defined(__GNUC__) && defined(__i386__)
62 /*   #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \
63              compiler option. Use of SSE2 intrinsics will be disabled.
64 */
65 #endif
66 
67 
68 namespace TaoCrypt {
69 
70 
71 #ifdef SSE2_INTRINSICS_AVAILABLE
72 
73 template <class T>
allocate(size_type n,const void *)74 CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(
75                                            size_type n, const void *)
76 {
77     if (n > this->max_size())
78         return 0;
79     if (n == 0)
80         return 0;
81     if (n >= 4)
82     {
83         void* p;
84     #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
85         p = _mm_malloc(sizeof(T)*n, 16);
86     #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
87         p = memalign(16, sizeof(T)*n);
88     #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
89         p = malloc(sizeof(T)*n);
90     #else
91         p = (byte *)malloc(sizeof(T)*n + 8);
92         // assume malloc alignment is at least 8
93     #endif
94 
95     #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
96         m_pBlock = p;
97         if (!IsAlignedOn(p, 16))
98         {
99             p = (byte *)p + 8;
100         }
101     #endif
102 
103         return (T*)p;
104     }
105     return NEW_TC T[n];
106 }
107 
108 
109 template <class T>
deallocate(void * p,size_type n)110 void AlignedAllocator<T>::deallocate(void* p, size_type n)
111 {
112     memset(p, 0, n*sizeof(T));
113     if (n >= 4)
114     {
115         #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
116             _mm_free(p);
117         #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
118             free(m_pBlock);
119             m_pBlock = 0;
120         #else
121             free(p);
122         #endif
123     }
124     else
125         tcArrayDelete((T *)p);
126 }
127 
128 #endif  // SSE2
129 
130 
131 // ********  start of integer needs
132 
133 // start 5.2.1 adds DWord and Word ********
134 
135 // ********************************************************
136 
137 class DWord {
138 public:
DWord()139 DWord() {}
140 
141 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
DWord(word low)142     explicit DWord(word low)
143     {
144         whole_ = low;
145     }
146 #else
DWord(word low)147     explicit DWord(word low)
148     {
149         halfs_.low = low;
150         halfs_.high = 0;
151     }
152 #endif
153 
DWord(word low,word high)154     DWord(word low, word high)
155     {
156         halfs_.low = low;
157         halfs_.high = high;
158     }
159 
Multiply(word a,word b)160     static DWord Multiply(word a, word b)
161     {
162         DWord r;
163 
164         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
165             r.whole_ = (dword)a * b;
166 
167         #elif defined(_M_X64) || defined(_M_IA64)
168             r.halfs_.low = _umul128(a, b, &r.halfs_.high);
169 
170         #elif defined(__alpha__)
171             r.halfs_.low = a*b;
172             #ifdef __GNUC__
173                 __asm__("umulh %1,%2,%0" : "=r" (r.halfs_.high)
174                     : "r" (a), "r" (b));
175             #elif defined(__DECCXX)
176                 r.halfs_.high = asm("umulh %a0, %a1, %v0", a, b);
177             #else
178                 #error unknown alpha compiler
179             #endif
180 
181         #elif defined(__ia64__)
182             r.halfs_.low = a*b;
183             __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high)
184                 : "f" (a), "f" (b));
185 
186         #elif defined(_ARCH_PPC64)
187             r.halfs_.low = a*b;
188             __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high)
189                 : "r" (a), "r" (b) : "cc");
190 
191         #elif defined(__x86_64__)
192             __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) :
193                 "a" (a), "rm" (b) : "cc");
194 
195         #elif defined(__mips64)
196             unsigned __int128 t = (unsigned __int128) a * b;
197             r.halfs_.high = t >> 64;
198             r.halfs_.low = (word) t;
199 
200         #elif defined(_M_IX86)
201             // for testing
202             word64 t = (word64)a * b;
203             r.halfs_.high = ((word32 *)(&t))[1];
204             r.halfs_.low = (word32)t;
205         #else
206             #error can not implement DWord
207         #endif
208 
209         return r;
210     }
211 
MultiplyAndAdd(word a,word b,word c)212     static DWord MultiplyAndAdd(word a, word b, word c)
213     {
214         DWord r = Multiply(a, b);
215         return r += c;
216     }
217 
operator +=(word a)218     DWord & operator+=(word a)
219     {
220         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
221             whole_ = whole_ + a;
222         #else
223             halfs_.low += a;
224             halfs_.high += (halfs_.low < a);
225         #endif
226         return *this;
227     }
228 
operator +(word a)229     DWord operator+(word a)
230     {
231         DWord r;
232         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
233             r.whole_ = whole_ + a;
234         #else
235             r.halfs_.low = halfs_.low + a;
236             r.halfs_.high = halfs_.high + (r.halfs_.low < a);
237         #endif
238         return r;
239     }
240 
operator -(DWord a)241     DWord operator-(DWord a)
242     {
243         DWord r;
244         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
245             r.whole_ = whole_ - a.whole_;
246         #else
247             r.halfs_.low = halfs_.low - a.halfs_.low;
248             r.halfs_.high = halfs_.high - a.halfs_.high -
249                              (r.halfs_.low > halfs_.low);
250         #endif
251         return r;
252     }
253 
operator -(word a)254     DWord operator-(word a)
255     {
256         DWord r;
257         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
258             r.whole_ = whole_ - a;
259         #else
260             r.halfs_.low = halfs_.low - a;
261             r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
262         #endif
263         return r;
264     }
265 
266     // returns quotient, which must fit in a word
267     word operator/(word divisor);
268 
269     word operator%(word a);
270 
operator !() const271     bool operator!() const
272     {
273     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
274         return !whole_;
275     #else
276         return !halfs_.high && !halfs_.low;
277     #endif
278     }
279 
GetLowHalf() const280     word GetLowHalf() const {return halfs_.low;}
GetHighHalf() const281     word GetHighHalf() const {return halfs_.high;}
GetHighHalfAsBorrow() const282     word GetHighHalfAsBorrow() const {return 0-halfs_.high;}
283 
284 private:
285     union
286     {
287     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
288         dword whole_;
289     #endif
290         struct
291         {
292         #ifdef LITTLE_ENDIAN_ORDER
293             word low;
294             word high;
295         #else
296             word high;
297             word low;
298         #endif
299         } halfs_;
300     };
301 };
302 
303 
304 class Word {
305 public:
Word()306     Word() {}
307 
Word(word value)308     Word(word value)
309     {
310         whole_ = value;
311     }
312 
Word(hword low,hword high)313     Word(hword low, hword high)
314     {
315         whole_ = low | (word(high) << (WORD_BITS/2));
316     }
317 
Multiply(hword a,hword b)318     static Word Multiply(hword a, hword b)
319     {
320         Word r;
321         r.whole_ = (word)a * b;
322         return r;
323     }
324 
operator -(Word a)325     Word operator-(Word a)
326     {
327         Word r;
328         r.whole_ = whole_ - a.whole_;
329         return r;
330     }
331 
operator -(hword a)332     Word operator-(hword a)
333     {
334         Word r;
335         r.whole_ = whole_ - a;
336         return r;
337     }
338 
339     // returns quotient, which must fit in a word
operator /(hword divisor)340     hword operator/(hword divisor)
341     {
342         return hword(whole_ / divisor);
343     }
344 
operator !() const345     bool operator!() const
346     {
347         return !whole_;
348     }
349 
GetWhole() const350     word GetWhole() const {return whole_;}
GetLowHalf() const351     hword GetLowHalf() const {return hword(whole_);}
GetHighHalf() const352     hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));}
GetHighHalfAsBorrow() const353     hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));}
354 
355 private:
356     word whole_;
357 };
358 
359 
360 // dummy is VC60 compiler bug workaround
361 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
362 template <class S, class D>
363 S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0)
364 {
365     // estimate the quotient: do a 2 S by 1 S divide
366     S Q;
367     if (S(B1+1) == 0)
368         Q = A[2];
369     else
370         Q = D(A[1], A[2]) / S(B1+1);
371 
372     // now subtract Q*B from A
373     D p = D::Multiply(B0, Q);
374     D u = (D) A[0] - p.GetLowHalf();
375     A[0] = u.GetLowHalf();
376     u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() -
377             D::Multiply(B1, Q);
378     A[1] = u.GetLowHalf();
379     A[2] += u.GetHighHalf();
380 
381     // Q <= actual quotient, so fix it
382     while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
383     {
384         u = (D) A[0] - B0;
385         A[0] = u.GetLowHalf();
386         u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
387         A[1] = u.GetLowHalf();
388         A[2] += u.GetHighHalf();
389         Q++;
390     }
391 
392     return Q;
393 }
394 
395 
396 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
397 template <class S, class D>
DivideFourWordsByTwo(S * T,const D & Al,const D & Ah,const D & B)398 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
399 {
400     if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
401         return D(Ah.GetLowHalf(), Ah.GetHighHalf());
402     else
403     {
404         S Q[2];
405         T[0] = Al.GetLowHalf();
406         T[1] = Al.GetHighHalf();
407         T[2] = Ah.GetLowHalf();
408         T[3] = Ah.GetHighHalf();
409         Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
410                                                 B.GetHighHalf());
411         Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
412         return D(Q[0], Q[1]);
413     }
414 }
415 
416 
417 // returns quotient, which must fit in a word
operator /(word a)418 inline word DWord::operator/(word a)
419 {
420     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
421         return word(whole_ / a);
422     #else
423         hword r[4];
424         return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
425                                                     halfs_.high, a).GetWhole();
426     #endif
427 }
428 
operator %(word a)429 inline word DWord::operator%(word a)
430 {
431     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
432         return word(whole_ % a);
433     #else
434         if (a < (word(1) << (WORD_BITS/2)))
435         {
436             hword h = hword(a);
437             word r = halfs_.high % h;
438             r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
439             return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
440         }
441         else
442         {
443             hword r[4];
444             DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
445             return Word(r[0], r[1]).GetWhole();
446         }
447     #endif
448 }
449 
450 
451 
452 // end 5.2.1 DWord and Word adds
453 
454 
455 
456 
457 
458 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
459 
RoundupSize(unsigned int n)460 static inline unsigned int RoundupSize(unsigned int n)
461 {
462     if (n<=8)
463         return RoundupSizeTable[n];
464     else if (n<=16)
465         return 16;
466     else if (n<=32)
467         return 32;
468     else if (n<=64)
469         return 64;
470     else return 1U << BitPrecision(n-1);
471 }
472 
473 
Compare(const word * A,const word * B,unsigned int N)474 static int Compare(const word *A, const word *B, unsigned int N)
475 {
476     while (N--)
477         if (A[N] > B[N])
478             return 1;
479         else if (A[N] < B[N])
480             return -1;
481 
482     return 0;
483 }
484 
Increment(word * A,unsigned int N,word B=1)485 static word Increment(word *A, unsigned int N, word B=1)
486 {
487     word t = A[0];
488     A[0] = t+B;
489     if (A[0] >= t)
490         return 0;
491     for (unsigned i=1; i<N; i++)
492         if (++A[i])
493             return 0;
494     return 1;
495 }
496 
Decrement(word * A,unsigned int N,word B=1)497 static word Decrement(word *A, unsigned int N, word B=1)
498 {
499     word t = A[0];
500     A[0] = t-B;
501     if (A[0] <= t)
502         return 0;
503     for (unsigned i=1; i<N; i++)
504         if (A[i]--)
505             return 0;
506     return 1;
507 }
508 
TwosComplement(word * A,unsigned int N)509 static void TwosComplement(word *A, unsigned int N)
510 {
511     Decrement(A, N);
512     for (unsigned i=0; i<N; i++)
513         A[i] = ~A[i];
514 }
515 
516 
LinearMultiply(word * C,const word * A,word B,unsigned int N)517 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
518 {
519     word carry=0;
520     for(unsigned i=0; i<N; i++)
521     {
522         DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
523         C[i] = p.GetLowHalf();
524         carry = p.GetHighHalf();
525     }
526     return carry;
527 }
528 
529 
AtomicInverseModPower2(word A)530 static word AtomicInverseModPower2(word A)
531 {
532     word R=A%8;
533 
534     for (unsigned i=3; i<WORD_BITS; i*=2)
535         R = R*(2-R*A);
536 
537     return R;
538 }
539 
540 
541 // ********************************************************
542 
543 class Portable
544 {
545 public:
546     static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
547                                    unsigned int N);
548     static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word*B,
549                                         unsigned int N);
550     static void TAOCRYPT_CDECL Multiply2(word *C, const word *A, const word *B);
551     static word TAOCRYPT_CDECL Multiply2Add(word *C,
552                                             const word *A, const word *B);
553     static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
554     static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
MultiplyRecursionLimit()555     static unsigned int TAOCRYPT_CDECL MultiplyRecursionLimit() {return 8;}
556 
557     static void TAOCRYPT_CDECL Multiply2Bottom(word *C, const word *A,
558                                                const word *B);
559     static void TAOCRYPT_CDECL Multiply4Bottom(word *C, const word *A,
560                                                const word *B);
561     static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
562                                                const word *B);
MultiplyBottomRecursionLimit()563     static unsigned int TAOCRYPT_CDECL MultiplyBottomRecursionLimit(){return 8;}
564 
565     static void TAOCRYPT_CDECL Square2(word *R, const word *A);
566     static void TAOCRYPT_CDECL Square4(word *R, const word *A);
SquareRecursionLimit()567     static unsigned int TAOCRYPT_CDECL SquareRecursionLimit() {return 4;}
568 };
569 
Add(word * C,const word * A,const word * B,unsigned int N)570 word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
571 {
572     DWord u(0, 0);
573     for (unsigned int i = 0; i < N; i+=2)
574     {
575         u = DWord(A[i]) + B[i] + u.GetHighHalf();
576         C[i] = u.GetLowHalf();
577         u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
578         C[i+1] = u.GetLowHalf();
579     }
580     return u.GetHighHalf();
581 }
582 
Subtract(word * C,const word * A,const word * B,unsigned int N)583 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
584 {
585     DWord u(0, 0);
586     for (unsigned int i = 0; i < N; i+=2)
587     {
588         u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
589         C[i] = u.GetLowHalf();
590         u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
591         C[i+1] = u.GetLowHalf();
592     }
593     return 0-u.GetHighHalf();
594 }
595 
Multiply2(word * C,const word * A,const word * B)596 void Portable::Multiply2(word *C, const word *A, const word *B)
597 {
598 /*
599     word s;
600     dword d;
601 
602     if (A1 >= A0)
603         if (B0 >= B1)
604         {
605             s = 0;
606             d = (dword)(A1-A0)*(B0-B1);
607         }
608         else
609         {
610             s = (A1-A0);
611             d = (dword)s*(word)(B0-B1);
612         }
613     else
614         if (B0 > B1)
615         {
616             s = (B0-B1);
617             d = (word)(A1-A0)*(dword)s;
618         }
619         else
620         {
621             s = 0;
622             d = (dword)(A0-A1)*(B1-B0);
623         }
624 */
625     // this segment is the branchless equivalent of above
626     word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
627     unsigned int ai = A[1] < A[0];
628     unsigned int bi = B[0] < B[1];
629     unsigned int di = ai & bi;
630     DWord d = DWord::Multiply(D[di], D[di+2]);
631     D[1] = D[3] = 0;
632     unsigned int si = ai + !bi;
633     word s = D[si];
634 
635     DWord A0B0 = DWord::Multiply(A[0], B[0]);
636     C[0] = A0B0.GetLowHalf();
637 
638     DWord A1B1 = DWord::Multiply(A[1], B[1]);
639     DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
640                        + A1B1.GetLowHalf();
641     C[1] = t.GetLowHalf();
642 
643     t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
644              + A1B1.GetHighHalf() - s;
645     C[2] = t.GetLowHalf();
646     C[3] = t.GetHighHalf();
647 }
648 
Multiply2Bottom(word * C,const word * A,const word * B)649 void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
650 {
651     DWord t = DWord::Multiply(A[0], B[0]);
652     C[0] = t.GetLowHalf();
653     C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
654 }
655 
Multiply2Add(word * C,const word * A,const word * B)656 word Portable::Multiply2Add(word *C, const word *A, const word *B)
657 {
658     word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
659     unsigned int ai = A[1] < A[0];
660     unsigned int bi = B[0] < B[1];
661     unsigned int di = ai & bi;
662     DWord d = DWord::Multiply(D[di], D[di+2]);
663     D[1] = D[3] = 0;
664     unsigned int si = ai + !bi;
665     word s = D[si];
666 
667     DWord A0B0 = DWord::Multiply(A[0], B[0]);
668     DWord t = A0B0 + C[0];
669     C[0] = t.GetLowHalf();
670 
671     DWord A1B1 = DWord::Multiply(A[1], B[1]);
672     t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
673         A1B1.GetLowHalf() + C[1];
674     C[1] = t.GetLowHalf();
675 
676     t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
677         d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
678     C[2] = t.GetLowHalf();
679 
680     t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
681     C[3] = t.GetLowHalf();
682     return t.GetHighHalf();
683 }
684 
685 
686 #define MulAcc(x, y)                                \
687     p = DWord::MultiplyAndAdd(A[x], B[y], c);       \
688     c = p.GetLowHalf();                             \
689     p = (DWord) d + p.GetHighHalf();                \
690     d = p.GetLowHalf();                             \
691     e += p.GetHighHalf();
692 
693 #define SaveMulAcc(s, x, y)                         \
694     R[s] = c;                                       \
695     p = DWord::MultiplyAndAdd(A[x], B[y], d);       \
696     c = p.GetLowHalf();                             \
697     p = (DWord) e + p.GetHighHalf();                \
698     d = p.GetLowHalf();                             \
699     e = p.GetHighHalf();
700 
701 #define SquAcc(x, y)                                \
702     q = DWord::Multiply(A[x], A[y]);                \
703     p = q + c;                                      \
704     c = p.GetLowHalf();                             \
705     p = (DWord) d + p.GetHighHalf();                \
706     d = p.GetLowHalf();                             \
707     e += p.GetHighHalf();                           \
708     p = q + c;                                      \
709     c = p.GetLowHalf();                             \
710     p = (DWord) d + p.GetHighHalf();                \
711     d = p.GetLowHalf();                             \
712     e += p.GetHighHalf();
713 
714 #define SaveSquAcc(s, x, y)                         \
715     R[s] = c;                                       \
716     q = DWord::Multiply(A[x], A[y]);                \
717     p = q + d;                                      \
718     c = p.GetLowHalf();                             \
719     p = (DWord) e + p.GetHighHalf();                \
720     d = p.GetLowHalf();                             \
721     e = p.GetHighHalf();                            \
722     p = q + c;                                      \
723     c = p.GetLowHalf();                             \
724     p = (DWord) d + p.GetHighHalf();                \
725     d = p.GetLowHalf();                             \
726     e += p.GetHighHalf();
727 
728 
Multiply4(word * R,const word * A,const word * B)729 void Portable::Multiply4(word *R, const word *A, const word *B)
730 {
731     DWord p;
732     word c, d, e;
733 
734     p = DWord::Multiply(A[0], B[0]);
735     R[0] = p.GetLowHalf();
736     c = p.GetHighHalf();
737     d = e = 0;
738 
739     MulAcc(0, 1);
740     MulAcc(1, 0);
741 
742     SaveMulAcc(1, 2, 0);
743     MulAcc(1, 1);
744     MulAcc(0, 2);
745 
746     SaveMulAcc(2, 0, 3);
747     MulAcc(1, 2);
748     MulAcc(2, 1);
749     MulAcc(3, 0);
750 
751     SaveMulAcc(3, 3, 1);
752     MulAcc(2, 2);
753     MulAcc(1, 3);
754 
755     SaveMulAcc(4, 2, 3);
756     MulAcc(3, 2);
757 
758     R[5] = c;
759     p = DWord::MultiplyAndAdd(A[3], B[3], d);
760     R[6] = p.GetLowHalf();
761     R[7] = e + p.GetHighHalf();
762 }
763 
Square2(word * R,const word * A)764 void Portable::Square2(word *R, const word *A)
765 {
766     DWord p, q;
767     word c, d, e;
768 
769     p = DWord::Multiply(A[0], A[0]);
770     R[0] = p.GetLowHalf();
771     c = p.GetHighHalf();
772     d = e = 0;
773 
774     SquAcc(0, 1);
775 
776     R[1] = c;
777     p = DWord::MultiplyAndAdd(A[1], A[1], d);
778     R[2] = p.GetLowHalf();
779     R[3] = e + p.GetHighHalf();
780 }
781 
Square4(word * R,const word * A)782 void Portable::Square4(word *R, const word *A)
783 {
784 #ifdef _MSC_VER
785     // VC60 workaround: MSVC 6.0 has an optimization bug that makes
786     // (dword)A*B where either A or B has been cast to a dword before
787     // very expensive. Revisit this function when this
788     // bug is fixed.
789     Multiply4(R, A, A);
790 #else
791     const word *B = A;
792     DWord p, q;
793     word c, d, e;
794 
795     p = DWord::Multiply(A[0], A[0]);
796     R[0] = p.GetLowHalf();
797     c = p.GetHighHalf();
798     d = e = 0;
799 
800     SquAcc(0, 1);
801 
802     SaveSquAcc(1, 2, 0);
803     MulAcc(1, 1);
804 
805     SaveSquAcc(2, 0, 3);
806     SquAcc(1, 2);
807 
808     SaveSquAcc(3, 3, 1);
809     MulAcc(2, 2);
810 
811     SaveSquAcc(4, 2, 3);
812 
813     R[5] = c;
814     p = DWord::MultiplyAndAdd(A[3], A[3], d);
815     R[6] = p.GetLowHalf();
816     R[7] = e + p.GetHighHalf();
817 #endif
818 }
819 
Multiply8(word * R,const word * A,const word * B)820 void Portable::Multiply8(word *R, const word *A, const word *B)
821 {
822     DWord p;
823     word c, d, e;
824 
825     p = DWord::Multiply(A[0], B[0]);
826     R[0] = p.GetLowHalf();
827     c = p.GetHighHalf();
828     d = e = 0;
829 
830     MulAcc(0, 1);
831     MulAcc(1, 0);
832 
833     SaveMulAcc(1, 2, 0);
834     MulAcc(1, 1);
835     MulAcc(0, 2);
836 
837     SaveMulAcc(2, 0, 3);
838     MulAcc(1, 2);
839     MulAcc(2, 1);
840     MulAcc(3, 0);
841 
842     SaveMulAcc(3, 0, 4);
843     MulAcc(1, 3);
844     MulAcc(2, 2);
845     MulAcc(3, 1);
846     MulAcc(4, 0);
847 
848     SaveMulAcc(4, 0, 5);
849     MulAcc(1, 4);
850     MulAcc(2, 3);
851     MulAcc(3, 2);
852     MulAcc(4, 1);
853     MulAcc(5, 0);
854 
855     SaveMulAcc(5, 0, 6);
856     MulAcc(1, 5);
857     MulAcc(2, 4);
858     MulAcc(3, 3);
859     MulAcc(4, 2);
860     MulAcc(5, 1);
861     MulAcc(6, 0);
862 
863     SaveMulAcc(6, 0, 7);
864     MulAcc(1, 6);
865     MulAcc(2, 5);
866     MulAcc(3, 4);
867     MulAcc(4, 3);
868     MulAcc(5, 2);
869     MulAcc(6, 1);
870     MulAcc(7, 0);
871 
872     SaveMulAcc(7, 1, 7);
873     MulAcc(2, 6);
874     MulAcc(3, 5);
875     MulAcc(4, 4);
876     MulAcc(5, 3);
877     MulAcc(6, 2);
878     MulAcc(7, 1);
879 
880     SaveMulAcc(8, 2, 7);
881     MulAcc(3, 6);
882     MulAcc(4, 5);
883     MulAcc(5, 4);
884     MulAcc(6, 3);
885     MulAcc(7, 2);
886 
887     SaveMulAcc(9, 3, 7);
888     MulAcc(4, 6);
889     MulAcc(5, 5);
890     MulAcc(6, 4);
891     MulAcc(7, 3);
892 
893     SaveMulAcc(10, 4, 7);
894     MulAcc(5, 6);
895     MulAcc(6, 5);
896     MulAcc(7, 4);
897 
898     SaveMulAcc(11, 5, 7);
899     MulAcc(6, 6);
900     MulAcc(7, 5);
901 
902     SaveMulAcc(12, 6, 7);
903     MulAcc(7, 6);
904 
905     R[13] = c;
906     p = DWord::MultiplyAndAdd(A[7], B[7], d);
907     R[14] = p.GetLowHalf();
908     R[15] = e + p.GetHighHalf();
909 }
910 
Multiply4Bottom(word * R,const word * A,const word * B)911 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
912 {
913     DWord p;
914     word c, d, e;
915 
916     p = DWord::Multiply(A[0], B[0]);
917     R[0] = p.GetLowHalf();
918     c = p.GetHighHalf();
919     d = e = 0;
920 
921     MulAcc(0, 1);
922     MulAcc(1, 0);
923 
924     SaveMulAcc(1, 2, 0);
925     MulAcc(1, 1);
926     MulAcc(0, 2);
927 
928     R[2] = c;
929     R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
930 }
931 
Multiply8Bottom(word * R,const word * A,const word * B)932 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
933 {
934     DWord p;
935     word c, d, e;
936 
937     p = DWord::Multiply(A[0], B[0]);
938     R[0] = p.GetLowHalf();
939     c = p.GetHighHalf();
940     d = e = 0;
941 
942     MulAcc(0, 1);
943     MulAcc(1, 0);
944 
945     SaveMulAcc(1, 2, 0);
946     MulAcc(1, 1);
947     MulAcc(0, 2);
948 
949     SaveMulAcc(2, 0, 3);
950     MulAcc(1, 2);
951     MulAcc(2, 1);
952     MulAcc(3, 0);
953 
954     SaveMulAcc(3, 0, 4);
955     MulAcc(1, 3);
956     MulAcc(2, 2);
957     MulAcc(3, 1);
958     MulAcc(4, 0);
959 
960     SaveMulAcc(4, 0, 5);
961     MulAcc(1, 4);
962     MulAcc(2, 3);
963     MulAcc(3, 2);
964     MulAcc(4, 1);
965     MulAcc(5, 0);
966 
967     SaveMulAcc(5, 0, 6);
968     MulAcc(1, 5);
969     MulAcc(2, 4);
970     MulAcc(3, 3);
971     MulAcc(4, 2);
972     MulAcc(5, 1);
973     MulAcc(6, 0);
974 
975     R[6] = c;
976     R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
977                A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
978 }
979 
980 
981 #undef MulAcc
982 #undef SaveMulAcc
983 #undef SquAcc
984 #undef SaveSquAcc
985 
986 // optimized
987 
988 #ifdef TAOCRYPT_X86ASM_AVAILABLE
989 
990 // ************** x86 feature detection ***************
991 
992 
993 #ifdef SSE2_INTRINSICS_AVAILABLE
994 
995 #ifndef _MSC_VER
996     static jmp_buf s_env;
SigIllHandler(int)997     static void SigIllHandler(int)
998     {
999         longjmp(s_env, 1);
1000     }
1001 #endif
1002 
HasSSE2()1003 static bool HasSSE2()
1004 {
1005     if (!IsPentium())
1006         return false;
1007 
1008     word32 cpuid[4];
1009     CpuId(1, cpuid);
1010     if ((cpuid[3] & (1 << 26)) == 0)
1011         return false;
1012 
1013 #ifdef _MSC_VER
1014     __try
1015     {
1016         __asm xorpd xmm0, xmm0        // executing SSE2 instruction
1017     }
1018     __except (1)
1019     {
1020         return false;
1021     }
1022     return true;
1023 #else
1024     typedef void (*SigHandler)(int);
1025 
1026     SigHandler oldHandler = signal(SIGILL, SigIllHandler);
1027     if (oldHandler == SIG_ERR)
1028         return false;
1029 
1030     bool result = true;
1031     if (setjmp(s_env))
1032         result = false;
1033     else
1034         __asm __volatile ("xorpd %xmm0, %xmm0");
1035 
1036     signal(SIGILL, oldHandler);
1037     return result;
1038 #endif
1039 }
1040 #endif // SSE2_INTRINSICS_AVAILABLE
1041 
1042 
IsP4()1043 static bool IsP4()
1044 {
1045     if (!IsPentium())
1046         return false;
1047 
1048     word32 cpuid[4];
1049 
1050     CpuId(1, cpuid);
1051     return ((cpuid[0] >> 8) & 0xf) == 0xf;
1052 }
1053 
1054 // ************** Pentium/P4 optimizations ***************
1055 
1056 class PentiumOptimized : public Portable
1057 {
1058 public:
1059     static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1060                                    unsigned int N);
1061     static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1062                                         unsigned int N);
1063     static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1064                                          const word *B);
1065     static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1066                                          const word *B);
1067     static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1068                                                const word *B);
1069 };
1070 
1071 class P4Optimized
1072 {
1073 public:
1074     static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1075                                    unsigned int N);
1076     static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1077                                         unsigned int N);
1078 #ifdef SSE2_INTRINSICS_AVAILABLE
1079     static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1080                                          const word *B);
1081     static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1082                                          const word *B);
1083     static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1084                                                const word *B);
1085 #endif
1086 };
1087 
1088 typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
1089                                         unsigned int N);
1090 typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
1091 
1092 static PAddSub s_pAdd, s_pSub;
1093 #ifdef SSE2_INTRINSICS_AVAILABLE
1094 static PMul s_pMul4, s_pMul8, s_pMul8B;
1095 #endif
1096 
SetPentiumFunctionPointers()1097 static void SetPentiumFunctionPointers()
1098 {
1099     if (!IsPentium())
1100     {
1101         s_pAdd = &Portable::Add;
1102         s_pSub = &Portable::Subtract;
1103     }
1104     else if (IsP4())
1105     {
1106         s_pAdd = &P4Optimized::Add;
1107         s_pSub = &P4Optimized::Subtract;
1108     }
1109     else
1110     {
1111         s_pAdd = &PentiumOptimized::Add;
1112         s_pSub = &PentiumOptimized::Subtract;
1113     }
1114 
1115 #ifdef SSE2_INTRINSICS_AVAILABLE
1116     if (!IsPentium())
1117     {
1118         s_pMul4 = &Portable::Multiply4;
1119         s_pMul8 = &Portable::Multiply8;
1120         s_pMul8B = &Portable::Multiply8Bottom;
1121     }
1122     else if (HasSSE2())
1123     {
1124         s_pMul4 = &P4Optimized::Multiply4;
1125         s_pMul8 = &P4Optimized::Multiply8;
1126         s_pMul8B = &P4Optimized::Multiply8Bottom;
1127     }
1128     else
1129     {
1130         s_pMul4 = &PentiumOptimized::Multiply4;
1131         s_pMul8 = &PentiumOptimized::Multiply8;
1132         s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1133     }
1134 #endif
1135 }
1136 
1137 static const char s_RunAtStartupSetPentiumFunctionPointers =
1138     (SetPentiumFunctionPointers(), 0);
1139 
1140 
1141 class LowLevel : public PentiumOptimized
1142 {
1143 public:
Add(word * C,const word * A,const word * B,unsigned int N)1144     inline static word Add(word *C, const word *A, const word *B,
1145                            unsigned int N)
1146         {return s_pAdd(C, A, B, N);}
Subtract(word * C,const word * A,const word * B,unsigned int N)1147     inline static word Subtract(word *C, const word *A, const word *B,
1148                                 unsigned int N)
1149         {return s_pSub(C, A, B, N);}
Square4(word * R,const word * A)1150     inline static void Square4(word *R, const word *A)
1151         {Multiply4(R, A, A);}
1152 #ifdef SSE2_INTRINSICS_AVAILABLE
Multiply4(word * C,const word * A,const word * B)1153     inline static void Multiply4(word *C, const word *A, const word *B)
1154         {s_pMul4(C, A, B);}
Multiply8(word * C,const word * A,const word * B)1155     inline static void Multiply8(word *C, const word *A, const word *B)
1156         {s_pMul8(C, A, B);}
Multiply8Bottom(word * C,const word * A,const word * B)1157     inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1158         {s_pMul8B(C, A, B);}
1159 #endif
1160 };
1161 
1162 // use some tricks to share assembly code between MSVC and GCC
1163 #ifdef _MSC_VER
1164     #define TAOCRYPT_NAKED __declspec(naked)
1165     #define AS1(x) __asm x
1166     #define AS2(x, y) __asm x, y
1167     #define AddPrologue \
1168         __asm	push ebp \
1169         __asm	push ebx \
1170         __asm	push esi \
1171         __asm	push edi \
1172         __asm	mov		ecx, [esp+20] \
1173         __asm	mov		edx, [esp+24] \
1174         __asm	mov		ebx, [esp+28] \
1175         __asm	mov		esi, [esp+32]
1176     #define AddEpilogue \
1177         __asm	pop edi \
1178         __asm	pop esi \
1179         __asm	pop ebx \
1180         __asm	pop ebp \
1181         __asm	ret
1182     #define MulPrologue \
1183         __asm	push ebp \
1184         __asm	push ebx \
1185         __asm	push esi \
1186         __asm	push edi \
1187         __asm	mov ecx, [esp+28] \
1188         __asm	mov esi, [esp+24] \
1189         __asm	push [esp+20]
1190     #define MulEpilogue \
1191         __asm	add esp, 4 \
1192         __asm	pop edi \
1193         __asm	pop esi \
1194         __asm	pop ebx \
1195         __asm	pop ebp \
1196         __asm	ret
1197 #else
1198     #define TAOCRYPT_NAKED
1199     #define AS1(x) #x ";"
1200     #define AS2(x, y) #x ", " #y ";"
1201     #define AddPrologue \
1202         __asm__ __volatile__ \
1203         ( \
1204             "push %%ebx;"	/* save this manually, in case of -fPIC */ \
1205             "mov %2, %%ebx;" \
1206             ".intel_syntax noprefix;" \
1207             "push ebp;"
1208     #define AddEpilogue \
1209             "pop ebp;" \
1210             ".att_syntax prefix;" \
1211             "pop %%ebx;" \
1212                     : \
1213                     : "c" (C), "d" (A), "m" (B), "S" (N) \
1214                     : "%edi", "memory", "cc" \
1215         );
1216     #define MulPrologue \
1217         __asm__ __volatile__ \
1218         ( \
1219             "push %%ebx;"	/* save this manually, in case of -fPIC */ \
1220             "push %%ebp;" \
1221             "push %0;" \
1222             ".intel_syntax noprefix;"
1223     #define MulEpilogue \
1224             "add esp, 4;" \
1225             "pop ebp;" \
1226             "pop ebx;" \
1227             ".att_syntax prefix;" \
1228             : \
1229             : "rm" (Z), "S" (X), "c" (Y) \
1230             : "%eax", "%edx", "%edi", "memory", "cc" \
1231         );
1232 #endif
1233 
Add(word * C,const word * A,const word * B,unsigned int N)1234 TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
1235                                           const word *B, unsigned int N)
1236 {
1237     AddPrologue
1238 
1239     // now: ebx = B, ecx = C, edx = A, esi = N
1240     AS2(    sub ecx, edx)           // hold the distance between C & A so we
1241                                     // can add this to A to get C
1242     AS2(    xor eax, eax)           // clear eax
1243 
1244     AS2(    sub eax, esi)           // eax is a negative index from end of B
1245     AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
1246 
1247     AS2(    sar eax, 1)             // unit of eax is now dwords; this also
1248                                     // clears the carry flag
1249     AS1(    jz  loopendAdd)         // if no dwords then nothing to do
1250 
1251     AS1(loopstartAdd:)
1252     AS2(    mov    esi,[edx])           // load lower word of A
1253     AS2(    mov    ebp,[edx+4])         // load higher word of A
1254 
1255     AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
1256     AS2(    lea    edx,[edx+8])         // advance A and C
1257 
1258     AS2(    adc    esi,edi)             // add lower words
1259     AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
1260 
1261     AS2(    adc    ebp,edi)             // add higher words
1262     AS1(    inc    eax)                 // advance B
1263 
1264     AS2(    mov    [edx+ecx-8],esi)     // store lower word result
1265     AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
1266 
1267     AS1(    jnz    loopstartAdd)   // loop until eax overflows and becomes zero
1268 
1269     AS1(loopendAdd:)
1270     AS2(    adc eax, 0)     // store carry into eax (return result register)
1271 
1272     AddEpilogue
1273 }
1274 
Subtract(word * C,const word * A,const word * B,unsigned int N)1275 TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
1276                                                const word *B, unsigned int N)
1277 {
1278     AddPrologue
1279 
1280     // now: ebx = B, ecx = C, edx = A, esi = N
1281     AS2(    sub ecx, edx)           // hold the distance between C & A so we
1282                                     // can add this to A to get C
1283     AS2(    xor eax, eax)           // clear eax
1284 
1285     AS2(    sub eax, esi)           // eax is a negative index from end of B
1286     AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
1287 
1288     AS2(    sar eax, 1)             // unit of eax is now dwords; this also
1289                                     // clears the carry flag
1290     AS1(    jz  loopendSub)         // if no dwords then nothing to do
1291 
1292     AS1(loopstartSub:)
1293     AS2(    mov    esi,[edx])           // load lower word of A
1294     AS2(    mov    ebp,[edx+4])         // load higher word of A
1295 
1296     AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
1297     AS2(    lea    edx,[edx+8])         // advance A and C
1298 
1299     AS2(    sbb    esi,edi)             // subtract lower words
1300     AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
1301 
1302     AS2(    sbb    ebp,edi)             // subtract higher words
1303     AS1(    inc    eax)                 // advance B
1304 
1305     AS2(    mov    [edx+ecx-8],esi)     // store lower word result
1306     AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
1307 
1308     AS1(    jnz    loopstartSub)   // loop until eax overflows and becomes zero
1309 
1310     AS1(loopendSub:)
1311     AS2(    adc eax, 0)     // store carry into eax (return result register)
1312 
1313     AddEpilogue
1314 }
1315 
1316 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1317 
Add(word * C,const word * A,const word * B,unsigned int N)1318 TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
1319                                      unsigned int N)
1320 {
1321     AddPrologue
1322 
1323     // now: ebx = B, ecx = C, edx = A, esi = N
1324     AS2(    xor     eax, eax)
1325     AS1(    neg     esi)
1326     AS1(    jz      loopendAddP4)       // if no dwords then nothing to do
1327 
1328     AS2(    mov     edi, [edx])
1329     AS2(    mov     ebp, [ebx])
1330     AS1(    jmp     carry1AddP4)
1331 
1332     AS1(loopstartAddP4:)
1333     AS2(    mov     edi, [edx+8])
1334     AS2(    add     ecx, 8)
1335     AS2(    add     edx, 8)
1336     AS2(    mov     ebp, [ebx])
1337     AS2(    add     edi, eax)
1338     AS1(    jc      carry1AddP4)
1339     AS2(    xor     eax, eax)
1340 
1341     AS1(carry1AddP4:)
1342     AS2(    add     edi, ebp)
1343     AS2(    mov     ebp, 1)
1344     AS2(    mov     [ecx], edi)
1345     AS2(    mov     edi, [edx+4])
1346     AS2(    cmovc   eax, ebp)
1347     AS2(    mov     ebp, [ebx+4])
1348     AS2(    add     ebx, 8)
1349     AS2(    add     edi, eax)
1350     AS1(    jc      carry2AddP4)
1351     AS2(    xor     eax, eax)
1352 
1353     AS1(carry2AddP4:)
1354     AS2(    add     edi, ebp)
1355     AS2(    mov     ebp, 1)
1356     AS2(    cmovc   eax, ebp)
1357     AS2(    mov     [ecx+4], edi)
1358     AS2(    add     esi, 2)
1359     AS1(    jnz     loopstartAddP4)
1360 
1361     AS1(loopendAddP4:)
1362 
1363     AddEpilogue
1364 }
1365 
Subtract(word * C,const word * A,const word * B,unsigned int N)1366 TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
1367                                           const word *B, unsigned int N)
1368 {
1369     AddPrologue
1370 
1371     // now: ebx = B, ecx = C, edx = A, esi = N
1372     AS2(    xor     eax, eax)
1373     AS1(    neg     esi)
1374     AS1(    jz      loopendSubP4)       // if no dwords then nothing to do
1375 
1376     AS2(    mov     edi, [edx])
1377     AS2(    mov     ebp, [ebx])
1378     AS1(    jmp     carry1SubP4)
1379 
1380     AS1(loopstartSubP4:)
1381     AS2(    mov     edi, [edx+8])
1382     AS2(    add     edx, 8)
1383     AS2(    add     ecx, 8)
1384     AS2(    mov     ebp, [ebx])
1385     AS2(    sub     edi, eax)
1386     AS1(    jc      carry1SubP4)
1387     AS2(    xor     eax, eax)
1388 
1389     AS1(carry1SubP4:)
1390     AS2(    sub     edi, ebp)
1391     AS2(    mov     ebp, 1)
1392     AS2(    mov     [ecx], edi)
1393     AS2(    mov     edi, [edx+4])
1394     AS2(    cmovc   eax, ebp)
1395     AS2(    mov     ebp, [ebx+4])
1396     AS2(    add     ebx, 8)
1397     AS2(    sub     edi, eax)
1398     AS1(    jc      carry2SubP4)
1399     AS2(    xor     eax, eax)
1400 
1401     AS1(carry2SubP4:)
1402     AS2(    sub     edi, ebp)
1403     AS2(    mov     ebp, 1)
1404     AS2(    cmovc   eax, ebp)
1405     AS2(    mov     [ecx+4], edi)
1406     AS2(    add     esi, 2)
1407     AS1(    jnz     loopstartSubP4)
1408 
1409     AS1(loopendSubP4:)
1410 
1411     AddEpilogue
1412 }
1413 
1414 // multiply assembly code originally contributed by Leonard Janke
1415 
1416 #define MulStartup \
1417     AS2(xor ebp, ebp) \
1418     AS2(xor edi, edi) \
1419     AS2(xor ebx, ebx)
1420 
1421 #define MulShiftCarry \
1422     AS2(mov ebp, edx) \
1423     AS2(mov edi, ebx) \
1424     AS2(xor ebx, ebx)
1425 
1426 #define MulAccumulateBottom(i,j) \
1427     AS2(mov eax, [ecx+4*j]) \
1428     AS2(imul eax, dword ptr [esi+4*i]) \
1429     AS2(add ebp, eax)
1430 
1431 #define MulAccumulate(i,j) \
1432     AS2(mov eax, [ecx+4*j]) \
1433     AS1(mul dword ptr [esi+4*i]) \
1434     AS2(add ebp, eax) \
1435     AS2(adc edi, edx) \
1436     AS2(adc bl, bh)
1437 
1438 #define MulStoreDigit(i)  \
1439     AS2(mov edx, edi) \
1440     AS2(mov edi, [esp]) \
1441     AS2(mov [edi+4*i], ebp)
1442 
1443 #define MulLastDiagonal(digits) \
1444     AS2(mov eax, [ecx+4*(digits-1)]) \
1445     AS1(mul dword ptr [esi+4*(digits-1)]) \
1446     AS2(add ebp, eax) \
1447     AS2(adc edx, edi) \
1448     AS2(mov edi, [esp]) \
1449     AS2(mov [edi+4*(2*digits-2)], ebp) \
1450     AS2(mov [edi+4*(2*digits-1)], edx)
1451 
Multiply4(word * Z,const word * X,const word * Y)1452 TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
1453                                                 const word* Y)
1454 {
1455     MulPrologue
1456     // now: [esp] = Z, esi = X, ecx = Y
1457     MulStartup
1458     MulAccumulate(0,0)
1459     MulStoreDigit(0)
1460     MulShiftCarry
1461 
1462     MulAccumulate(1,0)
1463     MulAccumulate(0,1)
1464     MulStoreDigit(1)
1465     MulShiftCarry
1466 
1467     MulAccumulate(2,0)
1468     MulAccumulate(1,1)
1469     MulAccumulate(0,2)
1470     MulStoreDigit(2)
1471     MulShiftCarry
1472 
1473     MulAccumulate(3,0)
1474     MulAccumulate(2,1)
1475     MulAccumulate(1,2)
1476     MulAccumulate(0,3)
1477     MulStoreDigit(3)
1478     MulShiftCarry
1479 
1480     MulAccumulate(3,1)
1481     MulAccumulate(2,2)
1482     MulAccumulate(1,3)
1483     MulStoreDigit(4)
1484     MulShiftCarry
1485 
1486     MulAccumulate(3,2)
1487     MulAccumulate(2,3)
1488     MulStoreDigit(5)
1489     MulShiftCarry
1490 
1491     MulLastDiagonal(4)
1492     MulEpilogue
1493 }
1494 
Multiply8(word * Z,const word * X,const word * Y)1495 TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
1496                                                 const word* Y)
1497 {
1498     MulPrologue
1499     // now: [esp] = Z, esi = X, ecx = Y
1500     MulStartup
1501     MulAccumulate(0,0)
1502     MulStoreDigit(0)
1503     MulShiftCarry
1504 
1505     MulAccumulate(1,0)
1506     MulAccumulate(0,1)
1507     MulStoreDigit(1)
1508     MulShiftCarry
1509 
1510     MulAccumulate(2,0)
1511     MulAccumulate(1,1)
1512     MulAccumulate(0,2)
1513     MulStoreDigit(2)
1514     MulShiftCarry
1515 
1516     MulAccumulate(3,0)
1517     MulAccumulate(2,1)
1518     MulAccumulate(1,2)
1519     MulAccumulate(0,3)
1520     MulStoreDigit(3)
1521     MulShiftCarry
1522 
1523     MulAccumulate(4,0)
1524     MulAccumulate(3,1)
1525     MulAccumulate(2,2)
1526     MulAccumulate(1,3)
1527     MulAccumulate(0,4)
1528     MulStoreDigit(4)
1529     MulShiftCarry
1530 
1531     MulAccumulate(5,0)
1532     MulAccumulate(4,1)
1533     MulAccumulate(3,2)
1534     MulAccumulate(2,3)
1535     MulAccumulate(1,4)
1536     MulAccumulate(0,5)
1537     MulStoreDigit(5)
1538     MulShiftCarry
1539 
1540     MulAccumulate(6,0)
1541     MulAccumulate(5,1)
1542     MulAccumulate(4,2)
1543     MulAccumulate(3,3)
1544     MulAccumulate(2,4)
1545     MulAccumulate(1,5)
1546     MulAccumulate(0,6)
1547     MulStoreDigit(6)
1548     MulShiftCarry
1549 
1550     MulAccumulate(7,0)
1551     MulAccumulate(6,1)
1552     MulAccumulate(5,2)
1553     MulAccumulate(4,3)
1554     MulAccumulate(3,4)
1555     MulAccumulate(2,5)
1556     MulAccumulate(1,6)
1557     MulAccumulate(0,7)
1558     MulStoreDigit(7)
1559     MulShiftCarry
1560 
1561     MulAccumulate(7,1)
1562     MulAccumulate(6,2)
1563     MulAccumulate(5,3)
1564     MulAccumulate(4,4)
1565     MulAccumulate(3,5)
1566     MulAccumulate(2,6)
1567     MulAccumulate(1,7)
1568     MulStoreDigit(8)
1569     MulShiftCarry
1570 
1571     MulAccumulate(7,2)
1572     MulAccumulate(6,3)
1573     MulAccumulate(5,4)
1574     MulAccumulate(4,5)
1575     MulAccumulate(3,6)
1576     MulAccumulate(2,7)
1577     MulStoreDigit(9)
1578     MulShiftCarry
1579 
1580     MulAccumulate(7,3)
1581     MulAccumulate(6,4)
1582     MulAccumulate(5,5)
1583     MulAccumulate(4,6)
1584     MulAccumulate(3,7)
1585     MulStoreDigit(10)
1586     MulShiftCarry
1587 
1588     MulAccumulate(7,4)
1589     MulAccumulate(6,5)
1590     MulAccumulate(5,6)
1591     MulAccumulate(4,7)
1592     MulStoreDigit(11)
1593     MulShiftCarry
1594 
1595     MulAccumulate(7,5)
1596     MulAccumulate(6,6)
1597     MulAccumulate(5,7)
1598     MulStoreDigit(12)
1599     MulShiftCarry
1600 
1601     MulAccumulate(7,6)
1602     MulAccumulate(6,7)
1603     MulStoreDigit(13)
1604     MulShiftCarry
1605 
1606     MulLastDiagonal(8)
1607     MulEpilogue
1608 }
1609 
Multiply8Bottom(word * Z,const word * X,const word * Y)1610 TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X,
1611                                                       const word* Y)
1612 {
1613     MulPrologue
1614     // now: [esp] = Z, esi = X, ecx = Y
1615     MulStartup
1616     MulAccumulate(0,0)
1617     MulStoreDigit(0)
1618     MulShiftCarry
1619 
1620     MulAccumulate(1,0)
1621     MulAccumulate(0,1)
1622     MulStoreDigit(1)
1623     MulShiftCarry
1624 
1625     MulAccumulate(2,0)
1626     MulAccumulate(1,1)
1627     MulAccumulate(0,2)
1628     MulStoreDigit(2)
1629     MulShiftCarry
1630 
1631     MulAccumulate(3,0)
1632     MulAccumulate(2,1)
1633     MulAccumulate(1,2)
1634     MulAccumulate(0,3)
1635     MulStoreDigit(3)
1636     MulShiftCarry
1637 
1638     MulAccumulate(4,0)
1639     MulAccumulate(3,1)
1640     MulAccumulate(2,2)
1641     MulAccumulate(1,3)
1642     MulAccumulate(0,4)
1643     MulStoreDigit(4)
1644     MulShiftCarry
1645 
1646     MulAccumulate(5,0)
1647     MulAccumulate(4,1)
1648     MulAccumulate(3,2)
1649     MulAccumulate(2,3)
1650     MulAccumulate(1,4)
1651     MulAccumulate(0,5)
1652     MulStoreDigit(5)
1653     MulShiftCarry
1654 
1655     MulAccumulate(6,0)
1656     MulAccumulate(5,1)
1657     MulAccumulate(4,2)
1658     MulAccumulate(3,3)
1659     MulAccumulate(2,4)
1660     MulAccumulate(1,5)
1661     MulAccumulate(0,6)
1662     MulStoreDigit(6)
1663     MulShiftCarry
1664 
1665     MulAccumulateBottom(7,0)
1666     MulAccumulateBottom(6,1)
1667     MulAccumulateBottom(5,2)
1668     MulAccumulateBottom(4,3)
1669     MulAccumulateBottom(3,4)
1670     MulAccumulateBottom(2,5)
1671     MulAccumulateBottom(1,6)
1672     MulAccumulateBottom(0,7)
1673     MulStoreDigit(7)
1674     MulEpilogue
1675 }
1676 
1677 #undef AS1
1678 #undef AS2
1679 
1680 #else	// not x86 - no processor specific code at this layer
1681 
1682 typedef Portable LowLevel;
1683 
1684 #endif
1685 
1686 #ifdef SSE2_INTRINSICS_AVAILABLE
1687 
1688 #ifdef __GNUC__
1689 #define TAOCRYPT_FASTCALL
1690 #else
1691 #define TAOCRYPT_FASTCALL __fastcall
1692 #endif
1693 
P4_Mul(__m128i * C,const __m128i * A,const __m128i * B)1694 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A,
1695                                      const __m128i *B)
1696 {
1697     __m128i a3210 = _mm_load_si128(A);
1698     __m128i b3210 = _mm_load_si128(B);
1699 
1700     __m128i sum;
1701 
1702     __m128i z = _mm_setzero_si128();
1703     __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1704     C[0] = a2b2_a0b0;
1705 
1706     __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1707     __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1708     __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1709     __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1710     __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1711     C[1] = _mm_add_epi64(a1b0, a0b1);
1712 
1713     __m128i a31 = _mm_srli_epi64(a3210, 32);
1714     __m128i b31 = _mm_srli_epi64(b3210, 32);
1715     __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1716     C[6] = a3b3_a1b1;
1717 
1718     __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1719     __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1720     __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1721     __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1722     __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1723     sum = _mm_add_epi64(a1b1, a0b2);
1724     C[2] = _mm_add_epi64(sum, a2b0);
1725 
1726     __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1727     __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1728     __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1729     __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1730     __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1731     __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1732     __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1733     __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1734     __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1735     sum = _mm_add_epi64(a2b1, a0b3);
1736     C[3] = _mm_add_epi64(sum, sum1);
1737 
1738     __m128i	a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1739     __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1740     __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1741     __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1742     sum = _mm_add_epi64(a2b2, a3b1);
1743     C[4] = _mm_add_epi64(sum, a1b3);
1744 
1745     __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1746     __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1747     __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1748     __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1749     __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1750     C[5] = _mm_add_epi64(a3b2, a2b3);
1751 }
1752 
Multiply4(word * C,const word * A,const word * B)1753 void P4Optimized::Multiply4(word *C, const word *A, const word *B)
1754 {
1755     __m128i temp[7];
1756     const word *w = (word *)temp;
1757     const __m64 *mw = (__m64 *)w;
1758 
1759     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1760 
1761     C[0] = w[0];
1762 
1763     __m64 s1, s2;
1764 
1765     __m64 w1 = _mm_cvtsi32_si64(w[1]);
1766     __m64 w4 = mw[2];
1767     __m64 w6 = mw[3];
1768     __m64 w8 = mw[4];
1769     __m64 w10 = mw[5];
1770     __m64 w12 = mw[6];
1771     __m64 w14 = mw[7];
1772     __m64 w16 = mw[8];
1773     __m64 w18 = mw[9];
1774     __m64 w20 = mw[10];
1775     __m64 w22 = mw[11];
1776     __m64 w26 = _mm_cvtsi32_si64(w[26]);
1777 
1778     s1 = _mm_add_si64(w1, w4);
1779     C[1] = _mm_cvtsi64_si32(s1);
1780     s1 = _mm_srli_si64(s1, 32);
1781 
1782     s2 = _mm_add_si64(w6, w8);
1783     s1 = _mm_add_si64(s1, s2);
1784     C[2] = _mm_cvtsi64_si32(s1);
1785     s1 = _mm_srli_si64(s1, 32);
1786 
1787     s2 = _mm_add_si64(w10, w12);
1788     s1 = _mm_add_si64(s1, s2);
1789     C[3] = _mm_cvtsi64_si32(s1);
1790     s1 = _mm_srli_si64(s1, 32);
1791 
1792     s2 = _mm_add_si64(w14, w16);
1793     s1 = _mm_add_si64(s1, s2);
1794     C[4] = _mm_cvtsi64_si32(s1);
1795     s1 = _mm_srli_si64(s1, 32);
1796 
1797     s2 = _mm_add_si64(w18, w20);
1798     s1 = _mm_add_si64(s1, s2);
1799     C[5] = _mm_cvtsi64_si32(s1);
1800     s1 = _mm_srli_si64(s1, 32);
1801 
1802     s2 = _mm_add_si64(w22, w26);
1803     s1 = _mm_add_si64(s1, s2);
1804     C[6] = _mm_cvtsi64_si32(s1);
1805     s1 = _mm_srli_si64(s1, 32);
1806 
1807     C[7] = _mm_cvtsi64_si32(s1) + w[27];
1808     _mm_empty();
1809 }
1810 
Multiply8(word * C,const word * A,const word * B)1811 void P4Optimized::Multiply8(word *C, const word *A, const word *B)
1812 {
1813     __m128i temp[28];
1814     const word *w = (word *)temp;
1815     const __m64 *mw = (__m64 *)w;
1816     const word *x = (word *)temp+7*4;
1817     const __m64 *mx = (__m64 *)x;
1818     const word *y = (word *)temp+7*4*2;
1819     const __m64 *my = (__m64 *)y;
1820     const word *z = (word *)temp+7*4*3;
1821     const __m64 *mz = (__m64 *)z;
1822 
1823     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1824 
1825     P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1826 
1827     P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1828 
1829     P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1830 
1831     C[0] = w[0];
1832 
1833     __m64 s1, s2, s3, s4;
1834 
1835     __m64 w1 = _mm_cvtsi32_si64(w[1]);
1836     __m64 w4 = mw[2];
1837     __m64 w6 = mw[3];
1838     __m64 w8 = mw[4];
1839     __m64 w10 = mw[5];
1840     __m64 w12 = mw[6];
1841     __m64 w14 = mw[7];
1842     __m64 w16 = mw[8];
1843     __m64 w18 = mw[9];
1844     __m64 w20 = mw[10];
1845     __m64 w22 = mw[11];
1846     __m64 w26 = _mm_cvtsi32_si64(w[26]);
1847     __m64 w27 = _mm_cvtsi32_si64(w[27]);
1848 
1849     __m64 x0 = _mm_cvtsi32_si64(x[0]);
1850     __m64 x1 = _mm_cvtsi32_si64(x[1]);
1851     __m64 x4 = mx[2];
1852     __m64 x6 = mx[3];
1853     __m64 x8 = mx[4];
1854     __m64 x10 = mx[5];
1855     __m64 x12 = mx[6];
1856     __m64 x14 = mx[7];
1857     __m64 x16 = mx[8];
1858     __m64 x18 = mx[9];
1859     __m64 x20 = mx[10];
1860     __m64 x22 = mx[11];
1861     __m64 x26 = _mm_cvtsi32_si64(x[26]);
1862     __m64 x27 = _mm_cvtsi32_si64(x[27]);
1863 
1864     __m64 y0 = _mm_cvtsi32_si64(y[0]);
1865     __m64 y1 = _mm_cvtsi32_si64(y[1]);
1866     __m64 y4 = my[2];
1867     __m64 y6 = my[3];
1868     __m64 y8 = my[4];
1869     __m64 y10 = my[5];
1870     __m64 y12 = my[6];
1871     __m64 y14 = my[7];
1872     __m64 y16 = my[8];
1873     __m64 y18 = my[9];
1874     __m64 y20 = my[10];
1875     __m64 y22 = my[11];
1876     __m64 y26 = _mm_cvtsi32_si64(y[26]);
1877     __m64 y27 = _mm_cvtsi32_si64(y[27]);
1878 
1879     __m64 z0 = _mm_cvtsi32_si64(z[0]);
1880     __m64 z1 = _mm_cvtsi32_si64(z[1]);
1881     __m64 z4 = mz[2];
1882     __m64 z6 = mz[3];
1883     __m64 z8 = mz[4];
1884     __m64 z10 = mz[5];
1885     __m64 z12 = mz[6];
1886     __m64 z14 = mz[7];
1887     __m64 z16 = mz[8];
1888     __m64 z18 = mz[9];
1889     __m64 z20 = mz[10];
1890     __m64 z22 = mz[11];
1891     __m64 z26 = _mm_cvtsi32_si64(z[26]);
1892 
1893     s1 = _mm_add_si64(w1, w4);
1894     C[1] = _mm_cvtsi64_si32(s1);
1895     s1 = _mm_srli_si64(s1, 32);
1896 
1897     s2 = _mm_add_si64(w6, w8);
1898     s1 = _mm_add_si64(s1, s2);
1899     C[2] = _mm_cvtsi64_si32(s1);
1900     s1 = _mm_srli_si64(s1, 32);
1901 
1902     s2 = _mm_add_si64(w10, w12);
1903     s1 = _mm_add_si64(s1, s2);
1904     C[3] = _mm_cvtsi64_si32(s1);
1905     s1 = _mm_srli_si64(s1, 32);
1906 
1907     s3 = _mm_add_si64(x0, y0);
1908     s2 = _mm_add_si64(w14, w16);
1909     s1 = _mm_add_si64(s1, s3);
1910     s1 = _mm_add_si64(s1, s2);
1911     C[4] = _mm_cvtsi64_si32(s1);
1912     s1 = _mm_srli_si64(s1, 32);
1913 
1914     s3 = _mm_add_si64(x1, y1);
1915     s4 = _mm_add_si64(x4, y4);
1916     s1 = _mm_add_si64(s1, w18);
1917     s3 = _mm_add_si64(s3, s4);
1918     s1 = _mm_add_si64(s1, w20);
1919     s1 = _mm_add_si64(s1, s3);
1920     C[5] = _mm_cvtsi64_si32(s1);
1921     s1 = _mm_srli_si64(s1, 32);
1922 
1923     s3 = _mm_add_si64(x6, y6);
1924     s4 = _mm_add_si64(x8, y8);
1925     s1 = _mm_add_si64(s1, w22);
1926     s3 = _mm_add_si64(s3, s4);
1927     s1 = _mm_add_si64(s1, w26);
1928     s1 = _mm_add_si64(s1, s3);
1929     C[6] = _mm_cvtsi64_si32(s1);
1930     s1 = _mm_srli_si64(s1, 32);
1931 
1932     s3 = _mm_add_si64(x10, y10);
1933     s4 = _mm_add_si64(x12, y12);
1934     s1 = _mm_add_si64(s1, w27);
1935     s3 = _mm_add_si64(s3, s4);
1936     s1 = _mm_add_si64(s1, s3);
1937     C[7] = _mm_cvtsi64_si32(s1);
1938     s1 = _mm_srli_si64(s1, 32);
1939 
1940     s3 = _mm_add_si64(x14, y14);
1941     s4 = _mm_add_si64(x16, y16);
1942     s1 = _mm_add_si64(s1, z0);
1943     s3 = _mm_add_si64(s3, s4);
1944     s1 = _mm_add_si64(s1, s3);
1945     C[8] = _mm_cvtsi64_si32(s1);
1946     s1 = _mm_srli_si64(s1, 32);
1947 
1948     s3 = _mm_add_si64(x18, y18);
1949     s4 = _mm_add_si64(x20, y20);
1950     s1 = _mm_add_si64(s1, z1);
1951     s3 = _mm_add_si64(s3, s4);
1952     s1 = _mm_add_si64(s1, z4);
1953     s1 = _mm_add_si64(s1, s3);
1954     C[9] = _mm_cvtsi64_si32(s1);
1955     s1 = _mm_srli_si64(s1, 32);
1956 
1957     s3 = _mm_add_si64(x22, y22);
1958     s4 = _mm_add_si64(x26, y26);
1959     s1 = _mm_add_si64(s1, z6);
1960     s3 = _mm_add_si64(s3, s4);
1961     s1 = _mm_add_si64(s1, z8);
1962     s1 = _mm_add_si64(s1, s3);
1963     C[10] = _mm_cvtsi64_si32(s1);
1964     s1 = _mm_srli_si64(s1, 32);
1965 
1966     s3 = _mm_add_si64(x27, y27);
1967     s1 = _mm_add_si64(s1, z10);
1968     s1 = _mm_add_si64(s1, z12);
1969     s1 = _mm_add_si64(s1, s3);
1970     C[11] = _mm_cvtsi64_si32(s1);
1971     s1 = _mm_srli_si64(s1, 32);
1972 
1973     s3 = _mm_add_si64(z14, z16);
1974     s1 = _mm_add_si64(s1, s3);
1975     C[12] = _mm_cvtsi64_si32(s1);
1976     s1 = _mm_srli_si64(s1, 32);
1977 
1978     s3 = _mm_add_si64(z18, z20);
1979     s1 = _mm_add_si64(s1, s3);
1980     C[13] = _mm_cvtsi64_si32(s1);
1981     s1 = _mm_srli_si64(s1, 32);
1982 
1983     s3 = _mm_add_si64(z22, z26);
1984     s1 = _mm_add_si64(s1, s3);
1985     C[14] = _mm_cvtsi64_si32(s1);
1986     s1 = _mm_srli_si64(s1, 32);
1987 
1988     C[15] = z[27] + _mm_cvtsi64_si32(s1);
1989     _mm_empty();
1990 }
1991 
Multiply8Bottom(word * C,const word * A,const word * B)1992 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1993 {
1994     __m128i temp[21];
1995     const word *w = (word *)temp;
1996     const __m64 *mw = (__m64 *)w;
1997     const word *x = (word *)temp+7*4;
1998     const __m64 *mx = (__m64 *)x;
1999     const word *y = (word *)temp+7*4*2;
2000     const __m64 *my = (__m64 *)y;
2001 
2002     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
2003 
2004     P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
2005 
2006     P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
2007 
2008     C[0] = w[0];
2009 
2010     __m64 s1, s2, s3, s4;
2011 
2012     __m64 w1 = _mm_cvtsi32_si64(w[1]);
2013     __m64 w4 = mw[2];
2014     __m64 w6 = mw[3];
2015     __m64 w8 = mw[4];
2016     __m64 w10 = mw[5];
2017     __m64 w12 = mw[6];
2018     __m64 w14 = mw[7];
2019     __m64 w16 = mw[8];
2020     __m64 w18 = mw[9];
2021     __m64 w20 = mw[10];
2022     __m64 w22 = mw[11];
2023     __m64 w26 = _mm_cvtsi32_si64(w[26]);
2024 
2025     __m64 x0 = _mm_cvtsi32_si64(x[0]);
2026     __m64 x1 = _mm_cvtsi32_si64(x[1]);
2027     __m64 x4 = mx[2];
2028     __m64 x6 = mx[3];
2029     __m64 x8 = mx[4];
2030 
2031     __m64 y0 = _mm_cvtsi32_si64(y[0]);
2032     __m64 y1 = _mm_cvtsi32_si64(y[1]);
2033     __m64 y4 = my[2];
2034     __m64 y6 = my[3];
2035     __m64 y8 = my[4];
2036 
2037     s1 = _mm_add_si64(w1, w4);
2038     C[1] = _mm_cvtsi64_si32(s1);
2039     s1 = _mm_srli_si64(s1, 32);
2040 
2041     s2 = _mm_add_si64(w6, w8);
2042     s1 = _mm_add_si64(s1, s2);
2043     C[2] = _mm_cvtsi64_si32(s1);
2044     s1 = _mm_srli_si64(s1, 32);
2045 
2046     s2 = _mm_add_si64(w10, w12);
2047     s1 = _mm_add_si64(s1, s2);
2048     C[3] = _mm_cvtsi64_si32(s1);
2049     s1 = _mm_srli_si64(s1, 32);
2050 
2051     s3 = _mm_add_si64(x0, y0);
2052     s2 = _mm_add_si64(w14, w16);
2053     s1 = _mm_add_si64(s1, s3);
2054     s1 = _mm_add_si64(s1, s2);
2055     C[4] = _mm_cvtsi64_si32(s1);
2056     s1 = _mm_srli_si64(s1, 32);
2057 
2058     s3 = _mm_add_si64(x1, y1);
2059     s4 = _mm_add_si64(x4, y4);
2060     s1 = _mm_add_si64(s1, w18);
2061     s3 = _mm_add_si64(s3, s4);
2062     s1 = _mm_add_si64(s1, w20);
2063     s1 = _mm_add_si64(s1, s3);
2064     C[5] = _mm_cvtsi64_si32(s1);
2065     s1 = _mm_srli_si64(s1, 32);
2066 
2067     s3 = _mm_add_si64(x6, y6);
2068     s4 = _mm_add_si64(x8, y8);
2069     s1 = _mm_add_si64(s1, w22);
2070     s3 = _mm_add_si64(s3, s4);
2071     s1 = _mm_add_si64(s1, w26);
2072     s1 = _mm_add_si64(s1, s3);
2073     C[6] = _mm_cvtsi64_si32(s1);
2074     s1 = _mm_srli_si64(s1, 32);
2075 
2076     C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
2077     _mm_empty();
2078 }
2079 
2080 #endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
2081 
2082 // end optimized
2083 
2084 // ********************************************************
2085 
2086 #define A0      A
2087 #define A1      (A+N2)
2088 #define B0      B
2089 #define B1      (B+N2)
2090 
2091 #define T0      T
2092 #define T1      (T+N2)
2093 #define T2      (T+N)
2094 #define T3      (T+N+N2)
2095 
2096 #define R0      R
2097 #define R1      (R+N2)
2098 #define R2      (R+N)
2099 #define R3      (R+N+N2)
2100 
2101 //VC60 workaround: compiler bug triggered without the extra dummy parameters
2102 
2103 // R[2*N] - result = A*B
2104 // T[2*N] - temporary work space
2105 // A[N] --- multiplier
2106 // B[N] --- multiplicant
2107 
2108 
RecursiveMultiply(word * R,word * T,const word * A,const word * B,unsigned int N)2109 void RecursiveMultiply(word *R, word *T, const word *A, const word *B,
2110                        unsigned int N)
2111 {
2112     if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
2113         LowLevel::Multiply8(R, A, B);
2114     else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
2115         LowLevel::Multiply4(R, A, B);
2116     else if (N==2)
2117         LowLevel::Multiply2(R, A, B);
2118     else
2119     {
2120         const unsigned int N2 = N/2;
2121         int carry;
2122 
2123         int aComp = Compare(A0, A1, N2);
2124         int bComp = Compare(B0, B1, N2);
2125 
2126         switch (2*aComp + aComp + bComp)
2127         {
2128         case -4:
2129             LowLevel::Subtract(R0, A1, A0, N2);
2130             LowLevel::Subtract(R1, B0, B1, N2);
2131             RecursiveMultiply(T0, T2, R0, R1, N2);
2132             LowLevel::Subtract(T1, T1, R0, N2);
2133             carry = -1;
2134             break;
2135         case -2:
2136             LowLevel::Subtract(R0, A1, A0, N2);
2137             LowLevel::Subtract(R1, B0, B1, N2);
2138             RecursiveMultiply(T0, T2, R0, R1, N2);
2139             carry = 0;
2140             break;
2141         case 2:
2142             LowLevel::Subtract(R0, A0, A1, N2);
2143             LowLevel::Subtract(R1, B1, B0, N2);
2144             RecursiveMultiply(T0, T2, R0, R1, N2);
2145             carry = 0;
2146             break;
2147         case 4:
2148             LowLevel::Subtract(R0, A1, A0, N2);
2149             LowLevel::Subtract(R1, B0, B1, N2);
2150             RecursiveMultiply(T0, T2, R0, R1, N2);
2151             LowLevel::Subtract(T1, T1, R1, N2);
2152             carry = -1;
2153             break;
2154         default:
2155             SetWords(T0, 0, N);
2156             carry = 0;
2157         }
2158 
2159         RecursiveMultiply(R0, T2, A0, B0, N2);
2160         RecursiveMultiply(R2, T2, A1, B1, N2);
2161 
2162         // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1
2163 
2164         carry += LowLevel::Add(T0, T0, R0, N);
2165         carry += LowLevel::Add(T0, T0, R2, N);
2166         carry += LowLevel::Add(R1, R1, T0, N);
2167 
2168         Increment(R3, N2, carry);
2169     }
2170 }
2171 
2172 
RecursiveSquare(word * R,word * T,const word * A,unsigned int N)2173 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
2174 {
2175     if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
2176         LowLevel::Square4(R, A);
2177     else if (N==2)
2178         LowLevel::Square2(R, A);
2179     else
2180     {
2181         const unsigned int N2 = N/2;
2182 
2183         RecursiveSquare(R0, T2, A0, N2);
2184         RecursiveSquare(R2, T2, A1, N2);
2185         RecursiveMultiply(T0, T2, A0, A1, N2);
2186 
2187         word carry = LowLevel::Add(R1, R1, T0, N);
2188         carry += LowLevel::Add(R1, R1, T0, N);
2189         Increment(R3, N2, carry);
2190     }
2191 }
2192 
2193 
2194 // R[N] - bottom half of A*B
2195 // T[N] - temporary work space
2196 // A[N] - multiplier
2197 // B[N] - multiplicant
2198 
2199 
RecursiveMultiplyBottom(word * R,word * T,const word * A,const word * B,unsigned int N)2200 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B,
2201                              unsigned int N)
2202 {
2203     if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
2204         LowLevel::Multiply8Bottom(R, A, B);
2205     else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
2206         LowLevel::Multiply4Bottom(R, A, B);
2207     else if (N==2)
2208         LowLevel::Multiply2Bottom(R, A, B);
2209     else
2210     {
2211         const unsigned int N2 = N/2;
2212 
2213         RecursiveMultiply(R, T, A0, B0, N2);
2214         RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2215         LowLevel::Add(R1, R1, T0, N2);
2216         RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2217         LowLevel::Add(R1, R1, T0, N2);
2218     }
2219 }
2220 
2221 
RecursiveMultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,unsigned int N)2222 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A,
2223                           const word *B, unsigned int N)
2224 {
2225     if (N==4)
2226     {
2227         LowLevel::Multiply4(T, A, B);
2228         memcpy(R, T+4, 4*WORD_SIZE);
2229     }
2230     else if (N==2)
2231     {
2232         LowLevel::Multiply2(T, A, B);
2233         memcpy(R, T+2, 2*WORD_SIZE);
2234     }
2235     else
2236     {
2237         const unsigned int N2 = N/2;
2238         int carry;
2239 
2240         int aComp = Compare(A0, A1, N2);
2241         int bComp = Compare(B0, B1, N2);
2242 
2243         switch (2*aComp + aComp + bComp)
2244         {
2245         case -4:
2246             LowLevel::Subtract(R0, A1, A0, N2);
2247             LowLevel::Subtract(R1, B0, B1, N2);
2248             RecursiveMultiply(T0, T2, R0, R1, N2);
2249             LowLevel::Subtract(T1, T1, R0, N2);
2250             carry = -1;
2251             break;
2252         case -2:
2253             LowLevel::Subtract(R0, A1, A0, N2);
2254             LowLevel::Subtract(R1, B0, B1, N2);
2255             RecursiveMultiply(T0, T2, R0, R1, N2);
2256             carry = 0;
2257             break;
2258         case 2:
2259             LowLevel::Subtract(R0, A0, A1, N2);
2260             LowLevel::Subtract(R1, B1, B0, N2);
2261             RecursiveMultiply(T0, T2, R0, R1, N2);
2262             carry = 0;
2263             break;
2264         case 4:
2265             LowLevel::Subtract(R0, A1, A0, N2);
2266             LowLevel::Subtract(R1, B0, B1, N2);
2267             RecursiveMultiply(T0, T2, R0, R1, N2);
2268             LowLevel::Subtract(T1, T1, R1, N2);
2269             carry = -1;
2270             break;
2271         default:
2272             SetWords(T0, 0, N);
2273             carry = 0;
2274         }
2275 
2276         RecursiveMultiply(T2, R0, A1, B1, N2);
2277 
2278         // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
2279 
2280         word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
2281         c2 += LowLevel::Subtract(R0, R0, T0, N2);
2282         word t = (Compare(R0, T2, N2) == -1);
2283 
2284         carry += t;
2285         carry += Increment(R0, N2, c2+t);
2286         carry += LowLevel::Add(R0, R0, T1, N2);
2287         carry += LowLevel::Add(R0, R0, T3, N2);
2288 
2289         CopyWords(R1, T3, N2);
2290         Increment(R1, N2, carry);
2291     }
2292 }
2293 
2294 
Add(word * C,const word * A,const word * B,unsigned int N)2295 inline word Add(word *C, const word *A, const word *B, unsigned int N)
2296 {
2297     return LowLevel::Add(C, A, B, N);
2298 }
2299 
Subtract(word * C,const word * A,const word * B,unsigned int N)2300 inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
2301 {
2302     return LowLevel::Subtract(C, A, B, N);
2303 }
2304 
Multiply(word * R,word * T,const word * A,const word * B,unsigned int N)2305 inline void Multiply(word *R, word *T, const word *A, const word *B,
2306                      unsigned int N)
2307 {
2308     RecursiveMultiply(R, T, A, B, N);
2309 }
2310 
Square(word * R,word * T,const word * A,unsigned int N)2311 inline void Square(word *R, word *T, const word *A, unsigned int N)
2312 {
2313     RecursiveSquare(R, T, A, N);
2314 }
2315 
2316 
AsymmetricMultiply(word * R,word * T,const word * A,unsigned int NA,const word * B,unsigned int NB)2317 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA,
2318                         const word *B, unsigned int NB)
2319 {
2320     if (NA == NB)
2321     {
2322         if (A == B)
2323             Square(R, T, A, NA);
2324         else
2325             Multiply(R, T, A, B, NA);
2326 
2327         return;
2328     }
2329 
2330     if (NA > NB)
2331     {
2332         STL::swap(A, B);
2333         STL::swap(NA, NB);
2334     }
2335 
2336     if (NA==2 && !A[1])
2337     {
2338         switch (A[0])
2339         {
2340         case 0:
2341             SetWords(R, 0, NB+2);
2342             return;
2343         case 1:
2344             CopyWords(R, B, NB);
2345             R[NB] = R[NB+1] = 0;
2346             return;
2347         default:
2348             R[NB] = LinearMultiply(R, B, A[0], NB);
2349             R[NB+1] = 0;
2350             return;
2351         }
2352     }
2353 
2354     Multiply(R, T, A, B, NA);
2355     CopyWords(T+2*NA, R+NA, NA);
2356 
2357     unsigned i;
2358 
2359     for (i=2*NA; i<NB; i+=2*NA)
2360         Multiply(T+NA+i, T, A, B+i, NA);
2361     for (i=NA; i<NB; i+=2*NA)
2362         Multiply(R+i, T, A, B+i, NA);
2363 
2364     if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2365         Increment(R+NB, NA);
2366 }
2367 
2368 
PositiveMultiply(Integer & product,const Integer & a,const Integer & b)2369 void PositiveMultiply(Integer& product, const Integer& a, const Integer& b)
2370 {
2371     unsigned int aSize = RoundupSize(a.WordCount());
2372     unsigned int bSize = RoundupSize(b.WordCount());
2373 
2374     product.reg_.CleanNew(RoundupSize(aSize + bSize));
2375     product.sign_ = Integer::POSITIVE;
2376 
2377     AlignedWordBlock workspace(aSize + bSize);
2378     AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
2379                        a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
2380 }
2381 
Multiply(Integer & product,const Integer & a,const Integer & b)2382 void Multiply(Integer &product, const Integer &a, const Integer &b)
2383 {
2384     PositiveMultiply(product, a, b);
2385 
2386     if (a.NotNegative() != b.NotNegative())
2387         product.Negate();
2388 }
2389 
2390 
EvenWordCount(const word * X,unsigned int N)2391 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
2392 {
2393     while (N && X[N-2]==0 && X[N-1]==0)
2394         N-=2;
2395     return N;
2396 }
2397 
2398 
AlmostInverse(word * R,word * T,const word * A,unsigned int NA,const word * M,unsigned int N)2399 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA,
2400                            const word *M, unsigned int N)
2401 {
2402     word *b = T;
2403     word *c = T+N;
2404     word *f = T+2*N;
2405     word *g = T+3*N;
2406     unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
2407     unsigned int k=0, s=0;
2408 
2409     SetWords(T, 0, 3*N);
2410     b[0]=1;
2411     CopyWords(f, A, NA);
2412     CopyWords(g, M, N);
2413 
2414     while (1)
2415     {
2416         word t=f[0];
2417         while (!t)
2418         {
2419             if (EvenWordCount(f, fgLen)==0)
2420             {
2421                 SetWords(R, 0, N);
2422                 return 0;
2423             }
2424 
2425             ShiftWordsRightByWords(f, fgLen, 1);
2426             if (c[bcLen-1]) bcLen+=2;
2427             ShiftWordsLeftByWords(c, bcLen, 1);
2428             k+=WORD_BITS;
2429             t=f[0];
2430         }
2431 
2432         unsigned int i=0;
2433         while (t%2 == 0)
2434         {
2435             t>>=1;
2436             i++;
2437         }
2438         k+=i;
2439 
2440         if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2441         {
2442             if (s%2==0)
2443                 CopyWords(R, b, N);
2444             else
2445                 Subtract(R, M, b, N);
2446             return k;
2447         }
2448 
2449         ShiftWordsRightByBits(f, fgLen, i);
2450         t=ShiftWordsLeftByBits(c, bcLen, i);
2451         if (t)
2452         {
2453             c[bcLen] = t;
2454             bcLen+=2;
2455         }
2456 
2457         if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2458             fgLen-=2;
2459 
2460         if (Compare(f, g, fgLen)==-1)
2461         {
2462             STL::swap(f, g);
2463             STL::swap(b, c);
2464             s++;
2465         }
2466 
2467         Subtract(f, f, g, fgLen);
2468 
2469         if (Add(b, b, c, bcLen))
2470         {
2471             b[bcLen] = 1;
2472             bcLen+=2;
2473         }
2474     }
2475 }
2476 
2477 // R[N] - result = A/(2^k) mod M
2478 // A[N] - input
2479 // M[N] - modulus
2480 
DivideByPower2Mod(word * R,const word * A,unsigned int k,const word * M,unsigned int N)2481 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2482                        unsigned int N)
2483 {
2484     CopyWords(R, A, N);
2485 
2486     while (k--)
2487     {
2488         if (R[0]%2==0)
2489             ShiftWordsRightByBits(R, N, 1);
2490         else
2491         {
2492             word carry = Add(R, R, M, N);
2493             ShiftWordsRightByBits(R, N, 1);
2494             R[N-1] += carry<<(WORD_BITS-1);
2495         }
2496     }
2497 }
2498 
2499 // R[N] - result = A*(2^k) mod M
2500 // A[N] - input
2501 // M[N] - modulus
2502 
MultiplyByPower2Mod(word * R,const word * A,unsigned int k,const word * M,unsigned int N)2503 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2504                          unsigned int N)
2505 {
2506     CopyWords(R, A, N);
2507 
2508     while (k--)
2509         if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2510             Subtract(R, R, M, N);
2511 }
2512 
2513 
2514 // ********** end of integer needs
2515 
2516 
Integer()2517 Integer::Integer()
2518     : reg_(2), sign_(POSITIVE)
2519 {
2520     reg_[0] = reg_[1] = 0;
2521 }
2522 
2523 
Integer(const Integer & t)2524 Integer::Integer(const Integer& t)
2525     : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
2526 {
2527     CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2528 }
2529 
2530 
Integer(signed long value)2531 Integer::Integer(signed long value)
2532     : reg_(2)
2533 {
2534     if (value >= 0)
2535         sign_ = POSITIVE;
2536     else
2537     {
2538         sign_ = NEGATIVE;
2539         value = -value;
2540     }
2541     reg_[0] = word(value);
2542     reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
2543 }
2544 
2545 
Integer(Sign s,word high,word low)2546 Integer::Integer(Sign s, word high, word low)
2547     : reg_(2), sign_(s)
2548 {
2549     reg_[0] = low;
2550     reg_[1] = high;
2551 }
2552 
2553 
Integer(word value,unsigned int length)2554 Integer::Integer(word value, unsigned int length)
2555     : reg_(RoundupSize(length)), sign_(POSITIVE)
2556 {
2557     reg_[0] = value;
2558     SetWords(reg_ + 1, 0, reg_.size() - 1);
2559 }
2560 
2561 
Integer(const byte * encodedInteger,unsigned int byteCount,Signedness s)2562 Integer::Integer(const byte *encodedInteger, unsigned int byteCount,
2563                  Signedness s)
2564 {
2565     Decode(encodedInteger, byteCount, s);
2566 }
2567 
2568 class BadBER {};
2569 
2570 // BER Decode Source
Integer(Source & source)2571 Integer::Integer(Source& source)
2572     : reg_(2), sign_(POSITIVE)
2573 {
2574     Decode(source);
2575 }
2576 
Decode(Source & source)2577 void Integer::Decode(Source& source)
2578 {
2579     byte b = source.next();
2580     if (b != INTEGER) {
2581         source.SetError(INTEGER_E);
2582         return;
2583     }
2584 
2585     word32 length = GetLength(source);
2586     if (length == 0 || source.GetError().What()) return;
2587 
2588     if ( (b = source.next()) == 0x00)
2589         length--;
2590     else
2591         source.prev();
2592 
2593     if (source.IsLeft(length) == false) return;
2594 
2595     unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
2596     words = RoundupSize(words);
2597     if (words > reg_.size()) reg_.CleanNew(words);
2598 
2599     for (int j = length; j > 0; j--) {
2600         b = source.next();
2601         reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8;
2602     }
2603 }
2604 
2605 
Decode(const byte * input,unsigned int inputLen,Signedness s)2606 void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s)
2607 {
2608     unsigned int idx(0);
2609     byte b = 0;
2610     if (inputLen>0)
2611         b = input[idx];   // peek
2612     sign_  = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
2613 
2614     while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
2615     {
2616         idx++;   // skip
2617         if (--inputLen>0)
2618             b = input[idx];  // peek
2619     }
2620 
2621     reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
2622 
2623     for (unsigned int i=inputLen; i > 0; i--)
2624     {
2625         b = input[idx++];
2626         reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8;
2627     }
2628 
2629     if (sign_ == NEGATIVE)
2630     {
2631         for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
2632             reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8;
2633         TwosComplement(reg_.get_buffer(), reg_.size());
2634     }
2635 }
2636 
2637 
Encode(byte * output,unsigned int outputLen,Signedness signedness) const2638 unsigned int Integer::Encode(byte* output, unsigned int outputLen,
2639                        Signedness signedness) const
2640 {
2641     unsigned int idx(0);
2642     if (signedness == UNSIGNED || NotNegative())
2643     {
2644         for (unsigned int i=outputLen; i > 0; i--)
2645             output[idx++] = GetByte(i-1);
2646     }
2647     else
2648     {
2649         // take two's complement of *this
2650         Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this;
2651         for (unsigned i=0; i<outputLen; i++)
2652             output[idx++] = temp.GetByte(outputLen-i-1);
2653     }
2654     return outputLen;
2655 }
2656 
2657 
2658 static Integer* zero = 0;
2659 
Zero()2660 const Integer &Integer::Zero()
2661 {
2662     if (!zero)
2663         zero = NEW_TC Integer;
2664     return *zero;
2665 }
2666 
2667 
2668 static Integer* one = 0;
2669 
One()2670 const Integer &Integer::One()
2671 {
2672     if (!one)
2673         one = NEW_TC Integer(1,2);
2674     return *one;
2675 }
2676 
2677 
2678 // Clean up static singleton holders, not a leak, but helpful to have gone
2679 // when checking for leaks
CleanUp()2680 void CleanUp()
2681 {
2682     tcDelete(one);
2683     tcDelete(zero);
2684 
2685     // In case user calls more than once, prevent seg fault
2686     one  = 0;
2687     zero = 0;
2688 }
2689 
Integer(RandomNumberGenerator & rng,const Integer & min,const Integer & max)2690 Integer::Integer(RandomNumberGenerator& rng, const Integer& min,
2691                  const Integer& max)
2692 {
2693     Randomize(rng, min, max);
2694 }
2695 
2696 
Randomize(RandomNumberGenerator & rng,unsigned int nbits)2697 void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits)
2698 {
2699     const unsigned int nbytes = nbits/8 + 1;
2700     ByteBlock buf(nbytes);
2701     rng.GenerateBlock(buf.get_buffer(), nbytes);
2702     if (nbytes)
2703         buf[0] = (byte)Crop(buf[0], nbits % 8);
2704     Decode(buf.get_buffer(), nbytes, UNSIGNED);
2705 }
2706 
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max)2707 void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min,
2708                         const Integer& max)
2709 {
2710     Integer range = max - min;
2711     const unsigned int nbits = range.BitCount();
2712 
2713     do
2714     {
2715         Randomize(rng, nbits);
2716     }
2717     while (*this > range);
2718 
2719     *this += min;
2720 }
2721 
2722 
Power2(unsigned int e)2723 Integer Integer::Power2(unsigned int e)
2724 {
2725     Integer r((word)0, BitsToWords(e + 1));
2726     r.SetBit(e);
2727     return r;
2728 }
2729 
2730 
SetBit(unsigned int n,bool value)2731 void Integer::SetBit(unsigned int n, bool value)
2732 {
2733     if (value)
2734     {
2735         reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
2736         reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS));
2737     }
2738     else
2739     {
2740         if (n / WORD_BITS < reg_.size())
2741             reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS));
2742     }
2743 }
2744 
2745 
SetByte(unsigned int n,byte value)2746 void Integer::SetByte(unsigned int n, byte value)
2747 {
2748     reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
2749     reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2750     reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2751 }
2752 
2753 
Negate()2754 void Integer::Negate()
2755 {
2756     if (!!(*this))	// don't flip sign if *this==0
2757         sign_ = Sign(1 - sign_);
2758 }
2759 
2760 
operator !() const2761 bool Integer::operator!() const
2762 {
2763     return IsNegative() ? false : (reg_[0]==0 && WordCount()==0);
2764 }
2765 
2766 
operator =(const Integer & t)2767 Integer& Integer::operator=(const Integer& t)
2768 {
2769     if (this != &t)
2770     {
2771         reg_.New(RoundupSize(t.WordCount()));
2772         CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2773         sign_ = t.sign_;
2774     }
2775     return *this;
2776 }
2777 
2778 
operator +=(const Integer & t)2779 Integer& Integer::operator+=(const Integer& t)
2780 {
2781     reg_.CleanGrow(t.reg_.size());
2782     if (NotNegative())
2783     {
2784         if (t.NotNegative())
2785             PositiveAdd(*this, *this, t);
2786         else
2787             PositiveSubtract(*this, *this, t);
2788     }
2789     else
2790     {
2791         if (t.NotNegative())
2792             PositiveSubtract(*this, t, *this);
2793         else
2794         {
2795             PositiveAdd(*this, *this, t);
2796             sign_ = Integer::NEGATIVE;
2797         }
2798     }
2799     return *this;
2800 }
2801 
2802 
operator -() const2803 Integer Integer::operator-() const
2804 {
2805     Integer result(*this);
2806     result.Negate();
2807     return result;
2808 }
2809 
2810 
operator -=(const Integer & t)2811 Integer& Integer::operator-=(const Integer& t)
2812 {
2813     reg_.CleanGrow(t.reg_.size());
2814     if (NotNegative())
2815     {
2816         if (t.NotNegative())
2817             PositiveSubtract(*this, *this, t);
2818         else
2819             PositiveAdd(*this, *this, t);
2820     }
2821     else
2822     {
2823         if (t.NotNegative())
2824         {
2825             PositiveAdd(*this, *this, t);
2826             sign_ = Integer::NEGATIVE;
2827         }
2828         else
2829             PositiveSubtract(*this, t, *this);
2830     }
2831     return *this;
2832 }
2833 
2834 
operator ++()2835 Integer& Integer::operator++()
2836 {
2837     if (NotNegative())
2838     {
2839         if (Increment(reg_.get_buffer(), reg_.size()))
2840         {
2841             reg_.CleanGrow(2*reg_.size());
2842             reg_[reg_.size()/2]=1;
2843         }
2844     }
2845     else
2846     {
2847         word borrow = Decrement(reg_.get_buffer(), reg_.size());
2848         (void)borrow;           // shut up compiler
2849         if (WordCount()==0)
2850             *this = Zero();
2851     }
2852     return *this;
2853 }
2854 
operator --()2855 Integer& Integer::operator--()
2856 {
2857     if (IsNegative())
2858     {
2859         if (Increment(reg_.get_buffer(), reg_.size()))
2860         {
2861             reg_.CleanGrow(2*reg_.size());
2862             reg_[reg_.size()/2]=1;
2863         }
2864     }
2865     else
2866     {
2867         if (Decrement(reg_.get_buffer(), reg_.size()))
2868             *this = -One();
2869     }
2870     return *this;
2871 }
2872 
2873 
operator <<=(unsigned int n)2874 Integer& Integer::operator<<=(unsigned int n)
2875 {
2876     const unsigned int wordCount = WordCount();
2877     const unsigned int shiftWords = n / WORD_BITS;
2878     const unsigned int shiftBits = n % WORD_BITS;
2879 
2880     reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
2881     ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
2882                           shiftWords);
2883     ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
2884                          shiftBits);
2885     return *this;
2886 }
2887 
operator >>=(unsigned int n)2888 Integer& Integer::operator>>=(unsigned int n)
2889 {
2890     const unsigned int wordCount = WordCount();
2891     const unsigned int shiftWords = n / WORD_BITS;
2892     const unsigned int shiftBits = n % WORD_BITS;
2893 
2894     ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
2895     if (wordCount > shiftWords)
2896         ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
2897                               shiftBits);
2898     if (IsNegative() && WordCount()==0)   // avoid -0
2899         *this = Zero();
2900     return *this;
2901 }
2902 
2903 
PositiveAdd(Integer & sum,const Integer & a,const Integer & b)2904 void PositiveAdd(Integer& sum, const Integer& a, const Integer& b)
2905 {
2906     word carry;
2907     if (a.reg_.size() == b.reg_.size())
2908         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2909                     b.reg_.get_buffer(), a.reg_.size());
2910     else if (a.reg_.size() > b.reg_.size())
2911     {
2912         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2913                     b.reg_.get_buffer(), b.reg_.size());
2914         CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
2915                   a.reg_.size()-b.reg_.size());
2916         carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
2917                           carry);
2918     }
2919     else
2920     {
2921         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2922                     b.reg_.get_buffer(), a.reg_.size());
2923         CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
2924                   b.reg_.size()-a.reg_.size());
2925         carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
2926                           carry);
2927     }
2928 
2929     if (carry)
2930     {
2931         sum.reg_.CleanGrow(2*sum.reg_.size());
2932         sum.reg_[sum.reg_.size()/2] = 1;
2933     }
2934     sum.sign_ = Integer::POSITIVE;
2935 }
2936 
PositiveSubtract(Integer & diff,const Integer & a,const Integer & b)2937 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
2938 {
2939     unsigned aSize = a.WordCount();
2940     aSize += aSize%2;
2941     unsigned bSize = b.WordCount();
2942     bSize += bSize%2;
2943 
2944     if (aSize == bSize)
2945     {
2946         if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
2947         {
2948             Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2949                      b.reg_.get_buffer(), aSize);
2950             diff.sign_ = Integer::POSITIVE;
2951         }
2952         else
2953         {
2954             Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2955                      a.reg_.get_buffer(), aSize);
2956             diff.sign_ = Integer::NEGATIVE;
2957         }
2958     }
2959     else if (aSize > bSize)
2960     {
2961         word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2962                                b.reg_.get_buffer(), bSize);
2963         CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
2964         borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
2965         diff.sign_ = Integer::POSITIVE;
2966     }
2967     else
2968     {
2969         word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2970                                a.reg_.get_buffer(), aSize);
2971         CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
2972         borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
2973         diff.sign_ = Integer::NEGATIVE;
2974     }
2975 }
2976 
2977 
MinEncodedSize(Signedness signedness) const2978 unsigned int Integer::MinEncodedSize(Signedness signedness) const
2979 {
2980     unsigned int outputLen = max(1U, ByteCount());
2981     if (signedness == UNSIGNED)
2982         return outputLen;
2983     if (NotNegative() && (GetByte(outputLen-1) & 0x80))
2984         outputLen++;
2985     if (IsNegative() && *this < -Power2(outputLen*8-1))
2986         outputLen++;
2987     return outputLen;
2988 }
2989 
2990 
Compare(const Integer & t) const2991 int Integer::Compare(const Integer& t) const
2992 {
2993     if (NotNegative())
2994     {
2995         if (t.NotNegative())
2996             return PositiveCompare(t);
2997         else
2998             return 1;
2999     }
3000     else
3001     {
3002         if (t.NotNegative())
3003             return -1;
3004         else
3005             return -PositiveCompare(t);
3006     }
3007 }
3008 
3009 
PositiveCompare(const Integer & t) const3010 int Integer::PositiveCompare(const Integer& t) const
3011 {
3012     unsigned size = WordCount(), tSize = t.WordCount();
3013 
3014     if (size == tSize)
3015         return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size);
3016     else
3017         return size > tSize ? 1 : -1;
3018 }
3019 
3020 
GetBit(unsigned int n) const3021 bool Integer::GetBit(unsigned int n) const
3022 {
3023     if (n/WORD_BITS >= reg_.size())
3024         return 0;
3025     else
3026         return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3027 }
3028 
3029 
GetBits(unsigned int i,unsigned int n) const3030 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
3031 {
3032     unsigned long v = 0;
3033     for (unsigned int j=0; j<n; j++)
3034         v |= GetBit(i+j) << j;
3035     return v;
3036 }
3037 
3038 
GetByte(unsigned int n) const3039 byte Integer::GetByte(unsigned int n) const
3040 {
3041     if (n/WORD_SIZE >= reg_.size())
3042         return 0;
3043     else
3044         return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3045 }
3046 
3047 
BitCount() const3048 unsigned int Integer::BitCount() const
3049 {
3050     unsigned wordCount = WordCount();
3051     if (wordCount)
3052         return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
3053     else
3054         return 0;
3055 }
3056 
3057 
ByteCount() const3058 unsigned int Integer::ByteCount() const
3059 {
3060     unsigned wordCount = WordCount();
3061     if (wordCount)
3062         return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
3063     else
3064         return 0;
3065 }
3066 
3067 
WordCount() const3068 unsigned int Integer::WordCount() const
3069 {
3070     return CountWords(reg_.get_buffer(), reg_.size());
3071 }
3072 
3073 
IsConvertableToLong() const3074 bool Integer::IsConvertableToLong() const
3075 {
3076     if (ByteCount() > sizeof(long))
3077         return false;
3078 
3079     unsigned long value = reg_[0];
3080     value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3081 
3082     if (sign_ == POSITIVE)
3083         return (signed long)value >= 0;
3084     else
3085         return -(signed long)value < 0;
3086 }
3087 
3088 
ConvertToLong() const3089 signed long Integer::ConvertToLong() const
3090 {
3091     unsigned long value = reg_[0];
3092     value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3093     return sign_ == POSITIVE ? value : -(signed long)value;
3094 }
3095 
3096 
Swap(Integer & a)3097 void Integer::Swap(Integer& a)
3098 {
3099     reg_.Swap(a.reg_);
3100     STL::swap(sign_, a.sign_);
3101 }
3102 
3103 
Plus(const Integer & b) const3104 Integer Integer::Plus(const Integer& b) const
3105 {
3106     Integer sum((word)0, max(reg_.size(), b.reg_.size()));
3107     if (NotNegative())
3108     {
3109         if (b.NotNegative())
3110             PositiveAdd(sum, *this, b);
3111         else
3112             PositiveSubtract(sum, *this, b);
3113     }
3114     else
3115     {
3116         if (b.NotNegative())
3117             PositiveSubtract(sum, b, *this);
3118         else
3119         {
3120             PositiveAdd(sum, *this, b);
3121             sum.sign_ = Integer::NEGATIVE;
3122         }
3123     }
3124     return sum;
3125 }
3126 
3127 
Minus(const Integer & b) const3128 Integer Integer::Minus(const Integer& b) const
3129 {
3130     Integer diff((word)0, max(reg_.size(), b.reg_.size()));
3131     if (NotNegative())
3132     {
3133         if (b.NotNegative())
3134             PositiveSubtract(diff, *this, b);
3135         else
3136             PositiveAdd(diff, *this, b);
3137     }
3138     else
3139     {
3140         if (b.NotNegative())
3141         {
3142             PositiveAdd(diff, *this, b);
3143             diff.sign_ = Integer::NEGATIVE;
3144         }
3145         else
3146             PositiveSubtract(diff, b, *this);
3147     }
3148     return diff;
3149 }
3150 
3151 
Times(const Integer & b) const3152 Integer Integer::Times(const Integer &b) const
3153 {
3154     Integer product;
3155     Multiply(product, *this, b);
3156     return product;
3157 }
3158 
3159 
3160 #undef A0
3161 #undef A1
3162 #undef B0
3163 #undef B1
3164 
3165 #undef T0
3166 #undef T1
3167 #undef T2
3168 #undef T3
3169 
3170 #undef R0
3171 #undef R1
3172 #undef R2
3173 #undef R3
3174 
3175 
AtomicDivide(word * Q,const word * A,const word * B)3176 static inline void AtomicDivide(word *Q, const word *A, const word *B)
3177 {
3178     word T[4];
3179     DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
3180                                          DWord(A[2], A[3]), DWord(B[0], B[1]));
3181     Q[0] = q.GetLowHalf();
3182     Q[1] = q.GetHighHalf();
3183 
3184 #ifndef NDEBUG
3185     if (B[0] || B[1])
3186     {
3187         // multiply quotient and divisor and add remainder, make sure it
3188         // equals dividend
3189         word P[4];
3190         Portable::Multiply2(P, Q, B);
3191         Add(P, P, T, 4);
3192     }
3193 #endif
3194 }
3195 
3196 
3197 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
CorrectQuotientEstimate(word * R,word * T,word * Q,const word * B,unsigned int N)3198 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B,
3199                                     unsigned int N)
3200 {
3201     if (Q[1])
3202     {
3203         T[N] = T[N+1] = 0;
3204         unsigned i;
3205         for (i=0; i<N; i+=4)
3206             LowLevel::Multiply2(T+i, Q, B+i);
3207         for (i=2; i<N; i+=4)
3208             if (LowLevel::Multiply2Add(T+i, Q, B+i))
3209                 T[i+5] += (++T[i+4]==0);
3210     }
3211     else
3212     {
3213         T[N] = LinearMultiply(T, B, Q[0], N);
3214         T[N+1] = 0;
3215     }
3216 
3217     word borrow = Subtract(R, R, T, N+2);
3218     (void)borrow;       // shut up compiler
3219 
3220     while (R[N] || Compare(R, B, N) >= 0)
3221     {
3222         R[N] -= Subtract(R, R, B, N);
3223         Q[1] += (++Q[0]==0);
3224     }
3225 }
3226 
3227 // R[NB] -------- remainder = A%B
3228 // Q[NA-NB+2] --- quotient	= A/B
3229 // T[NA+2*NB+4] - temp work space
3230 // A[NA] -------- dividend
3231 // B[NB] -------- divisor
3232 
3233 
Divide(word * R,word * Q,word * T,const word * A,unsigned int NA,const word * B,unsigned int NB)3234 void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA,
3235             const word* B, unsigned int NB)
3236 {
3237     // set up temporary work space
3238     word *const TA=T;
3239     word *const TB=T+NA+2;
3240     word *const TP=T+NA+2+NB;
3241 
3242     // copy B into TB and normalize it so that TB has highest bit set to 1
3243     unsigned shiftWords = (B[NB-1]==0);
3244     TB[0] = TB[NB-1] = 0;
3245     CopyWords(TB+shiftWords, B, NB-shiftWords);
3246     unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
3247     ShiftWordsLeftByBits(TB, NB, shiftBits);
3248 
3249     // copy A into TA and normalize it
3250     TA[0] = TA[NA] = TA[NA+1] = 0;
3251     CopyWords(TA+shiftWords, A, NA);
3252     ShiftWordsLeftByBits(TA, NA+2, shiftBits);
3253 
3254     if (TA[NA+1]==0 && TA[NA] <= 1)
3255     {
3256         Q[NA-NB+1] = Q[NA-NB] = 0;
3257         while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
3258         {
3259             TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
3260             ++Q[NA-NB];
3261         }
3262     }
3263     else
3264     {
3265         NA+=2;
3266     }
3267 
3268     word BT[2];
3269     BT[0] = TB[NB-2] + 1;
3270     BT[1] = TB[NB-1] + (BT[0]==0);
3271 
3272     // start reducing TA mod TB, 2 words at a time
3273     for (unsigned i=NA-2; i>=NB; i-=2)
3274     {
3275         AtomicDivide(Q+i-NB, TA+i-2, BT);
3276         CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
3277     }
3278 
3279     // copy TA into R, and denormalize it
3280     CopyWords(R, TA+shiftWords, NB);
3281     ShiftWordsRightByBits(R, NB, shiftBits);
3282 }
3283 
3284 
PositiveDivide(Integer & remainder,Integer & quotient,const Integer & a,const Integer & b)3285 void PositiveDivide(Integer& remainder, Integer& quotient,
3286                    const Integer& a, const Integer& b)
3287 {
3288     unsigned aSize = a.WordCount();
3289     unsigned bSize = b.WordCount();
3290 
3291     if (a.PositiveCompare(b) == -1)
3292     {
3293         remainder = a;
3294         remainder.sign_ = Integer::POSITIVE;
3295         quotient = Integer::Zero();
3296         return;
3297     }
3298 
3299     aSize += aSize%2;	// round up to next even number
3300     bSize += bSize%2;
3301 
3302     remainder.reg_.CleanNew(RoundupSize(bSize));
3303     remainder.sign_ = Integer::POSITIVE;
3304     quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
3305     quotient.sign_ = Integer::POSITIVE;
3306 
3307     AlignedWordBlock T(aSize+2*bSize+4);
3308     Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
3309            T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
3310            bSize);
3311 }
3312 
Divide(Integer & remainder,Integer & quotient,const Integer & dividend,const Integer & divisor)3313 void Integer::Divide(Integer &remainder, Integer &quotient,
3314                      const Integer &dividend, const Integer &divisor)
3315 {
3316     PositiveDivide(remainder, quotient, dividend, divisor);
3317 
3318     if (dividend.IsNegative())
3319     {
3320         quotient.Negate();
3321         if (remainder.NotZero())
3322         {
3323             --quotient;
3324             remainder = divisor.AbsoluteValue() - remainder;
3325         }
3326     }
3327 
3328     if (divisor.IsNegative())
3329         quotient.Negate();
3330 }
3331 
DivideByPowerOf2(Integer & r,Integer & q,const Integer & a,unsigned int n)3332 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a,
3333                                unsigned int n)
3334 {
3335     q = a;
3336     q >>= n;
3337 
3338     const unsigned int wordCount = BitsToWords(n);
3339     if (wordCount <= a.WordCount())
3340     {
3341         r.reg_.resize(RoundupSize(wordCount));
3342         CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
3343         SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
3344         if (n % WORD_BITS != 0)
3345           r.reg_[wordCount-1] %= (word(1) << (n % WORD_BITS));
3346     }
3347     else
3348     {
3349         r.reg_.resize(RoundupSize(a.WordCount()));
3350         CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
3351     }
3352     r.sign_ = POSITIVE;
3353 
3354     if (a.IsNegative() && r.NotZero())
3355     {
3356         --q;
3357         r = Power2(n) - r;
3358     }
3359 }
3360 
DividedBy(const Integer & b) const3361 Integer Integer::DividedBy(const Integer &b) const
3362 {
3363     Integer remainder, quotient;
3364     Integer::Divide(remainder, quotient, *this, b);
3365     return quotient;
3366 }
3367 
Modulo(const Integer & b) const3368 Integer Integer::Modulo(const Integer &b) const
3369 {
3370     Integer remainder, quotient;
3371     Integer::Divide(remainder, quotient, *this, b);
3372     return remainder;
3373 }
3374 
Divide(word & remainder,Integer & quotient,const Integer & dividend,word divisor)3375 void Integer::Divide(word &remainder, Integer &quotient,
3376                      const Integer &dividend, word divisor)
3377 {
3378     if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3379     {
3380         quotient = dividend >> (BitPrecision(divisor)-1);
3381         remainder = dividend.reg_[0] & (divisor-1);
3382         return;
3383     }
3384 
3385     unsigned int i = dividend.WordCount();
3386     quotient.reg_.CleanNew(RoundupSize(i));
3387     remainder = 0;
3388     while (i--)
3389     {
3390         quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor;
3391         remainder = DWord(dividend.reg_[i], remainder) % divisor;
3392     }
3393 
3394     if (dividend.NotNegative())
3395         quotient.sign_ = POSITIVE;
3396     else
3397     {
3398         quotient.sign_ = NEGATIVE;
3399         if (remainder)
3400         {
3401             --quotient;
3402             remainder = divisor - remainder;
3403         }
3404     }
3405 }
3406 
DividedBy(word b) const3407 Integer Integer::DividedBy(word b) const
3408 {
3409     word remainder;
3410     Integer quotient;
3411     Integer::Divide(remainder, quotient, *this, b);
3412     return quotient;
3413 }
3414 
Modulo(word divisor) const3415 word Integer::Modulo(word divisor) const
3416 {
3417     word remainder;
3418 
3419     if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3420         remainder = reg_[0] & (divisor-1);
3421     else
3422     {
3423         unsigned int i = WordCount();
3424 
3425         if (divisor <= 5)
3426         {
3427             DWord sum(0, 0);
3428             while (i--)
3429                 sum += reg_[i];
3430             remainder = sum % divisor;
3431         }
3432         else
3433         {
3434             remainder = 0;
3435             while (i--)
3436                 remainder = DWord(reg_[i], remainder) % divisor;
3437         }
3438     }
3439 
3440     if (IsNegative() && remainder)
3441         remainder = divisor - remainder;
3442 
3443     return remainder;
3444 }
3445 
3446 
AbsoluteValue() const3447 Integer Integer::AbsoluteValue() const
3448 {
3449     Integer result(*this);
3450     result.sign_ = POSITIVE;
3451     return result;
3452 }
3453 
3454 
SquareRoot() const3455 Integer Integer::SquareRoot() const
3456 {
3457     if (!IsPositive())
3458         return Zero();
3459 
3460     // overestimate square root
3461     Integer x, y = Power2((BitCount()+1)/2);
3462 
3463     do
3464     {
3465         x = y;
3466         y = (x + *this/x) >> 1;
3467     } while (y<x);
3468 
3469     return x;
3470 }
3471 
IsSquare() const3472 bool Integer::IsSquare() const
3473 {
3474     Integer r = SquareRoot();
3475     return *this == r.Squared();
3476 }
3477 
IsUnit() const3478 bool Integer::IsUnit() const
3479 {
3480     return (WordCount() == 1) && (reg_[0] == 1);
3481 }
3482 
MultiplicativeInverse() const3483 Integer Integer::MultiplicativeInverse() const
3484 {
3485     return IsUnit() ? *this : Zero();
3486 }
3487 
a_times_b_mod_c(const Integer & x,const Integer & y,const Integer & m)3488 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3489 {
3490     return x*y%m;
3491 }
3492 
a_exp_b_mod_c(const Integer & x,const Integer & e,const Integer & m)3493 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3494 {
3495     ModularArithmetic mr(m);
3496     return mr.Exponentiate(x, e);
3497 }
3498 
Gcd(const Integer & a,const Integer & b)3499 Integer Integer::Gcd(const Integer &a, const Integer &b)
3500 {
3501     return EuclideanDomainOf().Gcd(a, b);
3502 }
3503 
InverseMod(const Integer & m) const3504 Integer Integer::InverseMod(const Integer &m) const
3505 {
3506     if (IsNegative() || *this>=m)
3507         return (*this%m).InverseMod(m);
3508 
3509     if (m.IsEven())
3510     {
3511         if (!m || IsEven())
3512             return Zero();	// no inverse
3513         if (*this == One())
3514             return One();
3515 
3516         Integer u = m.InverseMod(*this);
3517         return !u ? Zero() : (m*(*this-u)+1)/(*this);
3518     }
3519 
3520     AlignedWordBlock T(m.reg_.size() * 4);
3521     Integer r((word)0, m.reg_.size());
3522     unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
3523                                reg_.get_buffer(), reg_.size(),
3524                                m.reg_.get_buffer(), m.reg_.size());
3525     DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
3526                       m.reg_.get_buffer(), m.reg_.size());
3527     return r;
3528 }
3529 
InverseMod(const word mod) const3530 word Integer::InverseMod(const word mod) const
3531 {
3532     word g0 = mod, g1 = *this % mod;
3533     word v0 = 0, v1 = 1;
3534     word y;
3535 
3536     while (g1)
3537     {
3538         if (g1 == 1)
3539             return v1;
3540         y = g0 / g1;
3541         g0 = g0 % g1;
3542         v0 += y * v1;
3543 
3544         if (!g0)
3545             break;
3546         if (g0 == 1)
3547             return mod-v0;
3548         y = g1 / g0;
3549         g1 = g1 % g0;
3550         v1 += y * v0;
3551     }
3552     return 0;
3553 }
3554 
3555 // ********* ModArith stuff
3556 
Half(const Integer & a) const3557 const Integer& ModularArithmetic::Half(const Integer &a) const
3558 {
3559     if (a.reg_.size()==modulus.reg_.size())
3560     {
3561         TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
3562                                     modulus.reg_.begin(), a.reg_.size());
3563         return result;
3564     }
3565     else
3566         return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
3567 }
3568 
Add(const Integer & a,const Integer & b) const3569 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
3570 {
3571     if (a.reg_.size()==modulus.reg_.size() &&
3572         b.reg_.size()==modulus.reg_.size())
3573     {
3574         if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
3575                           a.reg_.size())
3576             || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
3577                        a.reg_.size()) >= 0)
3578         {
3579             TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3580                                modulus.reg_.begin(), a.reg_.size());
3581         }
3582         return result;
3583     }
3584     else
3585     {
3586         result1 = a+b;
3587         if (result1 >= modulus)
3588             result1 -= modulus;
3589         return result1;
3590     }
3591 }
3592 
Accumulate(Integer & a,const Integer & b) const3593 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
3594 {
3595     if (a.reg_.size()==modulus.reg_.size() &&
3596         b.reg_.size()==modulus.reg_.size())
3597     {
3598         if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3599                           b.reg_.get_buffer(), a.reg_.size())
3600             || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
3601                        a.reg_.size()) >= 0)
3602         {
3603             TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3604                                modulus.reg_.get_buffer(), a.reg_.size());
3605         }
3606     }
3607     else
3608     {
3609         a+=b;
3610         if (a>=modulus)
3611             a-=modulus;
3612     }
3613 
3614     return a;
3615 }
3616 
Subtract(const Integer & a,const Integer & b) const3617 const Integer& ModularArithmetic::Subtract(const Integer &a,
3618                                            const Integer &b) const
3619 {
3620     if (a.reg_.size()==modulus.reg_.size() &&
3621         b.reg_.size()==modulus.reg_.size())
3622     {
3623         if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
3624                                b.reg_.begin(), a.reg_.size()))
3625             TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
3626                           modulus.reg_.begin(), a.reg_.size());
3627         return result;
3628     }
3629     else
3630     {
3631         result1 = a-b;
3632         if (result1.IsNegative())
3633             result1 += modulus;
3634         return result1;
3635     }
3636 }
3637 
Reduce(Integer & a,const Integer & b) const3638 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
3639 {
3640     if (a.reg_.size()==modulus.reg_.size() &&
3641         b.reg_.size()==modulus.reg_.size())
3642     {
3643         if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3644                                b.reg_.get_buffer(), a.reg_.size()))
3645             TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3646                           modulus.reg_.get_buffer(), a.reg_.size());
3647     }
3648     else
3649     {
3650         a-=b;
3651         if (a.IsNegative())
3652             a+=modulus;
3653     }
3654 
3655     return a;
3656 }
3657 
Inverse(const Integer & a) const3658 const Integer& ModularArithmetic::Inverse(const Integer &a) const
3659 {
3660     if (!a)
3661         return a;
3662 
3663     CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
3664     if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3665                            a.reg_.begin(), a.reg_.size()))
3666         Decrement(result.reg_.begin()+a.reg_.size(), 1,
3667                   modulus.reg_.size()-a.reg_.size());
3668 
3669     return result;
3670 }
3671 
CascadeExponentiate(const Integer & x,const Integer & e1,const Integer & y,const Integer & e2) const3672 Integer ModularArithmetic::CascadeExponentiate(const Integer &x,
3673                   const Integer &e1, const Integer &y, const Integer &e2) const
3674 {
3675     if (modulus.IsOdd())
3676     {
3677         MontgomeryRepresentation dr(modulus);
3678         return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
3679                                                     dr.ConvertIn(y), e2));
3680     }
3681     else
3682         return AbstractRing::CascadeExponentiate(x, e1, y, e2);
3683 }
3684 
SimultaneousExponentiate(Integer * results,const Integer & base,const Integer * exponents,unsigned int exponentsCount) const3685 void ModularArithmetic::SimultaneousExponentiate(Integer *results,
3686         const Integer &base, const Integer *exponents,
3687         unsigned int exponentsCount) const
3688 {
3689     if (modulus.IsOdd())
3690     {
3691         MontgomeryRepresentation dr(modulus);
3692         dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
3693                                     exponentsCount);
3694         for (unsigned int i=0; i<exponentsCount; i++)
3695             results[i] = dr.ConvertOut(results[i]);
3696     }
3697     else
3698         AbstractRing::SimultaneousExponentiate(results, base,
3699                                                     exponents, exponentsCount);
3700 }
3701 
3702 
3703 // ********************************************************
3704 
3705 #define A0      A
3706 #define A1      (A+N2)
3707 #define B0      B
3708 #define B1      (B+N2)
3709 
3710 #define T0      T
3711 #define T1      (T+N2)
3712 #define T2      (T+N)
3713 #define T3      (T+N+N2)
3714 
3715 #define R0      R
3716 #define R1      (R+N2)
3717 #define R2      (R+N)
3718 #define R3      (R+N+N2)
3719 
3720 
MultiplyBottom(word * R,word * T,const word * A,const word * B,unsigned int N)3721 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B,
3722                            unsigned int N)
3723 {
3724     RecursiveMultiplyBottom(R, T, A, B, N);
3725 }
3726 
MultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,unsigned int N)3727 inline void MultiplyTop(word *R, word *T, const word *L, const word *A,
3728                         const word *B, unsigned int N)
3729 {
3730     RecursiveMultiplyTop(R, T, L, A, B, N);
3731 }
3732 
3733 
3734 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
3735 // T[3*N] - temporary work space
3736 // X[2*N] - number to be reduced
3737 // M[N] --- modulus
3738 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
3739 
MontgomeryReduce(word * R,word * T,const word * X,const word * M,const word * U,unsigned int N)3740 void MontgomeryReduce(word *R, word *T, const word *X, const word *M,
3741                       const word *U, unsigned int N)
3742 {
3743     MultiplyBottom(R, T, X, U, N);
3744     MultiplyTop(T, T+N, X, R, M, N);
3745     word borrow = Subtract(T, X+N, T, N);
3746     // defend against timing attack by doing this Add even when not needed
3747     word carry = Add(T+N, T, M, N);
3748     (void)carry;            // shut up compiler
3749     CopyWords(R, T + (borrow ? N : 0), N);
3750 }
3751 
3752 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
3753 // T[3*N/2] - temporary work space
3754 // A[N] ----- an odd number as input
3755 
RecursiveInverseModPower2(word * R,word * T,const word * A,unsigned int N)3756 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
3757 {
3758     if (N==2)
3759     {
3760         T[0] = AtomicInverseModPower2(A[0]);
3761         T[1] = 0;
3762         LowLevel::Multiply2Bottom(T+2, T, A);
3763         TwosComplement(T+2, 2);
3764         Increment(T+2, 2, 2);
3765         LowLevel::Multiply2Bottom(R, T, T+2);
3766     }
3767     else
3768     {
3769         const unsigned int N2 = N/2;
3770         RecursiveInverseModPower2(R0, T0, A0, N2);
3771         T0[0] = 1;
3772         SetWords(T0+1, 0, N2-1);
3773         MultiplyTop(R1, T1, T0, R0, A0, N2);
3774         MultiplyBottom(T0, T1, R0, A1, N2);
3775         Add(T0, R1, T0, N2);
3776         TwosComplement(T0, N2);
3777         MultiplyBottom(R1, T1, R0, T0, N2);
3778     }
3779 }
3780 
3781 
3782 #undef A0
3783 #undef A1
3784 #undef B0
3785 #undef B1
3786 
3787 #undef T0
3788 #undef T1
3789 #undef T2
3790 #undef T3
3791 
3792 #undef R0
3793 #undef R1
3794 #undef R2
3795 #undef R3
3796 
3797 
3798 // modulus must be odd
MontgomeryRepresentation(const Integer & m)3799 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)
3800     : ModularArithmetic(m),
3801       u((word)0, modulus.reg_.size()),
3802       workspace(5*modulus.reg_.size())
3803 {
3804     RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
3805                               modulus.reg_.get_buffer(), modulus.reg_.size());
3806 }
3807 
Multiply(const Integer & a,const Integer & b) const3808 const Integer& MontgomeryRepresentation::Multiply(const Integer &a,
3809                                                   const Integer &b) const
3810 {
3811     word *const T = workspace.begin();
3812     word *const R = result.reg_.begin();
3813     const unsigned int N = modulus.reg_.size();
3814 
3815     AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
3816                        b.reg_.get_buffer(), b.reg_.size());
3817     SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
3818     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3819                      u.reg_.get_buffer(), N);
3820     return result;
3821 }
3822 
Square(const Integer & a) const3823 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
3824 {
3825     word *const T = workspace.begin();
3826     word *const R = result.reg_.begin();
3827     const unsigned int N = modulus.reg_.size();
3828 
3829     TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
3830     SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
3831     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3832                      u.reg_.get_buffer(), N);
3833     return result;
3834 }
3835 
ConvertOut(const Integer & a) const3836 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
3837 {
3838     word *const T = workspace.begin();
3839     word *const R = result.reg_.begin();
3840     const unsigned int N = modulus.reg_.size();
3841 
3842     CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3843     SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3844     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3845                      u.reg_.get_buffer(), N);
3846     return result;
3847 }
3848 
MultiplicativeInverse(const Integer & a) const3849 const Integer& MontgomeryRepresentation::MultiplicativeInverse(
3850                                                         const Integer &a) const
3851 {
3852 //  return (EuclideanMultiplicativeInverse(a, modulus)<<
3853 //      (2*WORD_BITS*modulus.reg_.size()))%modulus;
3854     word *const T = workspace.begin();
3855     word *const R = result.reg_.begin();
3856     const unsigned int N = modulus.reg_.size();
3857 
3858     CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3859     SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3860     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3861                      u.reg_.get_buffer(), N);
3862     unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
3863 
3864 //  cout << "k=" << k << " N*32=" << 32*N << endl;
3865 
3866     if (k>N*WORD_BITS)
3867         DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
3868     else
3869         MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
3870 
3871     return result;
3872 }
3873 
3874 
3875 //  mod Root stuff
ModularRoot(const Integer & a,const Integer & dp,const Integer & dq,const Integer & p,const Integer & q,const Integer & u)3876 Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq,
3877                     const Integer &p, const Integer &q, const Integer &u)
3878 {
3879     Integer p2 = ModularExponentiation((a % p), dp, p);
3880     Integer q2 = ModularExponentiation((a % q), dq, q);
3881     return CRT(p2, p, q2, q, u);
3882 }
3883 
CRT(const Integer & xp,const Integer & p,const Integer & xq,const Integer & q,const Integer & u)3884 Integer CRT(const Integer &xp, const Integer &p, const Integer &xq,
3885             const Integer &q, const Integer &u)
3886 {
3887     // isn't operator overloading great?
3888     return p * (u * (xq-xp) % q) + xp;
3889 }
3890 
3891 } // namespace
3892 
3893