1 /*
2 Copyright (c) 2005, 2012, Oracle and/or its affiliates
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; see the file COPYING. If not, write to the
15 Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
16 MA 02110-1335 USA.
17 */
18
19
20
21 /* based on Wei Dai's integer.cpp from CryptoPP */
22
23 #include "runtime.hpp"
24 #include "integer.hpp"
25 #include "modarith.hpp"
26 #include "asn.hpp"
27
28
29
30 #ifdef __DECCXX
31 #include <c_asm.h> // for asm overflow assembly
32 #endif
33
34 #if defined(_M_X64) || defined(_M_IA64)
35 #include <intrin.h>
36 #pragma intrinsic(_umul128)
37 #endif
38
39
40 #ifdef __GNUC__
41 #include <signal.h>
42 #include <setjmp.h>
43 #endif
44
45
46 #ifdef SSE2_INTRINSICS_AVAILABLE
47 #ifdef __GNUC__
48 #include <xmmintrin.h>
49 #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
50 #include <malloc.h>
51 #else
52 #include <stdlib.h>
53 #endif
54 #else
55 #include <emmintrin.h>
56 #endif
57 #elif defined(_MSC_VER) && defined(_M_IX86)
58 /* #pragma message("You do not seem to have the Visual C++ Processor Pack ")
59 #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
60 */
61 #elif defined(__GNUC__) && defined(__i386__)
62 /* #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \
63 compiler option. Use of SSE2 intrinsics will be disabled.
64 */
65 #endif
66
67
68 namespace TaoCrypt {
69
70
71 #ifdef SSE2_INTRINSICS_AVAILABLE
72
73 template <class T>
allocate(size_type n,const void *)74 CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(
75 size_type n, const void *)
76 {
77 if (n > this->max_size())
78 return 0;
79 if (n == 0)
80 return 0;
81 if (n >= 4)
82 {
83 void* p;
84 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
85 p = _mm_malloc(sizeof(T)*n, 16);
86 #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
87 p = memalign(16, sizeof(T)*n);
88 #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
89 p = malloc(sizeof(T)*n);
90 #else
91 p = (byte *)malloc(sizeof(T)*n + 8);
92 // assume malloc alignment is at least 8
93 #endif
94
95 #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
96 m_pBlock = p;
97 if (!IsAlignedOn(p, 16))
98 {
99 p = (byte *)p + 8;
100 }
101 #endif
102
103 return (T*)p;
104 }
105 return NEW_TC T[n];
106 }
107
108
109 template <class T>
deallocate(void * p,size_type n)110 void AlignedAllocator<T>::deallocate(void* p, size_type n)
111 {
112 memset(p, 0, n*sizeof(T));
113 if (n >= 4)
114 {
115 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
116 _mm_free(p);
117 #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
118 free(m_pBlock);
119 m_pBlock = 0;
120 #else
121 free(p);
122 #endif
123 }
124 else
125 tcArrayDelete((T *)p);
126 }
127
128 #endif // SSE2
129
130
131 // ******** start of integer needs
132
133 // start 5.2.1 adds DWord and Word ********
134
135 // ********************************************************
136
137 class DWord {
138 public:
DWord()139 DWord() {}
140
141 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
DWord(word low)142 explicit DWord(word low)
143 {
144 whole_ = low;
145 }
146 #else
DWord(word low)147 explicit DWord(word low)
148 {
149 halfs_.low = low;
150 halfs_.high = 0;
151 }
152 #endif
153
DWord(word low,word high)154 DWord(word low, word high)
155 {
156 halfs_.low = low;
157 halfs_.high = high;
158 }
159
Multiply(word a,word b)160 static DWord Multiply(word a, word b)
161 {
162 DWord r;
163
164 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
165 r.whole_ = (dword)a * b;
166
167 #elif defined(_M_X64) || defined(_M_IA64)
168 r.halfs_.low = _umul128(a, b, &r.halfs_.high);
169
170 #elif defined(__alpha__)
171 r.halfs_.low = a*b;
172 #ifdef __GNUC__
173 __asm__("umulh %1,%2,%0" : "=r" (r.halfs_.high)
174 : "r" (a), "r" (b));
175 #elif defined(__DECCXX)
176 r.halfs_.high = asm("umulh %a0, %a1, %v0", a, b);
177 #else
178 #error unknown alpha compiler
179 #endif
180
181 #elif defined(__ia64__)
182 r.halfs_.low = a*b;
183 __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high)
184 : "f" (a), "f" (b));
185
186 #elif defined(_ARCH_PPC64)
187 r.halfs_.low = a*b;
188 __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high)
189 : "r" (a), "r" (b) : "cc");
190
191 #elif defined(__x86_64__)
192 __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) :
193 "a" (a), "rm" (b) : "cc");
194
195 #elif defined(__mips64)
196 unsigned __int128 t = (unsigned __int128) a * b;
197 r.halfs_.high = t >> 64;
198 r.halfs_.low = (word) t;
199
200 #elif defined(_M_IX86)
201 // for testing
202 word64 t = (word64)a * b;
203 r.halfs_.high = ((word32 *)(&t))[1];
204 r.halfs_.low = (word32)t;
205 #else
206 #error can not implement DWord
207 #endif
208
209 return r;
210 }
211
MultiplyAndAdd(word a,word b,word c)212 static DWord MultiplyAndAdd(word a, word b, word c)
213 {
214 DWord r = Multiply(a, b);
215 return r += c;
216 }
217
operator +=(word a)218 DWord & operator+=(word a)
219 {
220 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
221 whole_ = whole_ + a;
222 #else
223 halfs_.low += a;
224 halfs_.high += (halfs_.low < a);
225 #endif
226 return *this;
227 }
228
operator +(word a)229 DWord operator+(word a)
230 {
231 DWord r;
232 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
233 r.whole_ = whole_ + a;
234 #else
235 r.halfs_.low = halfs_.low + a;
236 r.halfs_.high = halfs_.high + (r.halfs_.low < a);
237 #endif
238 return r;
239 }
240
operator -(DWord a)241 DWord operator-(DWord a)
242 {
243 DWord r;
244 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
245 r.whole_ = whole_ - a.whole_;
246 #else
247 r.halfs_.low = halfs_.low - a.halfs_.low;
248 r.halfs_.high = halfs_.high - a.halfs_.high -
249 (r.halfs_.low > halfs_.low);
250 #endif
251 return r;
252 }
253
operator -(word a)254 DWord operator-(word a)
255 {
256 DWord r;
257 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
258 r.whole_ = whole_ - a;
259 #else
260 r.halfs_.low = halfs_.low - a;
261 r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
262 #endif
263 return r;
264 }
265
266 // returns quotient, which must fit in a word
267 word operator/(word divisor);
268
269 word operator%(word a);
270
operator !() const271 bool operator!() const
272 {
273 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
274 return !whole_;
275 #else
276 return !halfs_.high && !halfs_.low;
277 #endif
278 }
279
GetLowHalf() const280 word GetLowHalf() const {return halfs_.low;}
GetHighHalf() const281 word GetHighHalf() const {return halfs_.high;}
GetHighHalfAsBorrow() const282 word GetHighHalfAsBorrow() const {return 0-halfs_.high;}
283
284 private:
285 union
286 {
287 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
288 dword whole_;
289 #endif
290 struct
291 {
292 #ifdef LITTLE_ENDIAN_ORDER
293 word low;
294 word high;
295 #else
296 word high;
297 word low;
298 #endif
299 } halfs_;
300 };
301 };
302
303
304 class Word {
305 public:
Word()306 Word() {}
307
Word(word value)308 Word(word value)
309 {
310 whole_ = value;
311 }
312
Word(hword low,hword high)313 Word(hword low, hword high)
314 {
315 whole_ = low | (word(high) << (WORD_BITS/2));
316 }
317
Multiply(hword a,hword b)318 static Word Multiply(hword a, hword b)
319 {
320 Word r;
321 r.whole_ = (word)a * b;
322 return r;
323 }
324
operator -(Word a)325 Word operator-(Word a)
326 {
327 Word r;
328 r.whole_ = whole_ - a.whole_;
329 return r;
330 }
331
operator -(hword a)332 Word operator-(hword a)
333 {
334 Word r;
335 r.whole_ = whole_ - a;
336 return r;
337 }
338
339 // returns quotient, which must fit in a word
operator /(hword divisor)340 hword operator/(hword divisor)
341 {
342 return hword(whole_ / divisor);
343 }
344
operator !() const345 bool operator!() const
346 {
347 return !whole_;
348 }
349
GetWhole() const350 word GetWhole() const {return whole_;}
GetLowHalf() const351 hword GetLowHalf() const {return hword(whole_);}
GetHighHalf() const352 hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));}
GetHighHalfAsBorrow() const353 hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));}
354
355 private:
356 word whole_;
357 };
358
359
360 // dummy is VC60 compiler bug workaround
361 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
362 template <class S, class D>
363 S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0)
364 {
365 // estimate the quotient: do a 2 S by 1 S divide
366 S Q;
367 if (S(B1+1) == 0)
368 Q = A[2];
369 else
370 Q = D(A[1], A[2]) / S(B1+1);
371
372 // now subtract Q*B from A
373 D p = D::Multiply(B0, Q);
374 D u = (D) A[0] - p.GetLowHalf();
375 A[0] = u.GetLowHalf();
376 u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() -
377 D::Multiply(B1, Q);
378 A[1] = u.GetLowHalf();
379 A[2] += u.GetHighHalf();
380
381 // Q <= actual quotient, so fix it
382 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
383 {
384 u = (D) A[0] - B0;
385 A[0] = u.GetLowHalf();
386 u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
387 A[1] = u.GetLowHalf();
388 A[2] += u.GetHighHalf();
389 Q++;
390 }
391
392 return Q;
393 }
394
395
396 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
397 template <class S, class D>
DivideFourWordsByTwo(S * T,const D & Al,const D & Ah,const D & B)398 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
399 {
400 if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
401 return D(Ah.GetLowHalf(), Ah.GetHighHalf());
402 else
403 {
404 S Q[2];
405 T[0] = Al.GetLowHalf();
406 T[1] = Al.GetHighHalf();
407 T[2] = Ah.GetLowHalf();
408 T[3] = Ah.GetHighHalf();
409 Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
410 B.GetHighHalf());
411 Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
412 return D(Q[0], Q[1]);
413 }
414 }
415
416
417 // returns quotient, which must fit in a word
operator /(word a)418 inline word DWord::operator/(word a)
419 {
420 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
421 return word(whole_ / a);
422 #else
423 hword r[4];
424 return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
425 halfs_.high, a).GetWhole();
426 #endif
427 }
428
operator %(word a)429 inline word DWord::operator%(word a)
430 {
431 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
432 return word(whole_ % a);
433 #else
434 if (a < (word(1) << (WORD_BITS/2)))
435 {
436 hword h = hword(a);
437 word r = halfs_.high % h;
438 r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
439 return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
440 }
441 else
442 {
443 hword r[4];
444 DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
445 return Word(r[0], r[1]).GetWhole();
446 }
447 #endif
448 }
449
450
451
452 // end 5.2.1 DWord and Word adds
453
454
455
456
457
458 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
459
RoundupSize(unsigned int n)460 static inline unsigned int RoundupSize(unsigned int n)
461 {
462 if (n<=8)
463 return RoundupSizeTable[n];
464 else if (n<=16)
465 return 16;
466 else if (n<=32)
467 return 32;
468 else if (n<=64)
469 return 64;
470 else return 1U << BitPrecision(n-1);
471 }
472
473
Compare(const word * A,const word * B,unsigned int N)474 static int Compare(const word *A, const word *B, unsigned int N)
475 {
476 while (N--)
477 if (A[N] > B[N])
478 return 1;
479 else if (A[N] < B[N])
480 return -1;
481
482 return 0;
483 }
484
Increment(word * A,unsigned int N,word B=1)485 static word Increment(word *A, unsigned int N, word B=1)
486 {
487 word t = A[0];
488 A[0] = t+B;
489 if (A[0] >= t)
490 return 0;
491 for (unsigned i=1; i<N; i++)
492 if (++A[i])
493 return 0;
494 return 1;
495 }
496
Decrement(word * A,unsigned int N,word B=1)497 static word Decrement(word *A, unsigned int N, word B=1)
498 {
499 word t = A[0];
500 A[0] = t-B;
501 if (A[0] <= t)
502 return 0;
503 for (unsigned i=1; i<N; i++)
504 if (A[i]--)
505 return 0;
506 return 1;
507 }
508
TwosComplement(word * A,unsigned int N)509 static void TwosComplement(word *A, unsigned int N)
510 {
511 Decrement(A, N);
512 for (unsigned i=0; i<N; i++)
513 A[i] = ~A[i];
514 }
515
516
LinearMultiply(word * C,const word * A,word B,unsigned int N)517 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
518 {
519 word carry=0;
520 for(unsigned i=0; i<N; i++)
521 {
522 DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
523 C[i] = p.GetLowHalf();
524 carry = p.GetHighHalf();
525 }
526 return carry;
527 }
528
529
AtomicInverseModPower2(word A)530 static word AtomicInverseModPower2(word A)
531 {
532 word R=A%8;
533
534 for (unsigned i=3; i<WORD_BITS; i*=2)
535 R = R*(2-R*A);
536
537 return R;
538 }
539
540
541 // ********************************************************
542
543 class Portable
544 {
545 public:
546 static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
547 unsigned int N);
548 static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word*B,
549 unsigned int N);
550 static void TAOCRYPT_CDECL Multiply2(word *C, const word *A, const word *B);
551 static word TAOCRYPT_CDECL Multiply2Add(word *C,
552 const word *A, const word *B);
553 static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
554 static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
MultiplyRecursionLimit()555 static unsigned int TAOCRYPT_CDECL MultiplyRecursionLimit() {return 8;}
556
557 static void TAOCRYPT_CDECL Multiply2Bottom(word *C, const word *A,
558 const word *B);
559 static void TAOCRYPT_CDECL Multiply4Bottom(word *C, const word *A,
560 const word *B);
561 static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
562 const word *B);
MultiplyBottomRecursionLimit()563 static unsigned int TAOCRYPT_CDECL MultiplyBottomRecursionLimit(){return 8;}
564
565 static void TAOCRYPT_CDECL Square2(word *R, const word *A);
566 static void TAOCRYPT_CDECL Square4(word *R, const word *A);
SquareRecursionLimit()567 static unsigned int TAOCRYPT_CDECL SquareRecursionLimit() {return 4;}
568 };
569
Add(word * C,const word * A,const word * B,unsigned int N)570 word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
571 {
572 DWord u(0, 0);
573 for (unsigned int i = 0; i < N; i+=2)
574 {
575 u = DWord(A[i]) + B[i] + u.GetHighHalf();
576 C[i] = u.GetLowHalf();
577 u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
578 C[i+1] = u.GetLowHalf();
579 }
580 return u.GetHighHalf();
581 }
582
Subtract(word * C,const word * A,const word * B,unsigned int N)583 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
584 {
585 DWord u(0, 0);
586 for (unsigned int i = 0; i < N; i+=2)
587 {
588 u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
589 C[i] = u.GetLowHalf();
590 u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
591 C[i+1] = u.GetLowHalf();
592 }
593 return 0-u.GetHighHalf();
594 }
595
Multiply2(word * C,const word * A,const word * B)596 void Portable::Multiply2(word *C, const word *A, const word *B)
597 {
598 /*
599 word s;
600 dword d;
601
602 if (A1 >= A0)
603 if (B0 >= B1)
604 {
605 s = 0;
606 d = (dword)(A1-A0)*(B0-B1);
607 }
608 else
609 {
610 s = (A1-A0);
611 d = (dword)s*(word)(B0-B1);
612 }
613 else
614 if (B0 > B1)
615 {
616 s = (B0-B1);
617 d = (word)(A1-A0)*(dword)s;
618 }
619 else
620 {
621 s = 0;
622 d = (dword)(A0-A1)*(B1-B0);
623 }
624 */
625 // this segment is the branchless equivalent of above
626 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
627 unsigned int ai = A[1] < A[0];
628 unsigned int bi = B[0] < B[1];
629 unsigned int di = ai & bi;
630 DWord d = DWord::Multiply(D[di], D[di+2]);
631 D[1] = D[3] = 0;
632 unsigned int si = ai + !bi;
633 word s = D[si];
634
635 DWord A0B0 = DWord::Multiply(A[0], B[0]);
636 C[0] = A0B0.GetLowHalf();
637
638 DWord A1B1 = DWord::Multiply(A[1], B[1]);
639 DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
640 + A1B1.GetLowHalf();
641 C[1] = t.GetLowHalf();
642
643 t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
644 + A1B1.GetHighHalf() - s;
645 C[2] = t.GetLowHalf();
646 C[3] = t.GetHighHalf();
647 }
648
Multiply2Bottom(word * C,const word * A,const word * B)649 void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
650 {
651 DWord t = DWord::Multiply(A[0], B[0]);
652 C[0] = t.GetLowHalf();
653 C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
654 }
655
Multiply2Add(word * C,const word * A,const word * B)656 word Portable::Multiply2Add(word *C, const word *A, const word *B)
657 {
658 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
659 unsigned int ai = A[1] < A[0];
660 unsigned int bi = B[0] < B[1];
661 unsigned int di = ai & bi;
662 DWord d = DWord::Multiply(D[di], D[di+2]);
663 D[1] = D[3] = 0;
664 unsigned int si = ai + !bi;
665 word s = D[si];
666
667 DWord A0B0 = DWord::Multiply(A[0], B[0]);
668 DWord t = A0B0 + C[0];
669 C[0] = t.GetLowHalf();
670
671 DWord A1B1 = DWord::Multiply(A[1], B[1]);
672 t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
673 A1B1.GetLowHalf() + C[1];
674 C[1] = t.GetLowHalf();
675
676 t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
677 d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
678 C[2] = t.GetLowHalf();
679
680 t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
681 C[3] = t.GetLowHalf();
682 return t.GetHighHalf();
683 }
684
685
686 #define MulAcc(x, y) \
687 p = DWord::MultiplyAndAdd(A[x], B[y], c); \
688 c = p.GetLowHalf(); \
689 p = (DWord) d + p.GetHighHalf(); \
690 d = p.GetLowHalf(); \
691 e += p.GetHighHalf();
692
693 #define SaveMulAcc(s, x, y) \
694 R[s] = c; \
695 p = DWord::MultiplyAndAdd(A[x], B[y], d); \
696 c = p.GetLowHalf(); \
697 p = (DWord) e + p.GetHighHalf(); \
698 d = p.GetLowHalf(); \
699 e = p.GetHighHalf();
700
701 #define SquAcc(x, y) \
702 q = DWord::Multiply(A[x], A[y]); \
703 p = q + c; \
704 c = p.GetLowHalf(); \
705 p = (DWord) d + p.GetHighHalf(); \
706 d = p.GetLowHalf(); \
707 e += p.GetHighHalf(); \
708 p = q + c; \
709 c = p.GetLowHalf(); \
710 p = (DWord) d + p.GetHighHalf(); \
711 d = p.GetLowHalf(); \
712 e += p.GetHighHalf();
713
714 #define SaveSquAcc(s, x, y) \
715 R[s] = c; \
716 q = DWord::Multiply(A[x], A[y]); \
717 p = q + d; \
718 c = p.GetLowHalf(); \
719 p = (DWord) e + p.GetHighHalf(); \
720 d = p.GetLowHalf(); \
721 e = p.GetHighHalf(); \
722 p = q + c; \
723 c = p.GetLowHalf(); \
724 p = (DWord) d + p.GetHighHalf(); \
725 d = p.GetLowHalf(); \
726 e += p.GetHighHalf();
727
728
Multiply4(word * R,const word * A,const word * B)729 void Portable::Multiply4(word *R, const word *A, const word *B)
730 {
731 DWord p;
732 word c, d, e;
733
734 p = DWord::Multiply(A[0], B[0]);
735 R[0] = p.GetLowHalf();
736 c = p.GetHighHalf();
737 d = e = 0;
738
739 MulAcc(0, 1);
740 MulAcc(1, 0);
741
742 SaveMulAcc(1, 2, 0);
743 MulAcc(1, 1);
744 MulAcc(0, 2);
745
746 SaveMulAcc(2, 0, 3);
747 MulAcc(1, 2);
748 MulAcc(2, 1);
749 MulAcc(3, 0);
750
751 SaveMulAcc(3, 3, 1);
752 MulAcc(2, 2);
753 MulAcc(1, 3);
754
755 SaveMulAcc(4, 2, 3);
756 MulAcc(3, 2);
757
758 R[5] = c;
759 p = DWord::MultiplyAndAdd(A[3], B[3], d);
760 R[6] = p.GetLowHalf();
761 R[7] = e + p.GetHighHalf();
762 }
763
Square2(word * R,const word * A)764 void Portable::Square2(word *R, const word *A)
765 {
766 DWord p, q;
767 word c, d, e;
768
769 p = DWord::Multiply(A[0], A[0]);
770 R[0] = p.GetLowHalf();
771 c = p.GetHighHalf();
772 d = e = 0;
773
774 SquAcc(0, 1);
775
776 R[1] = c;
777 p = DWord::MultiplyAndAdd(A[1], A[1], d);
778 R[2] = p.GetLowHalf();
779 R[3] = e + p.GetHighHalf();
780 }
781
Square4(word * R,const word * A)782 void Portable::Square4(word *R, const word *A)
783 {
784 #ifdef _MSC_VER
785 // VC60 workaround: MSVC 6.0 has an optimization bug that makes
786 // (dword)A*B where either A or B has been cast to a dword before
787 // very expensive. Revisit this function when this
788 // bug is fixed.
789 Multiply4(R, A, A);
790 #else
791 const word *B = A;
792 DWord p, q;
793 word c, d, e;
794
795 p = DWord::Multiply(A[0], A[0]);
796 R[0] = p.GetLowHalf();
797 c = p.GetHighHalf();
798 d = e = 0;
799
800 SquAcc(0, 1);
801
802 SaveSquAcc(1, 2, 0);
803 MulAcc(1, 1);
804
805 SaveSquAcc(2, 0, 3);
806 SquAcc(1, 2);
807
808 SaveSquAcc(3, 3, 1);
809 MulAcc(2, 2);
810
811 SaveSquAcc(4, 2, 3);
812
813 R[5] = c;
814 p = DWord::MultiplyAndAdd(A[3], A[3], d);
815 R[6] = p.GetLowHalf();
816 R[7] = e + p.GetHighHalf();
817 #endif
818 }
819
Multiply8(word * R,const word * A,const word * B)820 void Portable::Multiply8(word *R, const word *A, const word *B)
821 {
822 DWord p;
823 word c, d, e;
824
825 p = DWord::Multiply(A[0], B[0]);
826 R[0] = p.GetLowHalf();
827 c = p.GetHighHalf();
828 d = e = 0;
829
830 MulAcc(0, 1);
831 MulAcc(1, 0);
832
833 SaveMulAcc(1, 2, 0);
834 MulAcc(1, 1);
835 MulAcc(0, 2);
836
837 SaveMulAcc(2, 0, 3);
838 MulAcc(1, 2);
839 MulAcc(2, 1);
840 MulAcc(3, 0);
841
842 SaveMulAcc(3, 0, 4);
843 MulAcc(1, 3);
844 MulAcc(2, 2);
845 MulAcc(3, 1);
846 MulAcc(4, 0);
847
848 SaveMulAcc(4, 0, 5);
849 MulAcc(1, 4);
850 MulAcc(2, 3);
851 MulAcc(3, 2);
852 MulAcc(4, 1);
853 MulAcc(5, 0);
854
855 SaveMulAcc(5, 0, 6);
856 MulAcc(1, 5);
857 MulAcc(2, 4);
858 MulAcc(3, 3);
859 MulAcc(4, 2);
860 MulAcc(5, 1);
861 MulAcc(6, 0);
862
863 SaveMulAcc(6, 0, 7);
864 MulAcc(1, 6);
865 MulAcc(2, 5);
866 MulAcc(3, 4);
867 MulAcc(4, 3);
868 MulAcc(5, 2);
869 MulAcc(6, 1);
870 MulAcc(7, 0);
871
872 SaveMulAcc(7, 1, 7);
873 MulAcc(2, 6);
874 MulAcc(3, 5);
875 MulAcc(4, 4);
876 MulAcc(5, 3);
877 MulAcc(6, 2);
878 MulAcc(7, 1);
879
880 SaveMulAcc(8, 2, 7);
881 MulAcc(3, 6);
882 MulAcc(4, 5);
883 MulAcc(5, 4);
884 MulAcc(6, 3);
885 MulAcc(7, 2);
886
887 SaveMulAcc(9, 3, 7);
888 MulAcc(4, 6);
889 MulAcc(5, 5);
890 MulAcc(6, 4);
891 MulAcc(7, 3);
892
893 SaveMulAcc(10, 4, 7);
894 MulAcc(5, 6);
895 MulAcc(6, 5);
896 MulAcc(7, 4);
897
898 SaveMulAcc(11, 5, 7);
899 MulAcc(6, 6);
900 MulAcc(7, 5);
901
902 SaveMulAcc(12, 6, 7);
903 MulAcc(7, 6);
904
905 R[13] = c;
906 p = DWord::MultiplyAndAdd(A[7], B[7], d);
907 R[14] = p.GetLowHalf();
908 R[15] = e + p.GetHighHalf();
909 }
910
Multiply4Bottom(word * R,const word * A,const word * B)911 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
912 {
913 DWord p;
914 word c, d, e;
915
916 p = DWord::Multiply(A[0], B[0]);
917 R[0] = p.GetLowHalf();
918 c = p.GetHighHalf();
919 d = e = 0;
920
921 MulAcc(0, 1);
922 MulAcc(1, 0);
923
924 SaveMulAcc(1, 2, 0);
925 MulAcc(1, 1);
926 MulAcc(0, 2);
927
928 R[2] = c;
929 R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
930 }
931
Multiply8Bottom(word * R,const word * A,const word * B)932 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
933 {
934 DWord p;
935 word c, d, e;
936
937 p = DWord::Multiply(A[0], B[0]);
938 R[0] = p.GetLowHalf();
939 c = p.GetHighHalf();
940 d = e = 0;
941
942 MulAcc(0, 1);
943 MulAcc(1, 0);
944
945 SaveMulAcc(1, 2, 0);
946 MulAcc(1, 1);
947 MulAcc(0, 2);
948
949 SaveMulAcc(2, 0, 3);
950 MulAcc(1, 2);
951 MulAcc(2, 1);
952 MulAcc(3, 0);
953
954 SaveMulAcc(3, 0, 4);
955 MulAcc(1, 3);
956 MulAcc(2, 2);
957 MulAcc(3, 1);
958 MulAcc(4, 0);
959
960 SaveMulAcc(4, 0, 5);
961 MulAcc(1, 4);
962 MulAcc(2, 3);
963 MulAcc(3, 2);
964 MulAcc(4, 1);
965 MulAcc(5, 0);
966
967 SaveMulAcc(5, 0, 6);
968 MulAcc(1, 5);
969 MulAcc(2, 4);
970 MulAcc(3, 3);
971 MulAcc(4, 2);
972 MulAcc(5, 1);
973 MulAcc(6, 0);
974
975 R[6] = c;
976 R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
977 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
978 }
979
980
981 #undef MulAcc
982 #undef SaveMulAcc
983 #undef SquAcc
984 #undef SaveSquAcc
985
986 // optimized
987
988 #ifdef TAOCRYPT_X86ASM_AVAILABLE
989
990 // ************** x86 feature detection ***************
991
992
993 #ifdef SSE2_INTRINSICS_AVAILABLE
994
995 #ifndef _MSC_VER
996 static jmp_buf s_env;
SigIllHandler(int)997 static void SigIllHandler(int)
998 {
999 longjmp(s_env, 1);
1000 }
1001 #endif
1002
HasSSE2()1003 static bool HasSSE2()
1004 {
1005 if (!IsPentium())
1006 return false;
1007
1008 word32 cpuid[4];
1009 CpuId(1, cpuid);
1010 if ((cpuid[3] & (1 << 26)) == 0)
1011 return false;
1012
1013 #ifdef _MSC_VER
1014 __try
1015 {
1016 __asm xorpd xmm0, xmm0 // executing SSE2 instruction
1017 }
1018 __except (1)
1019 {
1020 return false;
1021 }
1022 return true;
1023 #else
1024 typedef void (*SigHandler)(int);
1025
1026 SigHandler oldHandler = signal(SIGILL, SigIllHandler);
1027 if (oldHandler == SIG_ERR)
1028 return false;
1029
1030 bool result = true;
1031 if (setjmp(s_env))
1032 result = false;
1033 else
1034 __asm __volatile ("xorpd %xmm0, %xmm0");
1035
1036 signal(SIGILL, oldHandler);
1037 return result;
1038 #endif
1039 }
1040 #endif // SSE2_INTRINSICS_AVAILABLE
1041
1042
IsP4()1043 static bool IsP4()
1044 {
1045 if (!IsPentium())
1046 return false;
1047
1048 word32 cpuid[4];
1049
1050 CpuId(1, cpuid);
1051 return ((cpuid[0] >> 8) & 0xf) == 0xf;
1052 }
1053
1054 // ************** Pentium/P4 optimizations ***************
1055
1056 class PentiumOptimized : public Portable
1057 {
1058 public:
1059 static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1060 unsigned int N);
1061 static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1062 unsigned int N);
1063 static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1064 const word *B);
1065 static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1066 const word *B);
1067 static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1068 const word *B);
1069 };
1070
1071 class P4Optimized
1072 {
1073 public:
1074 static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
1075 unsigned int N);
1076 static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
1077 unsigned int N);
1078 #ifdef SSE2_INTRINSICS_AVAILABLE
1079 static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
1080 const word *B);
1081 static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
1082 const word *B);
1083 static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
1084 const word *B);
1085 #endif
1086 };
1087
1088 typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
1089 unsigned int N);
1090 typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
1091
1092 static PAddSub s_pAdd, s_pSub;
1093 #ifdef SSE2_INTRINSICS_AVAILABLE
1094 static PMul s_pMul4, s_pMul8, s_pMul8B;
1095 #endif
1096
SetPentiumFunctionPointers()1097 static void SetPentiumFunctionPointers()
1098 {
1099 if (!IsPentium())
1100 {
1101 s_pAdd = &Portable::Add;
1102 s_pSub = &Portable::Subtract;
1103 }
1104 else if (IsP4())
1105 {
1106 s_pAdd = &P4Optimized::Add;
1107 s_pSub = &P4Optimized::Subtract;
1108 }
1109 else
1110 {
1111 s_pAdd = &PentiumOptimized::Add;
1112 s_pSub = &PentiumOptimized::Subtract;
1113 }
1114
1115 #ifdef SSE2_INTRINSICS_AVAILABLE
1116 if (!IsPentium())
1117 {
1118 s_pMul4 = &Portable::Multiply4;
1119 s_pMul8 = &Portable::Multiply8;
1120 s_pMul8B = &Portable::Multiply8Bottom;
1121 }
1122 else if (HasSSE2())
1123 {
1124 s_pMul4 = &P4Optimized::Multiply4;
1125 s_pMul8 = &P4Optimized::Multiply8;
1126 s_pMul8B = &P4Optimized::Multiply8Bottom;
1127 }
1128 else
1129 {
1130 s_pMul4 = &PentiumOptimized::Multiply4;
1131 s_pMul8 = &PentiumOptimized::Multiply8;
1132 s_pMul8B = &PentiumOptimized::Multiply8Bottom;
1133 }
1134 #endif
1135 }
1136
1137 static const char s_RunAtStartupSetPentiumFunctionPointers =
1138 (SetPentiumFunctionPointers(), 0);
1139
1140
1141 class LowLevel : public PentiumOptimized
1142 {
1143 public:
Add(word * C,const word * A,const word * B,unsigned int N)1144 inline static word Add(word *C, const word *A, const word *B,
1145 unsigned int N)
1146 {return s_pAdd(C, A, B, N);}
Subtract(word * C,const word * A,const word * B,unsigned int N)1147 inline static word Subtract(word *C, const word *A, const word *B,
1148 unsigned int N)
1149 {return s_pSub(C, A, B, N);}
Square4(word * R,const word * A)1150 inline static void Square4(word *R, const word *A)
1151 {Multiply4(R, A, A);}
1152 #ifdef SSE2_INTRINSICS_AVAILABLE
Multiply4(word * C,const word * A,const word * B)1153 inline static void Multiply4(word *C, const word *A, const word *B)
1154 {s_pMul4(C, A, B);}
Multiply8(word * C,const word * A,const word * B)1155 inline static void Multiply8(word *C, const word *A, const word *B)
1156 {s_pMul8(C, A, B);}
Multiply8Bottom(word * C,const word * A,const word * B)1157 inline static void Multiply8Bottom(word *C, const word *A, const word *B)
1158 {s_pMul8B(C, A, B);}
1159 #endif
1160 };
1161
1162 // use some tricks to share assembly code between MSVC and GCC
1163 #ifdef _MSC_VER
1164 #define TAOCRYPT_NAKED __declspec(naked)
1165 #define AS1(x) __asm x
1166 #define AS2(x, y) __asm x, y
1167 #define AddPrologue \
1168 __asm push ebp \
1169 __asm push ebx \
1170 __asm push esi \
1171 __asm push edi \
1172 __asm mov ecx, [esp+20] \
1173 __asm mov edx, [esp+24] \
1174 __asm mov ebx, [esp+28] \
1175 __asm mov esi, [esp+32]
1176 #define AddEpilogue \
1177 __asm pop edi \
1178 __asm pop esi \
1179 __asm pop ebx \
1180 __asm pop ebp \
1181 __asm ret
1182 #define MulPrologue \
1183 __asm push ebp \
1184 __asm push ebx \
1185 __asm push esi \
1186 __asm push edi \
1187 __asm mov ecx, [esp+28] \
1188 __asm mov esi, [esp+24] \
1189 __asm push [esp+20]
1190 #define MulEpilogue \
1191 __asm add esp, 4 \
1192 __asm pop edi \
1193 __asm pop esi \
1194 __asm pop ebx \
1195 __asm pop ebp \
1196 __asm ret
1197 #else
1198 #define TAOCRYPT_NAKED
1199 #define AS1(x) #x ";"
1200 #define AS2(x, y) #x ", " #y ";"
1201 #define AddPrologue \
1202 __asm__ __volatile__ \
1203 ( \
1204 "push %%ebx;" /* save this manually, in case of -fPIC */ \
1205 "mov %2, %%ebx;" \
1206 ".intel_syntax noprefix;" \
1207 "push ebp;"
1208 #define AddEpilogue \
1209 "pop ebp;" \
1210 ".att_syntax prefix;" \
1211 "pop %%ebx;" \
1212 : \
1213 : "c" (C), "d" (A), "m" (B), "S" (N) \
1214 : "%edi", "memory", "cc" \
1215 );
1216 #define MulPrologue \
1217 __asm__ __volatile__ \
1218 ( \
1219 "push %%ebx;" /* save this manually, in case of -fPIC */ \
1220 "push %%ebp;" \
1221 "push %0;" \
1222 ".intel_syntax noprefix;"
1223 #define MulEpilogue \
1224 "add esp, 4;" \
1225 "pop ebp;" \
1226 "pop ebx;" \
1227 ".att_syntax prefix;" \
1228 : \
1229 : "rm" (Z), "S" (X), "c" (Y) \
1230 : "%eax", "%edx", "%edi", "memory", "cc" \
1231 );
1232 #endif
1233
Add(word * C,const word * A,const word * B,unsigned int N)1234 TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
1235 const word *B, unsigned int N)
1236 {
1237 AddPrologue
1238
1239 // now: ebx = B, ecx = C, edx = A, esi = N
1240 AS2( sub ecx, edx) // hold the distance between C & A so we
1241 // can add this to A to get C
1242 AS2( xor eax, eax) // clear eax
1243
1244 AS2( sub eax, esi) // eax is a negative index from end of B
1245 AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1246
1247 AS2( sar eax, 1) // unit of eax is now dwords; this also
1248 // clears the carry flag
1249 AS1( jz loopendAdd) // if no dwords then nothing to do
1250
1251 AS1(loopstartAdd:)
1252 AS2( mov esi,[edx]) // load lower word of A
1253 AS2( mov ebp,[edx+4]) // load higher word of A
1254
1255 AS2( mov edi,[ebx+8*eax]) // load lower word of B
1256 AS2( lea edx,[edx+8]) // advance A and C
1257
1258 AS2( adc esi,edi) // add lower words
1259 AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1260
1261 AS2( adc ebp,edi) // add higher words
1262 AS1( inc eax) // advance B
1263
1264 AS2( mov [edx+ecx-8],esi) // store lower word result
1265 AS2( mov [edx+ecx-4],ebp) // store higher word result
1266
1267 AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
1268
1269 AS1(loopendAdd:)
1270 AS2( adc eax, 0) // store carry into eax (return result register)
1271
1272 AddEpilogue
1273 }
1274
Subtract(word * C,const word * A,const word * B,unsigned int N)1275 TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
1276 const word *B, unsigned int N)
1277 {
1278 AddPrologue
1279
1280 // now: ebx = B, ecx = C, edx = A, esi = N
1281 AS2( sub ecx, edx) // hold the distance between C & A so we
1282 // can add this to A to get C
1283 AS2( xor eax, eax) // clear eax
1284
1285 AS2( sub eax, esi) // eax is a negative index from end of B
1286 AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
1287
1288 AS2( sar eax, 1) // unit of eax is now dwords; this also
1289 // clears the carry flag
1290 AS1( jz loopendSub) // if no dwords then nothing to do
1291
1292 AS1(loopstartSub:)
1293 AS2( mov esi,[edx]) // load lower word of A
1294 AS2( mov ebp,[edx+4]) // load higher word of A
1295
1296 AS2( mov edi,[ebx+8*eax]) // load lower word of B
1297 AS2( lea edx,[edx+8]) // advance A and C
1298
1299 AS2( sbb esi,edi) // subtract lower words
1300 AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
1301
1302 AS2( sbb ebp,edi) // subtract higher words
1303 AS1( inc eax) // advance B
1304
1305 AS2( mov [edx+ecx-8],esi) // store lower word result
1306 AS2( mov [edx+ecx-4],ebp) // store higher word result
1307
1308 AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
1309
1310 AS1(loopendSub:)
1311 AS2( adc eax, 0) // store carry into eax (return result register)
1312
1313 AddEpilogue
1314 }
1315
1316 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
1317
Add(word * C,const word * A,const word * B,unsigned int N)1318 TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
1319 unsigned int N)
1320 {
1321 AddPrologue
1322
1323 // now: ebx = B, ecx = C, edx = A, esi = N
1324 AS2( xor eax, eax)
1325 AS1( neg esi)
1326 AS1( jz loopendAddP4) // if no dwords then nothing to do
1327
1328 AS2( mov edi, [edx])
1329 AS2( mov ebp, [ebx])
1330 AS1( jmp carry1AddP4)
1331
1332 AS1(loopstartAddP4:)
1333 AS2( mov edi, [edx+8])
1334 AS2( add ecx, 8)
1335 AS2( add edx, 8)
1336 AS2( mov ebp, [ebx])
1337 AS2( add edi, eax)
1338 AS1( jc carry1AddP4)
1339 AS2( xor eax, eax)
1340
1341 AS1(carry1AddP4:)
1342 AS2( add edi, ebp)
1343 AS2( mov ebp, 1)
1344 AS2( mov [ecx], edi)
1345 AS2( mov edi, [edx+4])
1346 AS2( cmovc eax, ebp)
1347 AS2( mov ebp, [ebx+4])
1348 AS2( add ebx, 8)
1349 AS2( add edi, eax)
1350 AS1( jc carry2AddP4)
1351 AS2( xor eax, eax)
1352
1353 AS1(carry2AddP4:)
1354 AS2( add edi, ebp)
1355 AS2( mov ebp, 1)
1356 AS2( cmovc eax, ebp)
1357 AS2( mov [ecx+4], edi)
1358 AS2( add esi, 2)
1359 AS1( jnz loopstartAddP4)
1360
1361 AS1(loopendAddP4:)
1362
1363 AddEpilogue
1364 }
1365
Subtract(word * C,const word * A,const word * B,unsigned int N)1366 TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
1367 const word *B, unsigned int N)
1368 {
1369 AddPrologue
1370
1371 // now: ebx = B, ecx = C, edx = A, esi = N
1372 AS2( xor eax, eax)
1373 AS1( neg esi)
1374 AS1( jz loopendSubP4) // if no dwords then nothing to do
1375
1376 AS2( mov edi, [edx])
1377 AS2( mov ebp, [ebx])
1378 AS1( jmp carry1SubP4)
1379
1380 AS1(loopstartSubP4:)
1381 AS2( mov edi, [edx+8])
1382 AS2( add edx, 8)
1383 AS2( add ecx, 8)
1384 AS2( mov ebp, [ebx])
1385 AS2( sub edi, eax)
1386 AS1( jc carry1SubP4)
1387 AS2( xor eax, eax)
1388
1389 AS1(carry1SubP4:)
1390 AS2( sub edi, ebp)
1391 AS2( mov ebp, 1)
1392 AS2( mov [ecx], edi)
1393 AS2( mov edi, [edx+4])
1394 AS2( cmovc eax, ebp)
1395 AS2( mov ebp, [ebx+4])
1396 AS2( add ebx, 8)
1397 AS2( sub edi, eax)
1398 AS1( jc carry2SubP4)
1399 AS2( xor eax, eax)
1400
1401 AS1(carry2SubP4:)
1402 AS2( sub edi, ebp)
1403 AS2( mov ebp, 1)
1404 AS2( cmovc eax, ebp)
1405 AS2( mov [ecx+4], edi)
1406 AS2( add esi, 2)
1407 AS1( jnz loopstartSubP4)
1408
1409 AS1(loopendSubP4:)
1410
1411 AddEpilogue
1412 }
1413
1414 // multiply assembly code originally contributed by Leonard Janke
1415
1416 #define MulStartup \
1417 AS2(xor ebp, ebp) \
1418 AS2(xor edi, edi) \
1419 AS2(xor ebx, ebx)
1420
1421 #define MulShiftCarry \
1422 AS2(mov ebp, edx) \
1423 AS2(mov edi, ebx) \
1424 AS2(xor ebx, ebx)
1425
1426 #define MulAccumulateBottom(i,j) \
1427 AS2(mov eax, [ecx+4*j]) \
1428 AS2(imul eax, dword ptr [esi+4*i]) \
1429 AS2(add ebp, eax)
1430
1431 #define MulAccumulate(i,j) \
1432 AS2(mov eax, [ecx+4*j]) \
1433 AS1(mul dword ptr [esi+4*i]) \
1434 AS2(add ebp, eax) \
1435 AS2(adc edi, edx) \
1436 AS2(adc bl, bh)
1437
1438 #define MulStoreDigit(i) \
1439 AS2(mov edx, edi) \
1440 AS2(mov edi, [esp]) \
1441 AS2(mov [edi+4*i], ebp)
1442
1443 #define MulLastDiagonal(digits) \
1444 AS2(mov eax, [ecx+4*(digits-1)]) \
1445 AS1(mul dword ptr [esi+4*(digits-1)]) \
1446 AS2(add ebp, eax) \
1447 AS2(adc edx, edi) \
1448 AS2(mov edi, [esp]) \
1449 AS2(mov [edi+4*(2*digits-2)], ebp) \
1450 AS2(mov [edi+4*(2*digits-1)], edx)
1451
Multiply4(word * Z,const word * X,const word * Y)1452 TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
1453 const word* Y)
1454 {
1455 MulPrologue
1456 // now: [esp] = Z, esi = X, ecx = Y
1457 MulStartup
1458 MulAccumulate(0,0)
1459 MulStoreDigit(0)
1460 MulShiftCarry
1461
1462 MulAccumulate(1,0)
1463 MulAccumulate(0,1)
1464 MulStoreDigit(1)
1465 MulShiftCarry
1466
1467 MulAccumulate(2,0)
1468 MulAccumulate(1,1)
1469 MulAccumulate(0,2)
1470 MulStoreDigit(2)
1471 MulShiftCarry
1472
1473 MulAccumulate(3,0)
1474 MulAccumulate(2,1)
1475 MulAccumulate(1,2)
1476 MulAccumulate(0,3)
1477 MulStoreDigit(3)
1478 MulShiftCarry
1479
1480 MulAccumulate(3,1)
1481 MulAccumulate(2,2)
1482 MulAccumulate(1,3)
1483 MulStoreDigit(4)
1484 MulShiftCarry
1485
1486 MulAccumulate(3,2)
1487 MulAccumulate(2,3)
1488 MulStoreDigit(5)
1489 MulShiftCarry
1490
1491 MulLastDiagonal(4)
1492 MulEpilogue
1493 }
1494
Multiply8(word * Z,const word * X,const word * Y)1495 TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
1496 const word* Y)
1497 {
1498 MulPrologue
1499 // now: [esp] = Z, esi = X, ecx = Y
1500 MulStartup
1501 MulAccumulate(0,0)
1502 MulStoreDigit(0)
1503 MulShiftCarry
1504
1505 MulAccumulate(1,0)
1506 MulAccumulate(0,1)
1507 MulStoreDigit(1)
1508 MulShiftCarry
1509
1510 MulAccumulate(2,0)
1511 MulAccumulate(1,1)
1512 MulAccumulate(0,2)
1513 MulStoreDigit(2)
1514 MulShiftCarry
1515
1516 MulAccumulate(3,0)
1517 MulAccumulate(2,1)
1518 MulAccumulate(1,2)
1519 MulAccumulate(0,3)
1520 MulStoreDigit(3)
1521 MulShiftCarry
1522
1523 MulAccumulate(4,0)
1524 MulAccumulate(3,1)
1525 MulAccumulate(2,2)
1526 MulAccumulate(1,3)
1527 MulAccumulate(0,4)
1528 MulStoreDigit(4)
1529 MulShiftCarry
1530
1531 MulAccumulate(5,0)
1532 MulAccumulate(4,1)
1533 MulAccumulate(3,2)
1534 MulAccumulate(2,3)
1535 MulAccumulate(1,4)
1536 MulAccumulate(0,5)
1537 MulStoreDigit(5)
1538 MulShiftCarry
1539
1540 MulAccumulate(6,0)
1541 MulAccumulate(5,1)
1542 MulAccumulate(4,2)
1543 MulAccumulate(3,3)
1544 MulAccumulate(2,4)
1545 MulAccumulate(1,5)
1546 MulAccumulate(0,6)
1547 MulStoreDigit(6)
1548 MulShiftCarry
1549
1550 MulAccumulate(7,0)
1551 MulAccumulate(6,1)
1552 MulAccumulate(5,2)
1553 MulAccumulate(4,3)
1554 MulAccumulate(3,4)
1555 MulAccumulate(2,5)
1556 MulAccumulate(1,6)
1557 MulAccumulate(0,7)
1558 MulStoreDigit(7)
1559 MulShiftCarry
1560
1561 MulAccumulate(7,1)
1562 MulAccumulate(6,2)
1563 MulAccumulate(5,3)
1564 MulAccumulate(4,4)
1565 MulAccumulate(3,5)
1566 MulAccumulate(2,6)
1567 MulAccumulate(1,7)
1568 MulStoreDigit(8)
1569 MulShiftCarry
1570
1571 MulAccumulate(7,2)
1572 MulAccumulate(6,3)
1573 MulAccumulate(5,4)
1574 MulAccumulate(4,5)
1575 MulAccumulate(3,6)
1576 MulAccumulate(2,7)
1577 MulStoreDigit(9)
1578 MulShiftCarry
1579
1580 MulAccumulate(7,3)
1581 MulAccumulate(6,4)
1582 MulAccumulate(5,5)
1583 MulAccumulate(4,6)
1584 MulAccumulate(3,7)
1585 MulStoreDigit(10)
1586 MulShiftCarry
1587
1588 MulAccumulate(7,4)
1589 MulAccumulate(6,5)
1590 MulAccumulate(5,6)
1591 MulAccumulate(4,7)
1592 MulStoreDigit(11)
1593 MulShiftCarry
1594
1595 MulAccumulate(7,5)
1596 MulAccumulate(6,6)
1597 MulAccumulate(5,7)
1598 MulStoreDigit(12)
1599 MulShiftCarry
1600
1601 MulAccumulate(7,6)
1602 MulAccumulate(6,7)
1603 MulStoreDigit(13)
1604 MulShiftCarry
1605
1606 MulLastDiagonal(8)
1607 MulEpilogue
1608 }
1609
Multiply8Bottom(word * Z,const word * X,const word * Y)1610 TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X,
1611 const word* Y)
1612 {
1613 MulPrologue
1614 // now: [esp] = Z, esi = X, ecx = Y
1615 MulStartup
1616 MulAccumulate(0,0)
1617 MulStoreDigit(0)
1618 MulShiftCarry
1619
1620 MulAccumulate(1,0)
1621 MulAccumulate(0,1)
1622 MulStoreDigit(1)
1623 MulShiftCarry
1624
1625 MulAccumulate(2,0)
1626 MulAccumulate(1,1)
1627 MulAccumulate(0,2)
1628 MulStoreDigit(2)
1629 MulShiftCarry
1630
1631 MulAccumulate(3,0)
1632 MulAccumulate(2,1)
1633 MulAccumulate(1,2)
1634 MulAccumulate(0,3)
1635 MulStoreDigit(3)
1636 MulShiftCarry
1637
1638 MulAccumulate(4,0)
1639 MulAccumulate(3,1)
1640 MulAccumulate(2,2)
1641 MulAccumulate(1,3)
1642 MulAccumulate(0,4)
1643 MulStoreDigit(4)
1644 MulShiftCarry
1645
1646 MulAccumulate(5,0)
1647 MulAccumulate(4,1)
1648 MulAccumulate(3,2)
1649 MulAccumulate(2,3)
1650 MulAccumulate(1,4)
1651 MulAccumulate(0,5)
1652 MulStoreDigit(5)
1653 MulShiftCarry
1654
1655 MulAccumulate(6,0)
1656 MulAccumulate(5,1)
1657 MulAccumulate(4,2)
1658 MulAccumulate(3,3)
1659 MulAccumulate(2,4)
1660 MulAccumulate(1,5)
1661 MulAccumulate(0,6)
1662 MulStoreDigit(6)
1663 MulShiftCarry
1664
1665 MulAccumulateBottom(7,0)
1666 MulAccumulateBottom(6,1)
1667 MulAccumulateBottom(5,2)
1668 MulAccumulateBottom(4,3)
1669 MulAccumulateBottom(3,4)
1670 MulAccumulateBottom(2,5)
1671 MulAccumulateBottom(1,6)
1672 MulAccumulateBottom(0,7)
1673 MulStoreDigit(7)
1674 MulEpilogue
1675 }
1676
1677 #undef AS1
1678 #undef AS2
1679
1680 #else // not x86 - no processor specific code at this layer
1681
1682 typedef Portable LowLevel;
1683
1684 #endif
1685
1686 #ifdef SSE2_INTRINSICS_AVAILABLE
1687
1688 #ifdef __GNUC__
1689 #define TAOCRYPT_FASTCALL
1690 #else
1691 #define TAOCRYPT_FASTCALL __fastcall
1692 #endif
1693
P4_Mul(__m128i * C,const __m128i * A,const __m128i * B)1694 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A,
1695 const __m128i *B)
1696 {
1697 __m128i a3210 = _mm_load_si128(A);
1698 __m128i b3210 = _mm_load_si128(B);
1699
1700 __m128i sum;
1701
1702 __m128i z = _mm_setzero_si128();
1703 __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
1704 C[0] = a2b2_a0b0;
1705
1706 __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
1707 __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
1708 __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
1709 __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
1710 __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
1711 C[1] = _mm_add_epi64(a1b0, a0b1);
1712
1713 __m128i a31 = _mm_srli_epi64(a3210, 32);
1714 __m128i b31 = _mm_srli_epi64(b3210, 32);
1715 __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
1716 C[6] = a3b3_a1b1;
1717
1718 __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
1719 __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
1720 __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
1721 __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
1722 __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
1723 sum = _mm_add_epi64(a1b1, a0b2);
1724 C[2] = _mm_add_epi64(sum, a2b0);
1725
1726 __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
1727 __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
1728 __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
1729 __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
1730 __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
1731 __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
1732 __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
1733 __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
1734 __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
1735 sum = _mm_add_epi64(a2b1, a0b3);
1736 C[3] = _mm_add_epi64(sum, sum1);
1737
1738 __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
1739 __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
1740 __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
1741 __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
1742 sum = _mm_add_epi64(a2b2, a3b1);
1743 C[4] = _mm_add_epi64(sum, a1b3);
1744
1745 __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
1746 __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
1747 __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
1748 __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
1749 __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
1750 C[5] = _mm_add_epi64(a3b2, a2b3);
1751 }
1752
Multiply4(word * C,const word * A,const word * B)1753 void P4Optimized::Multiply4(word *C, const word *A, const word *B)
1754 {
1755 __m128i temp[7];
1756 const word *w = (word *)temp;
1757 const __m64 *mw = (__m64 *)w;
1758
1759 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1760
1761 C[0] = w[0];
1762
1763 __m64 s1, s2;
1764
1765 __m64 w1 = _mm_cvtsi32_si64(w[1]);
1766 __m64 w4 = mw[2];
1767 __m64 w6 = mw[3];
1768 __m64 w8 = mw[4];
1769 __m64 w10 = mw[5];
1770 __m64 w12 = mw[6];
1771 __m64 w14 = mw[7];
1772 __m64 w16 = mw[8];
1773 __m64 w18 = mw[9];
1774 __m64 w20 = mw[10];
1775 __m64 w22 = mw[11];
1776 __m64 w26 = _mm_cvtsi32_si64(w[26]);
1777
1778 s1 = _mm_add_si64(w1, w4);
1779 C[1] = _mm_cvtsi64_si32(s1);
1780 s1 = _mm_srli_si64(s1, 32);
1781
1782 s2 = _mm_add_si64(w6, w8);
1783 s1 = _mm_add_si64(s1, s2);
1784 C[2] = _mm_cvtsi64_si32(s1);
1785 s1 = _mm_srli_si64(s1, 32);
1786
1787 s2 = _mm_add_si64(w10, w12);
1788 s1 = _mm_add_si64(s1, s2);
1789 C[3] = _mm_cvtsi64_si32(s1);
1790 s1 = _mm_srli_si64(s1, 32);
1791
1792 s2 = _mm_add_si64(w14, w16);
1793 s1 = _mm_add_si64(s1, s2);
1794 C[4] = _mm_cvtsi64_si32(s1);
1795 s1 = _mm_srli_si64(s1, 32);
1796
1797 s2 = _mm_add_si64(w18, w20);
1798 s1 = _mm_add_si64(s1, s2);
1799 C[5] = _mm_cvtsi64_si32(s1);
1800 s1 = _mm_srli_si64(s1, 32);
1801
1802 s2 = _mm_add_si64(w22, w26);
1803 s1 = _mm_add_si64(s1, s2);
1804 C[6] = _mm_cvtsi64_si32(s1);
1805 s1 = _mm_srli_si64(s1, 32);
1806
1807 C[7] = _mm_cvtsi64_si32(s1) + w[27];
1808 _mm_empty();
1809 }
1810
Multiply8(word * C,const word * A,const word * B)1811 void P4Optimized::Multiply8(word *C, const word *A, const word *B)
1812 {
1813 __m128i temp[28];
1814 const word *w = (word *)temp;
1815 const __m64 *mw = (__m64 *)w;
1816 const word *x = (word *)temp+7*4;
1817 const __m64 *mx = (__m64 *)x;
1818 const word *y = (word *)temp+7*4*2;
1819 const __m64 *my = (__m64 *)y;
1820 const word *z = (word *)temp+7*4*3;
1821 const __m64 *mz = (__m64 *)z;
1822
1823 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
1824
1825 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
1826
1827 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
1828
1829 P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
1830
1831 C[0] = w[0];
1832
1833 __m64 s1, s2, s3, s4;
1834
1835 __m64 w1 = _mm_cvtsi32_si64(w[1]);
1836 __m64 w4 = mw[2];
1837 __m64 w6 = mw[3];
1838 __m64 w8 = mw[4];
1839 __m64 w10 = mw[5];
1840 __m64 w12 = mw[6];
1841 __m64 w14 = mw[7];
1842 __m64 w16 = mw[8];
1843 __m64 w18 = mw[9];
1844 __m64 w20 = mw[10];
1845 __m64 w22 = mw[11];
1846 __m64 w26 = _mm_cvtsi32_si64(w[26]);
1847 __m64 w27 = _mm_cvtsi32_si64(w[27]);
1848
1849 __m64 x0 = _mm_cvtsi32_si64(x[0]);
1850 __m64 x1 = _mm_cvtsi32_si64(x[1]);
1851 __m64 x4 = mx[2];
1852 __m64 x6 = mx[3];
1853 __m64 x8 = mx[4];
1854 __m64 x10 = mx[5];
1855 __m64 x12 = mx[6];
1856 __m64 x14 = mx[7];
1857 __m64 x16 = mx[8];
1858 __m64 x18 = mx[9];
1859 __m64 x20 = mx[10];
1860 __m64 x22 = mx[11];
1861 __m64 x26 = _mm_cvtsi32_si64(x[26]);
1862 __m64 x27 = _mm_cvtsi32_si64(x[27]);
1863
1864 __m64 y0 = _mm_cvtsi32_si64(y[0]);
1865 __m64 y1 = _mm_cvtsi32_si64(y[1]);
1866 __m64 y4 = my[2];
1867 __m64 y6 = my[3];
1868 __m64 y8 = my[4];
1869 __m64 y10 = my[5];
1870 __m64 y12 = my[6];
1871 __m64 y14 = my[7];
1872 __m64 y16 = my[8];
1873 __m64 y18 = my[9];
1874 __m64 y20 = my[10];
1875 __m64 y22 = my[11];
1876 __m64 y26 = _mm_cvtsi32_si64(y[26]);
1877 __m64 y27 = _mm_cvtsi32_si64(y[27]);
1878
1879 __m64 z0 = _mm_cvtsi32_si64(z[0]);
1880 __m64 z1 = _mm_cvtsi32_si64(z[1]);
1881 __m64 z4 = mz[2];
1882 __m64 z6 = mz[3];
1883 __m64 z8 = mz[4];
1884 __m64 z10 = mz[5];
1885 __m64 z12 = mz[6];
1886 __m64 z14 = mz[7];
1887 __m64 z16 = mz[8];
1888 __m64 z18 = mz[9];
1889 __m64 z20 = mz[10];
1890 __m64 z22 = mz[11];
1891 __m64 z26 = _mm_cvtsi32_si64(z[26]);
1892
1893 s1 = _mm_add_si64(w1, w4);
1894 C[1] = _mm_cvtsi64_si32(s1);
1895 s1 = _mm_srli_si64(s1, 32);
1896
1897 s2 = _mm_add_si64(w6, w8);
1898 s1 = _mm_add_si64(s1, s2);
1899 C[2] = _mm_cvtsi64_si32(s1);
1900 s1 = _mm_srli_si64(s1, 32);
1901
1902 s2 = _mm_add_si64(w10, w12);
1903 s1 = _mm_add_si64(s1, s2);
1904 C[3] = _mm_cvtsi64_si32(s1);
1905 s1 = _mm_srli_si64(s1, 32);
1906
1907 s3 = _mm_add_si64(x0, y0);
1908 s2 = _mm_add_si64(w14, w16);
1909 s1 = _mm_add_si64(s1, s3);
1910 s1 = _mm_add_si64(s1, s2);
1911 C[4] = _mm_cvtsi64_si32(s1);
1912 s1 = _mm_srli_si64(s1, 32);
1913
1914 s3 = _mm_add_si64(x1, y1);
1915 s4 = _mm_add_si64(x4, y4);
1916 s1 = _mm_add_si64(s1, w18);
1917 s3 = _mm_add_si64(s3, s4);
1918 s1 = _mm_add_si64(s1, w20);
1919 s1 = _mm_add_si64(s1, s3);
1920 C[5] = _mm_cvtsi64_si32(s1);
1921 s1 = _mm_srli_si64(s1, 32);
1922
1923 s3 = _mm_add_si64(x6, y6);
1924 s4 = _mm_add_si64(x8, y8);
1925 s1 = _mm_add_si64(s1, w22);
1926 s3 = _mm_add_si64(s3, s4);
1927 s1 = _mm_add_si64(s1, w26);
1928 s1 = _mm_add_si64(s1, s3);
1929 C[6] = _mm_cvtsi64_si32(s1);
1930 s1 = _mm_srli_si64(s1, 32);
1931
1932 s3 = _mm_add_si64(x10, y10);
1933 s4 = _mm_add_si64(x12, y12);
1934 s1 = _mm_add_si64(s1, w27);
1935 s3 = _mm_add_si64(s3, s4);
1936 s1 = _mm_add_si64(s1, s3);
1937 C[7] = _mm_cvtsi64_si32(s1);
1938 s1 = _mm_srli_si64(s1, 32);
1939
1940 s3 = _mm_add_si64(x14, y14);
1941 s4 = _mm_add_si64(x16, y16);
1942 s1 = _mm_add_si64(s1, z0);
1943 s3 = _mm_add_si64(s3, s4);
1944 s1 = _mm_add_si64(s1, s3);
1945 C[8] = _mm_cvtsi64_si32(s1);
1946 s1 = _mm_srli_si64(s1, 32);
1947
1948 s3 = _mm_add_si64(x18, y18);
1949 s4 = _mm_add_si64(x20, y20);
1950 s1 = _mm_add_si64(s1, z1);
1951 s3 = _mm_add_si64(s3, s4);
1952 s1 = _mm_add_si64(s1, z4);
1953 s1 = _mm_add_si64(s1, s3);
1954 C[9] = _mm_cvtsi64_si32(s1);
1955 s1 = _mm_srli_si64(s1, 32);
1956
1957 s3 = _mm_add_si64(x22, y22);
1958 s4 = _mm_add_si64(x26, y26);
1959 s1 = _mm_add_si64(s1, z6);
1960 s3 = _mm_add_si64(s3, s4);
1961 s1 = _mm_add_si64(s1, z8);
1962 s1 = _mm_add_si64(s1, s3);
1963 C[10] = _mm_cvtsi64_si32(s1);
1964 s1 = _mm_srli_si64(s1, 32);
1965
1966 s3 = _mm_add_si64(x27, y27);
1967 s1 = _mm_add_si64(s1, z10);
1968 s1 = _mm_add_si64(s1, z12);
1969 s1 = _mm_add_si64(s1, s3);
1970 C[11] = _mm_cvtsi64_si32(s1);
1971 s1 = _mm_srli_si64(s1, 32);
1972
1973 s3 = _mm_add_si64(z14, z16);
1974 s1 = _mm_add_si64(s1, s3);
1975 C[12] = _mm_cvtsi64_si32(s1);
1976 s1 = _mm_srli_si64(s1, 32);
1977
1978 s3 = _mm_add_si64(z18, z20);
1979 s1 = _mm_add_si64(s1, s3);
1980 C[13] = _mm_cvtsi64_si32(s1);
1981 s1 = _mm_srli_si64(s1, 32);
1982
1983 s3 = _mm_add_si64(z22, z26);
1984 s1 = _mm_add_si64(s1, s3);
1985 C[14] = _mm_cvtsi64_si32(s1);
1986 s1 = _mm_srli_si64(s1, 32);
1987
1988 C[15] = z[27] + _mm_cvtsi64_si32(s1);
1989 _mm_empty();
1990 }
1991
Multiply8Bottom(word * C,const word * A,const word * B)1992 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
1993 {
1994 __m128i temp[21];
1995 const word *w = (word *)temp;
1996 const __m64 *mw = (__m64 *)w;
1997 const word *x = (word *)temp+7*4;
1998 const __m64 *mx = (__m64 *)x;
1999 const word *y = (word *)temp+7*4*2;
2000 const __m64 *my = (__m64 *)y;
2001
2002 P4_Mul(temp, (__m128i *)A, (__m128i *)B);
2003
2004 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
2005
2006 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
2007
2008 C[0] = w[0];
2009
2010 __m64 s1, s2, s3, s4;
2011
2012 __m64 w1 = _mm_cvtsi32_si64(w[1]);
2013 __m64 w4 = mw[2];
2014 __m64 w6 = mw[3];
2015 __m64 w8 = mw[4];
2016 __m64 w10 = mw[5];
2017 __m64 w12 = mw[6];
2018 __m64 w14 = mw[7];
2019 __m64 w16 = mw[8];
2020 __m64 w18 = mw[9];
2021 __m64 w20 = mw[10];
2022 __m64 w22 = mw[11];
2023 __m64 w26 = _mm_cvtsi32_si64(w[26]);
2024
2025 __m64 x0 = _mm_cvtsi32_si64(x[0]);
2026 __m64 x1 = _mm_cvtsi32_si64(x[1]);
2027 __m64 x4 = mx[2];
2028 __m64 x6 = mx[3];
2029 __m64 x8 = mx[4];
2030
2031 __m64 y0 = _mm_cvtsi32_si64(y[0]);
2032 __m64 y1 = _mm_cvtsi32_si64(y[1]);
2033 __m64 y4 = my[2];
2034 __m64 y6 = my[3];
2035 __m64 y8 = my[4];
2036
2037 s1 = _mm_add_si64(w1, w4);
2038 C[1] = _mm_cvtsi64_si32(s1);
2039 s1 = _mm_srli_si64(s1, 32);
2040
2041 s2 = _mm_add_si64(w6, w8);
2042 s1 = _mm_add_si64(s1, s2);
2043 C[2] = _mm_cvtsi64_si32(s1);
2044 s1 = _mm_srli_si64(s1, 32);
2045
2046 s2 = _mm_add_si64(w10, w12);
2047 s1 = _mm_add_si64(s1, s2);
2048 C[3] = _mm_cvtsi64_si32(s1);
2049 s1 = _mm_srli_si64(s1, 32);
2050
2051 s3 = _mm_add_si64(x0, y0);
2052 s2 = _mm_add_si64(w14, w16);
2053 s1 = _mm_add_si64(s1, s3);
2054 s1 = _mm_add_si64(s1, s2);
2055 C[4] = _mm_cvtsi64_si32(s1);
2056 s1 = _mm_srli_si64(s1, 32);
2057
2058 s3 = _mm_add_si64(x1, y1);
2059 s4 = _mm_add_si64(x4, y4);
2060 s1 = _mm_add_si64(s1, w18);
2061 s3 = _mm_add_si64(s3, s4);
2062 s1 = _mm_add_si64(s1, w20);
2063 s1 = _mm_add_si64(s1, s3);
2064 C[5] = _mm_cvtsi64_si32(s1);
2065 s1 = _mm_srli_si64(s1, 32);
2066
2067 s3 = _mm_add_si64(x6, y6);
2068 s4 = _mm_add_si64(x8, y8);
2069 s1 = _mm_add_si64(s1, w22);
2070 s3 = _mm_add_si64(s3, s4);
2071 s1 = _mm_add_si64(s1, w26);
2072 s1 = _mm_add_si64(s1, s3);
2073 C[6] = _mm_cvtsi64_si32(s1);
2074 s1 = _mm_srli_si64(s1, 32);
2075
2076 C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
2077 _mm_empty();
2078 }
2079
2080 #endif // #ifdef SSE2_INTRINSICS_AVAILABLE
2081
2082 // end optimized
2083
2084 // ********************************************************
2085
2086 #define A0 A
2087 #define A1 (A+N2)
2088 #define B0 B
2089 #define B1 (B+N2)
2090
2091 #define T0 T
2092 #define T1 (T+N2)
2093 #define T2 (T+N)
2094 #define T3 (T+N+N2)
2095
2096 #define R0 R
2097 #define R1 (R+N2)
2098 #define R2 (R+N)
2099 #define R3 (R+N+N2)
2100
2101 //VC60 workaround: compiler bug triggered without the extra dummy parameters
2102
2103 // R[2*N] - result = A*B
2104 // T[2*N] - temporary work space
2105 // A[N] --- multiplier
2106 // B[N] --- multiplicant
2107
2108
RecursiveMultiply(word * R,word * T,const word * A,const word * B,unsigned int N)2109 void RecursiveMultiply(word *R, word *T, const word *A, const word *B,
2110 unsigned int N)
2111 {
2112 if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
2113 LowLevel::Multiply8(R, A, B);
2114 else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
2115 LowLevel::Multiply4(R, A, B);
2116 else if (N==2)
2117 LowLevel::Multiply2(R, A, B);
2118 else
2119 {
2120 const unsigned int N2 = N/2;
2121 int carry;
2122
2123 int aComp = Compare(A0, A1, N2);
2124 int bComp = Compare(B0, B1, N2);
2125
2126 switch (2*aComp + aComp + bComp)
2127 {
2128 case -4:
2129 LowLevel::Subtract(R0, A1, A0, N2);
2130 LowLevel::Subtract(R1, B0, B1, N2);
2131 RecursiveMultiply(T0, T2, R0, R1, N2);
2132 LowLevel::Subtract(T1, T1, R0, N2);
2133 carry = -1;
2134 break;
2135 case -2:
2136 LowLevel::Subtract(R0, A1, A0, N2);
2137 LowLevel::Subtract(R1, B0, B1, N2);
2138 RecursiveMultiply(T0, T2, R0, R1, N2);
2139 carry = 0;
2140 break;
2141 case 2:
2142 LowLevel::Subtract(R0, A0, A1, N2);
2143 LowLevel::Subtract(R1, B1, B0, N2);
2144 RecursiveMultiply(T0, T2, R0, R1, N2);
2145 carry = 0;
2146 break;
2147 case 4:
2148 LowLevel::Subtract(R0, A1, A0, N2);
2149 LowLevel::Subtract(R1, B0, B1, N2);
2150 RecursiveMultiply(T0, T2, R0, R1, N2);
2151 LowLevel::Subtract(T1, T1, R1, N2);
2152 carry = -1;
2153 break;
2154 default:
2155 SetWords(T0, 0, N);
2156 carry = 0;
2157 }
2158
2159 RecursiveMultiply(R0, T2, A0, B0, N2);
2160 RecursiveMultiply(R2, T2, A1, B1, N2);
2161
2162 // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1
2163
2164 carry += LowLevel::Add(T0, T0, R0, N);
2165 carry += LowLevel::Add(T0, T0, R2, N);
2166 carry += LowLevel::Add(R1, R1, T0, N);
2167
2168 Increment(R3, N2, carry);
2169 }
2170 }
2171
2172
RecursiveSquare(word * R,word * T,const word * A,unsigned int N)2173 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
2174 {
2175 if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
2176 LowLevel::Square4(R, A);
2177 else if (N==2)
2178 LowLevel::Square2(R, A);
2179 else
2180 {
2181 const unsigned int N2 = N/2;
2182
2183 RecursiveSquare(R0, T2, A0, N2);
2184 RecursiveSquare(R2, T2, A1, N2);
2185 RecursiveMultiply(T0, T2, A0, A1, N2);
2186
2187 word carry = LowLevel::Add(R1, R1, T0, N);
2188 carry += LowLevel::Add(R1, R1, T0, N);
2189 Increment(R3, N2, carry);
2190 }
2191 }
2192
2193
2194 // R[N] - bottom half of A*B
2195 // T[N] - temporary work space
2196 // A[N] - multiplier
2197 // B[N] - multiplicant
2198
2199
RecursiveMultiplyBottom(word * R,word * T,const word * A,const word * B,unsigned int N)2200 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B,
2201 unsigned int N)
2202 {
2203 if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
2204 LowLevel::Multiply8Bottom(R, A, B);
2205 else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
2206 LowLevel::Multiply4Bottom(R, A, B);
2207 else if (N==2)
2208 LowLevel::Multiply2Bottom(R, A, B);
2209 else
2210 {
2211 const unsigned int N2 = N/2;
2212
2213 RecursiveMultiply(R, T, A0, B0, N2);
2214 RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2215 LowLevel::Add(R1, R1, T0, N2);
2216 RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2217 LowLevel::Add(R1, R1, T0, N2);
2218 }
2219 }
2220
2221
RecursiveMultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,unsigned int N)2222 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A,
2223 const word *B, unsigned int N)
2224 {
2225 if (N==4)
2226 {
2227 LowLevel::Multiply4(T, A, B);
2228 memcpy(R, T+4, 4*WORD_SIZE);
2229 }
2230 else if (N==2)
2231 {
2232 LowLevel::Multiply2(T, A, B);
2233 memcpy(R, T+2, 2*WORD_SIZE);
2234 }
2235 else
2236 {
2237 const unsigned int N2 = N/2;
2238 int carry;
2239
2240 int aComp = Compare(A0, A1, N2);
2241 int bComp = Compare(B0, B1, N2);
2242
2243 switch (2*aComp + aComp + bComp)
2244 {
2245 case -4:
2246 LowLevel::Subtract(R0, A1, A0, N2);
2247 LowLevel::Subtract(R1, B0, B1, N2);
2248 RecursiveMultiply(T0, T2, R0, R1, N2);
2249 LowLevel::Subtract(T1, T1, R0, N2);
2250 carry = -1;
2251 break;
2252 case -2:
2253 LowLevel::Subtract(R0, A1, A0, N2);
2254 LowLevel::Subtract(R1, B0, B1, N2);
2255 RecursiveMultiply(T0, T2, R0, R1, N2);
2256 carry = 0;
2257 break;
2258 case 2:
2259 LowLevel::Subtract(R0, A0, A1, N2);
2260 LowLevel::Subtract(R1, B1, B0, N2);
2261 RecursiveMultiply(T0, T2, R0, R1, N2);
2262 carry = 0;
2263 break;
2264 case 4:
2265 LowLevel::Subtract(R0, A1, A0, N2);
2266 LowLevel::Subtract(R1, B0, B1, N2);
2267 RecursiveMultiply(T0, T2, R0, R1, N2);
2268 LowLevel::Subtract(T1, T1, R1, N2);
2269 carry = -1;
2270 break;
2271 default:
2272 SetWords(T0, 0, N);
2273 carry = 0;
2274 }
2275
2276 RecursiveMultiply(T2, R0, A1, B1, N2);
2277
2278 // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
2279
2280 word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
2281 c2 += LowLevel::Subtract(R0, R0, T0, N2);
2282 word t = (Compare(R0, T2, N2) == -1);
2283
2284 carry += t;
2285 carry += Increment(R0, N2, c2+t);
2286 carry += LowLevel::Add(R0, R0, T1, N2);
2287 carry += LowLevel::Add(R0, R0, T3, N2);
2288
2289 CopyWords(R1, T3, N2);
2290 Increment(R1, N2, carry);
2291 }
2292 }
2293
2294
Add(word * C,const word * A,const word * B,unsigned int N)2295 inline word Add(word *C, const word *A, const word *B, unsigned int N)
2296 {
2297 return LowLevel::Add(C, A, B, N);
2298 }
2299
Subtract(word * C,const word * A,const word * B,unsigned int N)2300 inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
2301 {
2302 return LowLevel::Subtract(C, A, B, N);
2303 }
2304
Multiply(word * R,word * T,const word * A,const word * B,unsigned int N)2305 inline void Multiply(word *R, word *T, const word *A, const word *B,
2306 unsigned int N)
2307 {
2308 RecursiveMultiply(R, T, A, B, N);
2309 }
2310
Square(word * R,word * T,const word * A,unsigned int N)2311 inline void Square(word *R, word *T, const word *A, unsigned int N)
2312 {
2313 RecursiveSquare(R, T, A, N);
2314 }
2315
2316
AsymmetricMultiply(word * R,word * T,const word * A,unsigned int NA,const word * B,unsigned int NB)2317 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA,
2318 const word *B, unsigned int NB)
2319 {
2320 if (NA == NB)
2321 {
2322 if (A == B)
2323 Square(R, T, A, NA);
2324 else
2325 Multiply(R, T, A, B, NA);
2326
2327 return;
2328 }
2329
2330 if (NA > NB)
2331 {
2332 STL::swap(A, B);
2333 STL::swap(NA, NB);
2334 }
2335
2336 if (NA==2 && !A[1])
2337 {
2338 switch (A[0])
2339 {
2340 case 0:
2341 SetWords(R, 0, NB+2);
2342 return;
2343 case 1:
2344 CopyWords(R, B, NB);
2345 R[NB] = R[NB+1] = 0;
2346 return;
2347 default:
2348 R[NB] = LinearMultiply(R, B, A[0], NB);
2349 R[NB+1] = 0;
2350 return;
2351 }
2352 }
2353
2354 Multiply(R, T, A, B, NA);
2355 CopyWords(T+2*NA, R+NA, NA);
2356
2357 unsigned i;
2358
2359 for (i=2*NA; i<NB; i+=2*NA)
2360 Multiply(T+NA+i, T, A, B+i, NA);
2361 for (i=NA; i<NB; i+=2*NA)
2362 Multiply(R+i, T, A, B+i, NA);
2363
2364 if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2365 Increment(R+NB, NA);
2366 }
2367
2368
PositiveMultiply(Integer & product,const Integer & a,const Integer & b)2369 void PositiveMultiply(Integer& product, const Integer& a, const Integer& b)
2370 {
2371 unsigned int aSize = RoundupSize(a.WordCount());
2372 unsigned int bSize = RoundupSize(b.WordCount());
2373
2374 product.reg_.CleanNew(RoundupSize(aSize + bSize));
2375 product.sign_ = Integer::POSITIVE;
2376
2377 AlignedWordBlock workspace(aSize + bSize);
2378 AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
2379 a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
2380 }
2381
Multiply(Integer & product,const Integer & a,const Integer & b)2382 void Multiply(Integer &product, const Integer &a, const Integer &b)
2383 {
2384 PositiveMultiply(product, a, b);
2385
2386 if (a.NotNegative() != b.NotNegative())
2387 product.Negate();
2388 }
2389
2390
EvenWordCount(const word * X,unsigned int N)2391 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
2392 {
2393 while (N && X[N-2]==0 && X[N-1]==0)
2394 N-=2;
2395 return N;
2396 }
2397
2398
AlmostInverse(word * R,word * T,const word * A,unsigned int NA,const word * M,unsigned int N)2399 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA,
2400 const word *M, unsigned int N)
2401 {
2402 word *b = T;
2403 word *c = T+N;
2404 word *f = T+2*N;
2405 word *g = T+3*N;
2406 unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
2407 unsigned int k=0, s=0;
2408
2409 SetWords(T, 0, 3*N);
2410 b[0]=1;
2411 CopyWords(f, A, NA);
2412 CopyWords(g, M, N);
2413
2414 while (1)
2415 {
2416 word t=f[0];
2417 while (!t)
2418 {
2419 if (EvenWordCount(f, fgLen)==0)
2420 {
2421 SetWords(R, 0, N);
2422 return 0;
2423 }
2424
2425 ShiftWordsRightByWords(f, fgLen, 1);
2426 if (c[bcLen-1]) bcLen+=2;
2427 ShiftWordsLeftByWords(c, bcLen, 1);
2428 k+=WORD_BITS;
2429 t=f[0];
2430 }
2431
2432 unsigned int i=0;
2433 while (t%2 == 0)
2434 {
2435 t>>=1;
2436 i++;
2437 }
2438 k+=i;
2439
2440 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2441 {
2442 if (s%2==0)
2443 CopyWords(R, b, N);
2444 else
2445 Subtract(R, M, b, N);
2446 return k;
2447 }
2448
2449 ShiftWordsRightByBits(f, fgLen, i);
2450 t=ShiftWordsLeftByBits(c, bcLen, i);
2451 if (t)
2452 {
2453 c[bcLen] = t;
2454 bcLen+=2;
2455 }
2456
2457 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2458 fgLen-=2;
2459
2460 if (Compare(f, g, fgLen)==-1)
2461 {
2462 STL::swap(f, g);
2463 STL::swap(b, c);
2464 s++;
2465 }
2466
2467 Subtract(f, f, g, fgLen);
2468
2469 if (Add(b, b, c, bcLen))
2470 {
2471 b[bcLen] = 1;
2472 bcLen+=2;
2473 }
2474 }
2475 }
2476
2477 // R[N] - result = A/(2^k) mod M
2478 // A[N] - input
2479 // M[N] - modulus
2480
DivideByPower2Mod(word * R,const word * A,unsigned int k,const word * M,unsigned int N)2481 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2482 unsigned int N)
2483 {
2484 CopyWords(R, A, N);
2485
2486 while (k--)
2487 {
2488 if (R[0]%2==0)
2489 ShiftWordsRightByBits(R, N, 1);
2490 else
2491 {
2492 word carry = Add(R, R, M, N);
2493 ShiftWordsRightByBits(R, N, 1);
2494 R[N-1] += carry<<(WORD_BITS-1);
2495 }
2496 }
2497 }
2498
2499 // R[N] - result = A*(2^k) mod M
2500 // A[N] - input
2501 // M[N] - modulus
2502
MultiplyByPower2Mod(word * R,const word * A,unsigned int k,const word * M,unsigned int N)2503 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
2504 unsigned int N)
2505 {
2506 CopyWords(R, A, N);
2507
2508 while (k--)
2509 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2510 Subtract(R, R, M, N);
2511 }
2512
2513
2514 // ********** end of integer needs
2515
2516
Integer()2517 Integer::Integer()
2518 : reg_(2), sign_(POSITIVE)
2519 {
2520 reg_[0] = reg_[1] = 0;
2521 }
2522
2523
Integer(const Integer & t)2524 Integer::Integer(const Integer& t)
2525 : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
2526 {
2527 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2528 }
2529
2530
Integer(signed long value)2531 Integer::Integer(signed long value)
2532 : reg_(2)
2533 {
2534 if (value >= 0)
2535 sign_ = POSITIVE;
2536 else
2537 {
2538 sign_ = NEGATIVE;
2539 value = -value;
2540 }
2541 reg_[0] = word(value);
2542 reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
2543 }
2544
2545
Integer(Sign s,word high,word low)2546 Integer::Integer(Sign s, word high, word low)
2547 : reg_(2), sign_(s)
2548 {
2549 reg_[0] = low;
2550 reg_[1] = high;
2551 }
2552
2553
Integer(word value,unsigned int length)2554 Integer::Integer(word value, unsigned int length)
2555 : reg_(RoundupSize(length)), sign_(POSITIVE)
2556 {
2557 reg_[0] = value;
2558 SetWords(reg_ + 1, 0, reg_.size() - 1);
2559 }
2560
2561
Integer(const byte * encodedInteger,unsigned int byteCount,Signedness s)2562 Integer::Integer(const byte *encodedInteger, unsigned int byteCount,
2563 Signedness s)
2564 {
2565 Decode(encodedInteger, byteCount, s);
2566 }
2567
2568 class BadBER {};
2569
2570 // BER Decode Source
Integer(Source & source)2571 Integer::Integer(Source& source)
2572 : reg_(2), sign_(POSITIVE)
2573 {
2574 Decode(source);
2575 }
2576
Decode(Source & source)2577 void Integer::Decode(Source& source)
2578 {
2579 byte b = source.next();
2580 if (b != INTEGER) {
2581 source.SetError(INTEGER_E);
2582 return;
2583 }
2584
2585 word32 length = GetLength(source);
2586 if (length == 0 || source.GetError().What()) return;
2587
2588 if ( (b = source.next()) == 0x00)
2589 length--;
2590 else
2591 source.prev();
2592
2593 if (source.IsLeft(length) == false) return;
2594
2595 unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
2596 words = RoundupSize(words);
2597 if (words > reg_.size()) reg_.CleanNew(words);
2598
2599 for (int j = length; j > 0; j--) {
2600 b = source.next();
2601 reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8;
2602 }
2603 }
2604
2605
Decode(const byte * input,unsigned int inputLen,Signedness s)2606 void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s)
2607 {
2608 unsigned int idx(0);
2609 byte b = 0;
2610 if (inputLen>0)
2611 b = input[idx]; // peek
2612 sign_ = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
2613
2614 while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
2615 {
2616 idx++; // skip
2617 if (--inputLen>0)
2618 b = input[idx]; // peek
2619 }
2620
2621 reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
2622
2623 for (unsigned int i=inputLen; i > 0; i--)
2624 {
2625 b = input[idx++];
2626 reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8;
2627 }
2628
2629 if (sign_ == NEGATIVE)
2630 {
2631 for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
2632 reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8;
2633 TwosComplement(reg_.get_buffer(), reg_.size());
2634 }
2635 }
2636
2637
Encode(byte * output,unsigned int outputLen,Signedness signedness) const2638 unsigned int Integer::Encode(byte* output, unsigned int outputLen,
2639 Signedness signedness) const
2640 {
2641 unsigned int idx(0);
2642 if (signedness == UNSIGNED || NotNegative())
2643 {
2644 for (unsigned int i=outputLen; i > 0; i--)
2645 output[idx++] = GetByte(i-1);
2646 }
2647 else
2648 {
2649 // take two's complement of *this
2650 Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this;
2651 for (unsigned i=0; i<outputLen; i++)
2652 output[idx++] = temp.GetByte(outputLen-i-1);
2653 }
2654 return outputLen;
2655 }
2656
2657
2658 static Integer* zero = 0;
2659
Zero()2660 const Integer &Integer::Zero()
2661 {
2662 if (!zero)
2663 zero = NEW_TC Integer;
2664 return *zero;
2665 }
2666
2667
2668 static Integer* one = 0;
2669
One()2670 const Integer &Integer::One()
2671 {
2672 if (!one)
2673 one = NEW_TC Integer(1,2);
2674 return *one;
2675 }
2676
2677
2678 // Clean up static singleton holders, not a leak, but helpful to have gone
2679 // when checking for leaks
CleanUp()2680 void CleanUp()
2681 {
2682 tcDelete(one);
2683 tcDelete(zero);
2684
2685 // In case user calls more than once, prevent seg fault
2686 one = 0;
2687 zero = 0;
2688 }
2689
Integer(RandomNumberGenerator & rng,const Integer & min,const Integer & max)2690 Integer::Integer(RandomNumberGenerator& rng, const Integer& min,
2691 const Integer& max)
2692 {
2693 Randomize(rng, min, max);
2694 }
2695
2696
Randomize(RandomNumberGenerator & rng,unsigned int nbits)2697 void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits)
2698 {
2699 const unsigned int nbytes = nbits/8 + 1;
2700 ByteBlock buf(nbytes);
2701 rng.GenerateBlock(buf.get_buffer(), nbytes);
2702 if (nbytes)
2703 buf[0] = (byte)Crop(buf[0], nbits % 8);
2704 Decode(buf.get_buffer(), nbytes, UNSIGNED);
2705 }
2706
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max)2707 void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min,
2708 const Integer& max)
2709 {
2710 Integer range = max - min;
2711 const unsigned int nbits = range.BitCount();
2712
2713 do
2714 {
2715 Randomize(rng, nbits);
2716 }
2717 while (*this > range);
2718
2719 *this += min;
2720 }
2721
2722
Power2(unsigned int e)2723 Integer Integer::Power2(unsigned int e)
2724 {
2725 Integer r((word)0, BitsToWords(e + 1));
2726 r.SetBit(e);
2727 return r;
2728 }
2729
2730
SetBit(unsigned int n,bool value)2731 void Integer::SetBit(unsigned int n, bool value)
2732 {
2733 if (value)
2734 {
2735 reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
2736 reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS));
2737 }
2738 else
2739 {
2740 if (n / WORD_BITS < reg_.size())
2741 reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS));
2742 }
2743 }
2744
2745
SetByte(unsigned int n,byte value)2746 void Integer::SetByte(unsigned int n, byte value)
2747 {
2748 reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
2749 reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2750 reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2751 }
2752
2753
Negate()2754 void Integer::Negate()
2755 {
2756 if (!!(*this)) // don't flip sign if *this==0
2757 sign_ = Sign(1 - sign_);
2758 }
2759
2760
operator !() const2761 bool Integer::operator!() const
2762 {
2763 return IsNegative() ? false : (reg_[0]==0 && WordCount()==0);
2764 }
2765
2766
operator =(const Integer & t)2767 Integer& Integer::operator=(const Integer& t)
2768 {
2769 if (this != &t)
2770 {
2771 reg_.New(RoundupSize(t.WordCount()));
2772 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
2773 sign_ = t.sign_;
2774 }
2775 return *this;
2776 }
2777
2778
operator +=(const Integer & t)2779 Integer& Integer::operator+=(const Integer& t)
2780 {
2781 reg_.CleanGrow(t.reg_.size());
2782 if (NotNegative())
2783 {
2784 if (t.NotNegative())
2785 PositiveAdd(*this, *this, t);
2786 else
2787 PositiveSubtract(*this, *this, t);
2788 }
2789 else
2790 {
2791 if (t.NotNegative())
2792 PositiveSubtract(*this, t, *this);
2793 else
2794 {
2795 PositiveAdd(*this, *this, t);
2796 sign_ = Integer::NEGATIVE;
2797 }
2798 }
2799 return *this;
2800 }
2801
2802
operator -() const2803 Integer Integer::operator-() const
2804 {
2805 Integer result(*this);
2806 result.Negate();
2807 return result;
2808 }
2809
2810
operator -=(const Integer & t)2811 Integer& Integer::operator-=(const Integer& t)
2812 {
2813 reg_.CleanGrow(t.reg_.size());
2814 if (NotNegative())
2815 {
2816 if (t.NotNegative())
2817 PositiveSubtract(*this, *this, t);
2818 else
2819 PositiveAdd(*this, *this, t);
2820 }
2821 else
2822 {
2823 if (t.NotNegative())
2824 {
2825 PositiveAdd(*this, *this, t);
2826 sign_ = Integer::NEGATIVE;
2827 }
2828 else
2829 PositiveSubtract(*this, t, *this);
2830 }
2831 return *this;
2832 }
2833
2834
operator ++()2835 Integer& Integer::operator++()
2836 {
2837 if (NotNegative())
2838 {
2839 if (Increment(reg_.get_buffer(), reg_.size()))
2840 {
2841 reg_.CleanGrow(2*reg_.size());
2842 reg_[reg_.size()/2]=1;
2843 }
2844 }
2845 else
2846 {
2847 word borrow = Decrement(reg_.get_buffer(), reg_.size());
2848 (void)borrow; // shut up compiler
2849 if (WordCount()==0)
2850 *this = Zero();
2851 }
2852 return *this;
2853 }
2854
operator --()2855 Integer& Integer::operator--()
2856 {
2857 if (IsNegative())
2858 {
2859 if (Increment(reg_.get_buffer(), reg_.size()))
2860 {
2861 reg_.CleanGrow(2*reg_.size());
2862 reg_[reg_.size()/2]=1;
2863 }
2864 }
2865 else
2866 {
2867 if (Decrement(reg_.get_buffer(), reg_.size()))
2868 *this = -One();
2869 }
2870 return *this;
2871 }
2872
2873
operator <<=(unsigned int n)2874 Integer& Integer::operator<<=(unsigned int n)
2875 {
2876 const unsigned int wordCount = WordCount();
2877 const unsigned int shiftWords = n / WORD_BITS;
2878 const unsigned int shiftBits = n % WORD_BITS;
2879
2880 reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
2881 ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
2882 shiftWords);
2883 ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
2884 shiftBits);
2885 return *this;
2886 }
2887
operator >>=(unsigned int n)2888 Integer& Integer::operator>>=(unsigned int n)
2889 {
2890 const unsigned int wordCount = WordCount();
2891 const unsigned int shiftWords = n / WORD_BITS;
2892 const unsigned int shiftBits = n % WORD_BITS;
2893
2894 ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
2895 if (wordCount > shiftWords)
2896 ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
2897 shiftBits);
2898 if (IsNegative() && WordCount()==0) // avoid -0
2899 *this = Zero();
2900 return *this;
2901 }
2902
2903
PositiveAdd(Integer & sum,const Integer & a,const Integer & b)2904 void PositiveAdd(Integer& sum, const Integer& a, const Integer& b)
2905 {
2906 word carry;
2907 if (a.reg_.size() == b.reg_.size())
2908 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2909 b.reg_.get_buffer(), a.reg_.size());
2910 else if (a.reg_.size() > b.reg_.size())
2911 {
2912 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2913 b.reg_.get_buffer(), b.reg_.size());
2914 CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
2915 a.reg_.size()-b.reg_.size());
2916 carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
2917 carry);
2918 }
2919 else
2920 {
2921 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
2922 b.reg_.get_buffer(), a.reg_.size());
2923 CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
2924 b.reg_.size()-a.reg_.size());
2925 carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
2926 carry);
2927 }
2928
2929 if (carry)
2930 {
2931 sum.reg_.CleanGrow(2*sum.reg_.size());
2932 sum.reg_[sum.reg_.size()/2] = 1;
2933 }
2934 sum.sign_ = Integer::POSITIVE;
2935 }
2936
PositiveSubtract(Integer & diff,const Integer & a,const Integer & b)2937 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
2938 {
2939 unsigned aSize = a.WordCount();
2940 aSize += aSize%2;
2941 unsigned bSize = b.WordCount();
2942 bSize += bSize%2;
2943
2944 if (aSize == bSize)
2945 {
2946 if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
2947 {
2948 Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2949 b.reg_.get_buffer(), aSize);
2950 diff.sign_ = Integer::POSITIVE;
2951 }
2952 else
2953 {
2954 Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2955 a.reg_.get_buffer(), aSize);
2956 diff.sign_ = Integer::NEGATIVE;
2957 }
2958 }
2959 else if (aSize > bSize)
2960 {
2961 word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
2962 b.reg_.get_buffer(), bSize);
2963 CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
2964 borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
2965 diff.sign_ = Integer::POSITIVE;
2966 }
2967 else
2968 {
2969 word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
2970 a.reg_.get_buffer(), aSize);
2971 CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
2972 borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
2973 diff.sign_ = Integer::NEGATIVE;
2974 }
2975 }
2976
2977
MinEncodedSize(Signedness signedness) const2978 unsigned int Integer::MinEncodedSize(Signedness signedness) const
2979 {
2980 unsigned int outputLen = max(1U, ByteCount());
2981 if (signedness == UNSIGNED)
2982 return outputLen;
2983 if (NotNegative() && (GetByte(outputLen-1) & 0x80))
2984 outputLen++;
2985 if (IsNegative() && *this < -Power2(outputLen*8-1))
2986 outputLen++;
2987 return outputLen;
2988 }
2989
2990
Compare(const Integer & t) const2991 int Integer::Compare(const Integer& t) const
2992 {
2993 if (NotNegative())
2994 {
2995 if (t.NotNegative())
2996 return PositiveCompare(t);
2997 else
2998 return 1;
2999 }
3000 else
3001 {
3002 if (t.NotNegative())
3003 return -1;
3004 else
3005 return -PositiveCompare(t);
3006 }
3007 }
3008
3009
PositiveCompare(const Integer & t) const3010 int Integer::PositiveCompare(const Integer& t) const
3011 {
3012 unsigned size = WordCount(), tSize = t.WordCount();
3013
3014 if (size == tSize)
3015 return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size);
3016 else
3017 return size > tSize ? 1 : -1;
3018 }
3019
3020
GetBit(unsigned int n) const3021 bool Integer::GetBit(unsigned int n) const
3022 {
3023 if (n/WORD_BITS >= reg_.size())
3024 return 0;
3025 else
3026 return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3027 }
3028
3029
GetBits(unsigned int i,unsigned int n) const3030 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
3031 {
3032 unsigned long v = 0;
3033 for (unsigned int j=0; j<n; j++)
3034 v |= GetBit(i+j) << j;
3035 return v;
3036 }
3037
3038
GetByte(unsigned int n) const3039 byte Integer::GetByte(unsigned int n) const
3040 {
3041 if (n/WORD_SIZE >= reg_.size())
3042 return 0;
3043 else
3044 return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3045 }
3046
3047
BitCount() const3048 unsigned int Integer::BitCount() const
3049 {
3050 unsigned wordCount = WordCount();
3051 if (wordCount)
3052 return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
3053 else
3054 return 0;
3055 }
3056
3057
ByteCount() const3058 unsigned int Integer::ByteCount() const
3059 {
3060 unsigned wordCount = WordCount();
3061 if (wordCount)
3062 return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
3063 else
3064 return 0;
3065 }
3066
3067
WordCount() const3068 unsigned int Integer::WordCount() const
3069 {
3070 return CountWords(reg_.get_buffer(), reg_.size());
3071 }
3072
3073
IsConvertableToLong() const3074 bool Integer::IsConvertableToLong() const
3075 {
3076 if (ByteCount() > sizeof(long))
3077 return false;
3078
3079 unsigned long value = reg_[0];
3080 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3081
3082 if (sign_ == POSITIVE)
3083 return (signed long)value >= 0;
3084 else
3085 return -(signed long)value < 0;
3086 }
3087
3088
ConvertToLong() const3089 signed long Integer::ConvertToLong() const
3090 {
3091 unsigned long value = reg_[0];
3092 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
3093 return sign_ == POSITIVE ? value : -(signed long)value;
3094 }
3095
3096
Swap(Integer & a)3097 void Integer::Swap(Integer& a)
3098 {
3099 reg_.Swap(a.reg_);
3100 STL::swap(sign_, a.sign_);
3101 }
3102
3103
Plus(const Integer & b) const3104 Integer Integer::Plus(const Integer& b) const
3105 {
3106 Integer sum((word)0, max(reg_.size(), b.reg_.size()));
3107 if (NotNegative())
3108 {
3109 if (b.NotNegative())
3110 PositiveAdd(sum, *this, b);
3111 else
3112 PositiveSubtract(sum, *this, b);
3113 }
3114 else
3115 {
3116 if (b.NotNegative())
3117 PositiveSubtract(sum, b, *this);
3118 else
3119 {
3120 PositiveAdd(sum, *this, b);
3121 sum.sign_ = Integer::NEGATIVE;
3122 }
3123 }
3124 return sum;
3125 }
3126
3127
Minus(const Integer & b) const3128 Integer Integer::Minus(const Integer& b) const
3129 {
3130 Integer diff((word)0, max(reg_.size(), b.reg_.size()));
3131 if (NotNegative())
3132 {
3133 if (b.NotNegative())
3134 PositiveSubtract(diff, *this, b);
3135 else
3136 PositiveAdd(diff, *this, b);
3137 }
3138 else
3139 {
3140 if (b.NotNegative())
3141 {
3142 PositiveAdd(diff, *this, b);
3143 diff.sign_ = Integer::NEGATIVE;
3144 }
3145 else
3146 PositiveSubtract(diff, b, *this);
3147 }
3148 return diff;
3149 }
3150
3151
Times(const Integer & b) const3152 Integer Integer::Times(const Integer &b) const
3153 {
3154 Integer product;
3155 Multiply(product, *this, b);
3156 return product;
3157 }
3158
3159
3160 #undef A0
3161 #undef A1
3162 #undef B0
3163 #undef B1
3164
3165 #undef T0
3166 #undef T1
3167 #undef T2
3168 #undef T3
3169
3170 #undef R0
3171 #undef R1
3172 #undef R2
3173 #undef R3
3174
3175
AtomicDivide(word * Q,const word * A,const word * B)3176 static inline void AtomicDivide(word *Q, const word *A, const word *B)
3177 {
3178 word T[4];
3179 DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
3180 DWord(A[2], A[3]), DWord(B[0], B[1]));
3181 Q[0] = q.GetLowHalf();
3182 Q[1] = q.GetHighHalf();
3183
3184 #ifndef NDEBUG
3185 if (B[0] || B[1])
3186 {
3187 // multiply quotient and divisor and add remainder, make sure it
3188 // equals dividend
3189 word P[4];
3190 Portable::Multiply2(P, Q, B);
3191 Add(P, P, T, 4);
3192 }
3193 #endif
3194 }
3195
3196
3197 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
CorrectQuotientEstimate(word * R,word * T,word * Q,const word * B,unsigned int N)3198 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B,
3199 unsigned int N)
3200 {
3201 if (Q[1])
3202 {
3203 T[N] = T[N+1] = 0;
3204 unsigned i;
3205 for (i=0; i<N; i+=4)
3206 LowLevel::Multiply2(T+i, Q, B+i);
3207 for (i=2; i<N; i+=4)
3208 if (LowLevel::Multiply2Add(T+i, Q, B+i))
3209 T[i+5] += (++T[i+4]==0);
3210 }
3211 else
3212 {
3213 T[N] = LinearMultiply(T, B, Q[0], N);
3214 T[N+1] = 0;
3215 }
3216
3217 word borrow = Subtract(R, R, T, N+2);
3218 (void)borrow; // shut up compiler
3219
3220 while (R[N] || Compare(R, B, N) >= 0)
3221 {
3222 R[N] -= Subtract(R, R, B, N);
3223 Q[1] += (++Q[0]==0);
3224 }
3225 }
3226
3227 // R[NB] -------- remainder = A%B
3228 // Q[NA-NB+2] --- quotient = A/B
3229 // T[NA+2*NB+4] - temp work space
3230 // A[NA] -------- dividend
3231 // B[NB] -------- divisor
3232
3233
Divide(word * R,word * Q,word * T,const word * A,unsigned int NA,const word * B,unsigned int NB)3234 void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA,
3235 const word* B, unsigned int NB)
3236 {
3237 // set up temporary work space
3238 word *const TA=T;
3239 word *const TB=T+NA+2;
3240 word *const TP=T+NA+2+NB;
3241
3242 // copy B into TB and normalize it so that TB has highest bit set to 1
3243 unsigned shiftWords = (B[NB-1]==0);
3244 TB[0] = TB[NB-1] = 0;
3245 CopyWords(TB+shiftWords, B, NB-shiftWords);
3246 unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
3247 ShiftWordsLeftByBits(TB, NB, shiftBits);
3248
3249 // copy A into TA and normalize it
3250 TA[0] = TA[NA] = TA[NA+1] = 0;
3251 CopyWords(TA+shiftWords, A, NA);
3252 ShiftWordsLeftByBits(TA, NA+2, shiftBits);
3253
3254 if (TA[NA+1]==0 && TA[NA] <= 1)
3255 {
3256 Q[NA-NB+1] = Q[NA-NB] = 0;
3257 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
3258 {
3259 TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
3260 ++Q[NA-NB];
3261 }
3262 }
3263 else
3264 {
3265 NA+=2;
3266 }
3267
3268 word BT[2];
3269 BT[0] = TB[NB-2] + 1;
3270 BT[1] = TB[NB-1] + (BT[0]==0);
3271
3272 // start reducing TA mod TB, 2 words at a time
3273 for (unsigned i=NA-2; i>=NB; i-=2)
3274 {
3275 AtomicDivide(Q+i-NB, TA+i-2, BT);
3276 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
3277 }
3278
3279 // copy TA into R, and denormalize it
3280 CopyWords(R, TA+shiftWords, NB);
3281 ShiftWordsRightByBits(R, NB, shiftBits);
3282 }
3283
3284
PositiveDivide(Integer & remainder,Integer & quotient,const Integer & a,const Integer & b)3285 void PositiveDivide(Integer& remainder, Integer& quotient,
3286 const Integer& a, const Integer& b)
3287 {
3288 unsigned aSize = a.WordCount();
3289 unsigned bSize = b.WordCount();
3290
3291 if (a.PositiveCompare(b) == -1)
3292 {
3293 remainder = a;
3294 remainder.sign_ = Integer::POSITIVE;
3295 quotient = Integer::Zero();
3296 return;
3297 }
3298
3299 aSize += aSize%2; // round up to next even number
3300 bSize += bSize%2;
3301
3302 remainder.reg_.CleanNew(RoundupSize(bSize));
3303 remainder.sign_ = Integer::POSITIVE;
3304 quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
3305 quotient.sign_ = Integer::POSITIVE;
3306
3307 AlignedWordBlock T(aSize+2*bSize+4);
3308 Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
3309 T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
3310 bSize);
3311 }
3312
Divide(Integer & remainder,Integer & quotient,const Integer & dividend,const Integer & divisor)3313 void Integer::Divide(Integer &remainder, Integer "ient,
3314 const Integer ÷nd, const Integer &divisor)
3315 {
3316 PositiveDivide(remainder, quotient, dividend, divisor);
3317
3318 if (dividend.IsNegative())
3319 {
3320 quotient.Negate();
3321 if (remainder.NotZero())
3322 {
3323 --quotient;
3324 remainder = divisor.AbsoluteValue() - remainder;
3325 }
3326 }
3327
3328 if (divisor.IsNegative())
3329 quotient.Negate();
3330 }
3331
DivideByPowerOf2(Integer & r,Integer & q,const Integer & a,unsigned int n)3332 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a,
3333 unsigned int n)
3334 {
3335 q = a;
3336 q >>= n;
3337
3338 const unsigned int wordCount = BitsToWords(n);
3339 if (wordCount <= a.WordCount())
3340 {
3341 r.reg_.resize(RoundupSize(wordCount));
3342 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
3343 SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
3344 if (n % WORD_BITS != 0)
3345 r.reg_[wordCount-1] %= (word(1) << (n % WORD_BITS));
3346 }
3347 else
3348 {
3349 r.reg_.resize(RoundupSize(a.WordCount()));
3350 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
3351 }
3352 r.sign_ = POSITIVE;
3353
3354 if (a.IsNegative() && r.NotZero())
3355 {
3356 --q;
3357 r = Power2(n) - r;
3358 }
3359 }
3360
DividedBy(const Integer & b) const3361 Integer Integer::DividedBy(const Integer &b) const
3362 {
3363 Integer remainder, quotient;
3364 Integer::Divide(remainder, quotient, *this, b);
3365 return quotient;
3366 }
3367
Modulo(const Integer & b) const3368 Integer Integer::Modulo(const Integer &b) const
3369 {
3370 Integer remainder, quotient;
3371 Integer::Divide(remainder, quotient, *this, b);
3372 return remainder;
3373 }
3374
Divide(word & remainder,Integer & quotient,const Integer & dividend,word divisor)3375 void Integer::Divide(word &remainder, Integer "ient,
3376 const Integer ÷nd, word divisor)
3377 {
3378 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3379 {
3380 quotient = dividend >> (BitPrecision(divisor)-1);
3381 remainder = dividend.reg_[0] & (divisor-1);
3382 return;
3383 }
3384
3385 unsigned int i = dividend.WordCount();
3386 quotient.reg_.CleanNew(RoundupSize(i));
3387 remainder = 0;
3388 while (i--)
3389 {
3390 quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor;
3391 remainder = DWord(dividend.reg_[i], remainder) % divisor;
3392 }
3393
3394 if (dividend.NotNegative())
3395 quotient.sign_ = POSITIVE;
3396 else
3397 {
3398 quotient.sign_ = NEGATIVE;
3399 if (remainder)
3400 {
3401 --quotient;
3402 remainder = divisor - remainder;
3403 }
3404 }
3405 }
3406
DividedBy(word b) const3407 Integer Integer::DividedBy(word b) const
3408 {
3409 word remainder;
3410 Integer quotient;
3411 Integer::Divide(remainder, quotient, *this, b);
3412 return quotient;
3413 }
3414
Modulo(word divisor) const3415 word Integer::Modulo(word divisor) const
3416 {
3417 word remainder;
3418
3419 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3420 remainder = reg_[0] & (divisor-1);
3421 else
3422 {
3423 unsigned int i = WordCount();
3424
3425 if (divisor <= 5)
3426 {
3427 DWord sum(0, 0);
3428 while (i--)
3429 sum += reg_[i];
3430 remainder = sum % divisor;
3431 }
3432 else
3433 {
3434 remainder = 0;
3435 while (i--)
3436 remainder = DWord(reg_[i], remainder) % divisor;
3437 }
3438 }
3439
3440 if (IsNegative() && remainder)
3441 remainder = divisor - remainder;
3442
3443 return remainder;
3444 }
3445
3446
AbsoluteValue() const3447 Integer Integer::AbsoluteValue() const
3448 {
3449 Integer result(*this);
3450 result.sign_ = POSITIVE;
3451 return result;
3452 }
3453
3454
SquareRoot() const3455 Integer Integer::SquareRoot() const
3456 {
3457 if (!IsPositive())
3458 return Zero();
3459
3460 // overestimate square root
3461 Integer x, y = Power2((BitCount()+1)/2);
3462
3463 do
3464 {
3465 x = y;
3466 y = (x + *this/x) >> 1;
3467 } while (y<x);
3468
3469 return x;
3470 }
3471
IsSquare() const3472 bool Integer::IsSquare() const
3473 {
3474 Integer r = SquareRoot();
3475 return *this == r.Squared();
3476 }
3477
IsUnit() const3478 bool Integer::IsUnit() const
3479 {
3480 return (WordCount() == 1) && (reg_[0] == 1);
3481 }
3482
MultiplicativeInverse() const3483 Integer Integer::MultiplicativeInverse() const
3484 {
3485 return IsUnit() ? *this : Zero();
3486 }
3487
a_times_b_mod_c(const Integer & x,const Integer & y,const Integer & m)3488 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3489 {
3490 return x*y%m;
3491 }
3492
a_exp_b_mod_c(const Integer & x,const Integer & e,const Integer & m)3493 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3494 {
3495 ModularArithmetic mr(m);
3496 return mr.Exponentiate(x, e);
3497 }
3498
Gcd(const Integer & a,const Integer & b)3499 Integer Integer::Gcd(const Integer &a, const Integer &b)
3500 {
3501 return EuclideanDomainOf().Gcd(a, b);
3502 }
3503
InverseMod(const Integer & m) const3504 Integer Integer::InverseMod(const Integer &m) const
3505 {
3506 if (IsNegative() || *this>=m)
3507 return (*this%m).InverseMod(m);
3508
3509 if (m.IsEven())
3510 {
3511 if (!m || IsEven())
3512 return Zero(); // no inverse
3513 if (*this == One())
3514 return One();
3515
3516 Integer u = m.InverseMod(*this);
3517 return !u ? Zero() : (m*(*this-u)+1)/(*this);
3518 }
3519
3520 AlignedWordBlock T(m.reg_.size() * 4);
3521 Integer r((word)0, m.reg_.size());
3522 unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
3523 reg_.get_buffer(), reg_.size(),
3524 m.reg_.get_buffer(), m.reg_.size());
3525 DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
3526 m.reg_.get_buffer(), m.reg_.size());
3527 return r;
3528 }
3529
InverseMod(const word mod) const3530 word Integer::InverseMod(const word mod) const
3531 {
3532 word g0 = mod, g1 = *this % mod;
3533 word v0 = 0, v1 = 1;
3534 word y;
3535
3536 while (g1)
3537 {
3538 if (g1 == 1)
3539 return v1;
3540 y = g0 / g1;
3541 g0 = g0 % g1;
3542 v0 += y * v1;
3543
3544 if (!g0)
3545 break;
3546 if (g0 == 1)
3547 return mod-v0;
3548 y = g1 / g0;
3549 g1 = g1 % g0;
3550 v1 += y * v0;
3551 }
3552 return 0;
3553 }
3554
3555 // ********* ModArith stuff
3556
Half(const Integer & a) const3557 const Integer& ModularArithmetic::Half(const Integer &a) const
3558 {
3559 if (a.reg_.size()==modulus.reg_.size())
3560 {
3561 TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
3562 modulus.reg_.begin(), a.reg_.size());
3563 return result;
3564 }
3565 else
3566 return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
3567 }
3568
Add(const Integer & a,const Integer & b) const3569 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
3570 {
3571 if (a.reg_.size()==modulus.reg_.size() &&
3572 b.reg_.size()==modulus.reg_.size())
3573 {
3574 if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
3575 a.reg_.size())
3576 || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
3577 a.reg_.size()) >= 0)
3578 {
3579 TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3580 modulus.reg_.begin(), a.reg_.size());
3581 }
3582 return result;
3583 }
3584 else
3585 {
3586 result1 = a+b;
3587 if (result1 >= modulus)
3588 result1 -= modulus;
3589 return result1;
3590 }
3591 }
3592
Accumulate(Integer & a,const Integer & b) const3593 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
3594 {
3595 if (a.reg_.size()==modulus.reg_.size() &&
3596 b.reg_.size()==modulus.reg_.size())
3597 {
3598 if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3599 b.reg_.get_buffer(), a.reg_.size())
3600 || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
3601 a.reg_.size()) >= 0)
3602 {
3603 TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3604 modulus.reg_.get_buffer(), a.reg_.size());
3605 }
3606 }
3607 else
3608 {
3609 a+=b;
3610 if (a>=modulus)
3611 a-=modulus;
3612 }
3613
3614 return a;
3615 }
3616
Subtract(const Integer & a,const Integer & b) const3617 const Integer& ModularArithmetic::Subtract(const Integer &a,
3618 const Integer &b) const
3619 {
3620 if (a.reg_.size()==modulus.reg_.size() &&
3621 b.reg_.size()==modulus.reg_.size())
3622 {
3623 if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
3624 b.reg_.begin(), a.reg_.size()))
3625 TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
3626 modulus.reg_.begin(), a.reg_.size());
3627 return result;
3628 }
3629 else
3630 {
3631 result1 = a-b;
3632 if (result1.IsNegative())
3633 result1 += modulus;
3634 return result1;
3635 }
3636 }
3637
Reduce(Integer & a,const Integer & b) const3638 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
3639 {
3640 if (a.reg_.size()==modulus.reg_.size() &&
3641 b.reg_.size()==modulus.reg_.size())
3642 {
3643 if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
3644 b.reg_.get_buffer(), a.reg_.size()))
3645 TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
3646 modulus.reg_.get_buffer(), a.reg_.size());
3647 }
3648 else
3649 {
3650 a-=b;
3651 if (a.IsNegative())
3652 a+=modulus;
3653 }
3654
3655 return a;
3656 }
3657
Inverse(const Integer & a) const3658 const Integer& ModularArithmetic::Inverse(const Integer &a) const
3659 {
3660 if (!a)
3661 return a;
3662
3663 CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
3664 if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
3665 a.reg_.begin(), a.reg_.size()))
3666 Decrement(result.reg_.begin()+a.reg_.size(), 1,
3667 modulus.reg_.size()-a.reg_.size());
3668
3669 return result;
3670 }
3671
CascadeExponentiate(const Integer & x,const Integer & e1,const Integer & y,const Integer & e2) const3672 Integer ModularArithmetic::CascadeExponentiate(const Integer &x,
3673 const Integer &e1, const Integer &y, const Integer &e2) const
3674 {
3675 if (modulus.IsOdd())
3676 {
3677 MontgomeryRepresentation dr(modulus);
3678 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
3679 dr.ConvertIn(y), e2));
3680 }
3681 else
3682 return AbstractRing::CascadeExponentiate(x, e1, y, e2);
3683 }
3684
SimultaneousExponentiate(Integer * results,const Integer & base,const Integer * exponents,unsigned int exponentsCount) const3685 void ModularArithmetic::SimultaneousExponentiate(Integer *results,
3686 const Integer &base, const Integer *exponents,
3687 unsigned int exponentsCount) const
3688 {
3689 if (modulus.IsOdd())
3690 {
3691 MontgomeryRepresentation dr(modulus);
3692 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
3693 exponentsCount);
3694 for (unsigned int i=0; i<exponentsCount; i++)
3695 results[i] = dr.ConvertOut(results[i]);
3696 }
3697 else
3698 AbstractRing::SimultaneousExponentiate(results, base,
3699 exponents, exponentsCount);
3700 }
3701
3702
3703 // ********************************************************
3704
3705 #define A0 A
3706 #define A1 (A+N2)
3707 #define B0 B
3708 #define B1 (B+N2)
3709
3710 #define T0 T
3711 #define T1 (T+N2)
3712 #define T2 (T+N)
3713 #define T3 (T+N+N2)
3714
3715 #define R0 R
3716 #define R1 (R+N2)
3717 #define R2 (R+N)
3718 #define R3 (R+N+N2)
3719
3720
MultiplyBottom(word * R,word * T,const word * A,const word * B,unsigned int N)3721 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B,
3722 unsigned int N)
3723 {
3724 RecursiveMultiplyBottom(R, T, A, B, N);
3725 }
3726
MultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,unsigned int N)3727 inline void MultiplyTop(word *R, word *T, const word *L, const word *A,
3728 const word *B, unsigned int N)
3729 {
3730 RecursiveMultiplyTop(R, T, L, A, B, N);
3731 }
3732
3733
3734 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
3735 // T[3*N] - temporary work space
3736 // X[2*N] - number to be reduced
3737 // M[N] --- modulus
3738 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
3739
MontgomeryReduce(word * R,word * T,const word * X,const word * M,const word * U,unsigned int N)3740 void MontgomeryReduce(word *R, word *T, const word *X, const word *M,
3741 const word *U, unsigned int N)
3742 {
3743 MultiplyBottom(R, T, X, U, N);
3744 MultiplyTop(T, T+N, X, R, M, N);
3745 word borrow = Subtract(T, X+N, T, N);
3746 // defend against timing attack by doing this Add even when not needed
3747 word carry = Add(T+N, T, M, N);
3748 (void)carry; // shut up compiler
3749 CopyWords(R, T + (borrow ? N : 0), N);
3750 }
3751
3752 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
3753 // T[3*N/2] - temporary work space
3754 // A[N] ----- an odd number as input
3755
RecursiveInverseModPower2(word * R,word * T,const word * A,unsigned int N)3756 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
3757 {
3758 if (N==2)
3759 {
3760 T[0] = AtomicInverseModPower2(A[0]);
3761 T[1] = 0;
3762 LowLevel::Multiply2Bottom(T+2, T, A);
3763 TwosComplement(T+2, 2);
3764 Increment(T+2, 2, 2);
3765 LowLevel::Multiply2Bottom(R, T, T+2);
3766 }
3767 else
3768 {
3769 const unsigned int N2 = N/2;
3770 RecursiveInverseModPower2(R0, T0, A0, N2);
3771 T0[0] = 1;
3772 SetWords(T0+1, 0, N2-1);
3773 MultiplyTop(R1, T1, T0, R0, A0, N2);
3774 MultiplyBottom(T0, T1, R0, A1, N2);
3775 Add(T0, R1, T0, N2);
3776 TwosComplement(T0, N2);
3777 MultiplyBottom(R1, T1, R0, T0, N2);
3778 }
3779 }
3780
3781
3782 #undef A0
3783 #undef A1
3784 #undef B0
3785 #undef B1
3786
3787 #undef T0
3788 #undef T1
3789 #undef T2
3790 #undef T3
3791
3792 #undef R0
3793 #undef R1
3794 #undef R2
3795 #undef R3
3796
3797
3798 // modulus must be odd
MontgomeryRepresentation(const Integer & m)3799 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)
3800 : ModularArithmetic(m),
3801 u((word)0, modulus.reg_.size()),
3802 workspace(5*modulus.reg_.size())
3803 {
3804 RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
3805 modulus.reg_.get_buffer(), modulus.reg_.size());
3806 }
3807
Multiply(const Integer & a,const Integer & b) const3808 const Integer& MontgomeryRepresentation::Multiply(const Integer &a,
3809 const Integer &b) const
3810 {
3811 word *const T = workspace.begin();
3812 word *const R = result.reg_.begin();
3813 const unsigned int N = modulus.reg_.size();
3814
3815 AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
3816 b.reg_.get_buffer(), b.reg_.size());
3817 SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
3818 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3819 u.reg_.get_buffer(), N);
3820 return result;
3821 }
3822
Square(const Integer & a) const3823 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
3824 {
3825 word *const T = workspace.begin();
3826 word *const R = result.reg_.begin();
3827 const unsigned int N = modulus.reg_.size();
3828
3829 TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
3830 SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
3831 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3832 u.reg_.get_buffer(), N);
3833 return result;
3834 }
3835
ConvertOut(const Integer & a) const3836 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
3837 {
3838 word *const T = workspace.begin();
3839 word *const R = result.reg_.begin();
3840 const unsigned int N = modulus.reg_.size();
3841
3842 CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3843 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3844 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3845 u.reg_.get_buffer(), N);
3846 return result;
3847 }
3848
MultiplicativeInverse(const Integer & a) const3849 const Integer& MontgomeryRepresentation::MultiplicativeInverse(
3850 const Integer &a) const
3851 {
3852 // return (EuclideanMultiplicativeInverse(a, modulus)<<
3853 // (2*WORD_BITS*modulus.reg_.size()))%modulus;
3854 word *const T = workspace.begin();
3855 word *const R = result.reg_.begin();
3856 const unsigned int N = modulus.reg_.size();
3857
3858 CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
3859 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
3860 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
3861 u.reg_.get_buffer(), N);
3862 unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
3863
3864 // cout << "k=" << k << " N*32=" << 32*N << endl;
3865
3866 if (k>N*WORD_BITS)
3867 DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
3868 else
3869 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
3870
3871 return result;
3872 }
3873
3874
3875 // mod Root stuff
ModularRoot(const Integer & a,const Integer & dp,const Integer & dq,const Integer & p,const Integer & q,const Integer & u)3876 Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq,
3877 const Integer &p, const Integer &q, const Integer &u)
3878 {
3879 Integer p2 = ModularExponentiation((a % p), dp, p);
3880 Integer q2 = ModularExponentiation((a % q), dq, q);
3881 return CRT(p2, p, q2, q, u);
3882 }
3883
CRT(const Integer & xp,const Integer & p,const Integer & xq,const Integer & q,const Integer & u)3884 Integer CRT(const Integer &xp, const Integer &p, const Integer &xq,
3885 const Integer &q, const Integer &u)
3886 {
3887 // isn't operator overloading great?
3888 return p * (u * (xq-xp) % q) + xp;
3889 }
3890
3891 } // namespace
3892
3893