1 // integer.cpp - originally written and placed in the public domain by Wei Dai
2 // contains public domain code contributed by Alister Lee and Leonard Janke
3 
4 // Notes by JW: The Integer class needs to do two things. First, it needs
5 //  to set function pointers on some platforms, like X86 and X64. The
6 //  function pointers select a fast multiply and addition based on the cpu.
7 //  Second, it wants to create Integer::Zero(), Integer::One() and
8 //  Integer::Two().
9 // The function pointers are initialized in the InitializeInteger class by
10 //  calling SetFunctionPointers(). The call to SetFunctionPointers() is
11 //  guarded to run once using a double-checked pattern. We don't use C++
12 //  std::call_once due to bad interactions between libstdc++, glibc and
13 //  pthreads. The bad interactions were causing crashes for us on platforms
14 //  like Sparc and PowerPC. Since we are only setting function pointers we
15 //  don't have to worry about leaking memory. The worst case seems to be the
16 //  pointers gets written twice until the init flag is set and visible to
17 //  all threads.
18 // For Integer::Zero(), Integer::One() and Integer::Two(), we use one of three
19 //  strategies. First, if initialization priorities are available then we use
20 //  them. Initialization priorities are init_priority() on Linux and init_seg()
21 //  on Windows. OS X and several other platforms lack them. Initialization
22 //  priorities are platform specific but they are also the most trouble free
23 //  with determisitic destruction.
24 // Second, if C++11 dynamic initialization is available, then we use it. After
25 //  the std::call_once fiasco we moved to dynamic initialization to avoid
26 //  unknown troubles platforms that are tested less frequently. In addition
27 //  Microsoft platforms mostly do not provide dynamic initialization.
28 //  The MSDN docs claim they do but they don't in practice because we need
29 //  Visual Studio 2017 and Windows 10 or above.
30 // Third, we fall back to Wei's original code of a Singleton. Wei's original
31 //  code was much simpler. It simply used the Singleton pattern, but it always
32 //  produced memory findings on some platforms. The Singleton generates memory
33 //  findings because it uses a Create On First Use pattern (a dumb Nifty
34 //  Counter) and the compiler had to be smart enough to fold them to return
35 //  the same object. Unix and Linux compilers do a good job of folding objects,
36 //  but Microsoft compilers do a rather poor job for some versions of the
37 //  compiler.
38 // Another problem with the Singleton is resource destruction requires running
39 //  resource acquisition in reverse. For resources provided through the
40 //  Singletons, there is no way to express the dependency order to safely
41 //  destroy resources. (That's one of the problems C++11 dynamic
42 //  intitialization with concurrent execution is supposed to solve).
43 // The final problem with Singletons is resource/memory exhaustion in languages
44 //  like Java and .Net. Java and .Net load and unload a native DLL hundreds or
45 //  thousands of times during the life of a program. Each load produces a
46 //  memory leak and they can add up quickly. If they library is being used in
47 //  Java or .Net then Singleton must be avoided at all costs.
48 //
49 // The code below has a path cut-in for BMI2 using mulx and adcx instructions.
50 //  There was a modest speedup of approximately 0.03 ms in public key Integer
51 //  operations. We had to disable BMI2 for the moment because some OS X machines
52 //  were advertising BMI/BMI2 support but caused SIGILL's at runtime. Also see
53 //  https://github.com/weidai11/cryptopp/issues/850.
54 
55 #include "pch.h"
56 #include "config.h"
57 
58 #ifndef CRYPTOPP_IMPORTS
59 
60 #include "integer.h"
61 #include "secblock.h"
62 #include "modarith.h"
63 #include "nbtheory.h"
64 #include "smartptr.h"
65 #include "algparam.h"
66 #include "filters.h"
67 #include "stdcpp.h"
68 #include "asn.h"
69 #include "oids.h"
70 #include "words.h"
71 #include "pubkey.h"		// for P1363_KDF2
72 #include "sha.h"
73 #include "cpu.h"
74 #include "misc.h"
75 
76 #include <iostream>
77 
78 #if (_MSC_VER >= 1400) && !defined(_M_ARM)
79 	#include <intrin.h>
80 #endif
81 
82 #ifdef __DECCXX
83 	#include <c_asm.h>
84 #endif
85 
86 // "Error: The operand ___LKDB cannot be assigned to",
87 // http://github.com/weidai11/cryptopp/issues/188
88 #if (__SUNPRO_CC >= 0x5130)
89 # define MAYBE_CONST
90 # define MAYBE_UNCONST_CAST(x) const_cast<word*>(x)
91 #else
92 # define MAYBE_CONST const
93 # define MAYBE_UNCONST_CAST(x) x
94 #endif
95 
96 // "Inline assembly operands don't work with .intel_syntax",
97 //  http://llvm.org/bugs/show_bug.cgi?id=24232
98 #if CRYPTOPP_BOOL_X32 || defined(CRYPTOPP_DISABLE_MIXED_ASM)
99 # undef CRYPTOPP_X86_ASM_AVAILABLE
100 # undef CRYPTOPP_X32_ASM_AVAILABLE
101 # undef CRYPTOPP_X64_ASM_AVAILABLE
102 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
103 # undef CRYPTOPP_SSSE3_ASM_AVAILABLE
104 #else
105 # define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86))
106 #endif
107 
108 // ***************** C++ Static Initialization ********************
109 
110 NAMESPACE_BEGIN(CryptoPP)
111 
112 // Function body near the middle of the file
113 static void SetFunctionPointers();
114 
115 // Use a double-checked pattern. We are not leaking anything so it
116 // does not matter if a pointer is written twice during a race.
117 // Avoid std::call_once due to too many problems on platforms like
118 // Solaris and Sparc. Also see
119 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66146 and
120 // http://github.com/weidai11/cryptopp/issues/707.
InitializeInteger()121 InitializeInteger::InitializeInteger()
122 {
123 	static bool s_flag;
124 	MEMORY_BARRIER();
125 	if (s_flag == false)
126 	{
127 		SetFunctionPointers();
128 		s_flag = true;
129 		MEMORY_BARRIER();
130 	}
131 }
132 
133 template <long i>
134 struct NewInteger
135 {
operator ()NewInteger136 	Integer * operator()() const
137 	{
138 		return new Integer(i);
139 	}
140 };
141 
142 // ***************** Library code ********************
143 
Compare(const word * A,const word * B,size_t N)144 inline static int Compare(const word *A, const word *B, size_t N)
145 {
146 	while (N--)
147 		if (A[N] > B[N])
148 			return 1;
149 		else if (A[N] < B[N])
150 			return -1;
151 
152 	return 0;
153 }
154 
Increment(word * A,size_t N,word B=1)155 inline static int Increment(word *A, size_t N, word B=1)
156 {
157 	CRYPTOPP_ASSERT(N);
158 	word t = A[0];
159 	A[0] = t+B;
160 	if (A[0] >= t)
161 		return 0;
162 	for (unsigned i=1; i<N; i++)
163 		if (++A[i])
164 			return 0;
165 	return 1;
166 }
167 
Decrement(word * A,size_t N,word B=1)168 inline static int Decrement(word *A, size_t N, word B=1)
169 {
170 	CRYPTOPP_ASSERT(N);
171 	word t = A[0];
172 	A[0] = t-B;
173 	if (A[0] <= t)
174 		return 0;
175 	for (unsigned i=1; i<N; i++)
176 		if (A[i]--)
177 			return 0;
178 	return 1;
179 }
180 
TwosComplement(word * A,size_t N)181 static void TwosComplement(word *A, size_t N)
182 {
183 	Decrement(A, N);
184 	for (unsigned i=0; i<N; i++)
185 		A[i] = ~A[i];
186 }
187 
AtomicInverseModPower2(word A)188 static word AtomicInverseModPower2(word A)
189 {
190 	CRYPTOPP_ASSERT(A%2==1);
191 
192 	word R=A%8;
193 
194 	for (unsigned i=3; i<WORD_BITS; i*=2)
195 		R = R*(2-R*A);
196 
197 	CRYPTOPP_ASSERT(R*A==1);
198 	return R;
199 }
200 
201 // ********************************************************
202 
203 #if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || ((defined(__aarch64__) || defined(__x86_64__)) && defined(CRYPTOPP_WORD128_AVAILABLE))
204 	#define TWO_64_BIT_WORDS 1
205 	#define Declare2Words(x)			word x##0, x##1;
206 	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
207 	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
208 	#define LowWord(a)					a##0
209 	#define HighWord(a)					a##1
210 	#ifdef _MSC_VER
211 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = _umul128(a, b, &p1);
212 		#ifndef __INTEL_COMPILER
213 			#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
214 		#endif
215 	#elif defined(__aarch32__) || defined(__aarch64__)
216 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; asm ("umulh %0,%1,%2" : "=r"(p1) : "r"(a), "r"(b));
217 	#elif defined(__DECCXX)
218 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
219 	#elif defined(__x86_64__)
220 		#if defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5100
221 			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
222 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
223 		#elif defined(__BMI2__) && 0
224 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulxq %3, %0, %1" : "=r"(p0), "=r"(p1) : "d"(a), "r"(b));
225 			#define MulAcc(c, d, a, b)		asm ("mulxq %6, %3, %4; addq %3, %0; adcxq %4, %1; adcxq %7, %2;" : "+&r"(c), "+&r"(d##0), "+&r"(d##1), "=&r"(p0), "=&r"(p1) : "d"(a), "r"(b), "r"(W64LIT(0)) : "cc");
226 			#define Double3Words(c, d)		asm ("addq %0, %0; adcxq %1, %1; adcxq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
227 			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcxq %3, %1;" : "+&r"(a##0), "+r"(a##1) : "r"(b), "r"(W64LIT(0)) : "cc");
228 			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcxq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
229 			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcxq %6, %1; adcxq %7, %2;" : "+r"(c), "=&r"(e##0), "=&r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1), "r"(W64LIT(0)) : "cc");
230 		#else
231 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
232 			#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
233 			#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
234 			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
235 			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
236 			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
237 		#endif
238 	#endif
239 	#define MultiplyWords(p, a, b)		MultiplyWordsLoHi(p##0, p##1, a, b)
240 	#ifndef Double3Words
241 		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
242 	#endif
243 	#ifndef Acc2WordsBy2
244 		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
245 	#endif
246 	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
247 	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
248 	#define GetCarry(u)					u##1
249 	#define GetBorrow(u)				u##1
250 #else
251 	#define Declare2Words(x)			dword x;
252 	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_IA64))
253 		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
254 	#else
255 		#define MultiplyWords(p, a, b)		p = (dword)a*b;
256 	#endif
257 	#define AssignWord(a, b)			a = b;
258 	#define Add2WordsBy1(a, b, c)		a = b + c;
259 	#define Acc2WordsBy2(a, b)			a += b;
260 	#define LowWord(a)					word(a)
261 	#define HighWord(a)					word(a>>WORD_BITS)
262 	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
263 	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
264 	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
265 	#define GetCarry(u)					HighWord(u)
266 	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
267 #endif
268 #ifndef MulAcc
269 	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
270 #endif
271 #ifndef Acc2WordsBy1
272 	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
273 #endif
274 #ifndef Acc3WordsBy2
275 	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
276 #endif
277 
278 class DWord
279 {
280 public:
281 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
DWord()282 	DWord() {std::memset(&m_whole, 0x00, sizeof(m_whole));}
283 #else
284 	DWord() {std::memset(&m_halfs, 0x00, sizeof(m_halfs));}
285 #endif
286 
287 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
DWord(word low)288 	explicit DWord(word low) : m_whole(low) { }
289 #else
DWord(word low)290 	explicit DWord(word low)
291 	{
292 		m_halfs.high = 0;
293 		m_halfs.low = low;
294 	}
295 #endif
296 
297 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
DWord(word low,word high)298 	DWord(word low, word high) : m_whole()
299 #else
300 	DWord(word low, word high) : m_halfs()
301 #endif
302 	{
303 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
304 #  if (CRYPTOPP_LITTLE_ENDIAN)
305 		const word t[2] = {low,high};
306 		memcpy(&m_whole, t, sizeof(m_whole));
307 #  else
308 		const word t[2] = {high,low};
309 		memcpy(&m_whole, t, sizeof(m_whole));
310 #  endif
311 #else
312 		m_halfs.low = low;
313 		m_halfs.high = high;
314 #endif
315 	}
316 
Multiply(word a,word b)317 	static DWord Multiply(word a, word b)
318 	{
319 		DWord r;
320 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
321 			r.m_whole = (dword)a * b;
322 		#elif defined(MultiplyWordsLoHi)
323 			MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
324 		#else
325 			CRYPTOPP_ASSERT(0);
326 		#endif
327 		return r;
328 	}
329 
MultiplyAndAdd(word a,word b,word c)330 	static DWord MultiplyAndAdd(word a, word b, word c)
331 	{
332 		DWord r = Multiply(a, b);
333 		return r += c;
334 	}
335 
operator +=(word a)336 	DWord & operator+=(word a)
337 	{
338 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
339 			m_whole = m_whole + a;
340 		#else
341 			m_halfs.low += a;
342 			m_halfs.high += (m_halfs.low < a);
343 		#endif
344 		return *this;
345 	}
346 
operator +(word a)347 	DWord operator+(word a)
348 	{
349 		DWord r;
350 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
351 			r.m_whole = m_whole + a;
352 		#else
353 			r.m_halfs.low = m_halfs.low + a;
354 			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
355 		#endif
356 		return r;
357 	}
358 
operator -(DWord a)359 	DWord operator-(DWord a)
360 	{
361 		DWord r;
362 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
363 			r.m_whole = m_whole - a.m_whole;
364 		#else
365 			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
366 			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
367 		#endif
368 		return r;
369 	}
370 
operator -(word a)371 	DWord operator-(word a)
372 	{
373 		DWord r;
374 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
375 			r.m_whole = m_whole - a;
376 		#else
377 			r.m_halfs.low = m_halfs.low - a;
378 			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
379 		#endif
380 		return r;
381 	}
382 
383 	// returns quotient, which must fit in a word
384 	word operator/(word divisor);
385 
386 	word operator%(word a);
387 
operator !() const388 	bool operator!() const
389 	{
390 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
391 		return !m_whole;
392 	#else
393 		return !m_halfs.high && !m_halfs.low;
394 	#endif
395 	}
396 
397 	// TODO: When NATIVE_DWORD is in effect, we access high and low, which are inactive
398 	//  union members, and that's UB. Also see http://stackoverflow.com/q/11373203.
GetLowHalf() const399 	word GetLowHalf() const {return m_halfs.low;}
GetHighHalf() const400 	word GetHighHalf() const {return m_halfs.high;}
GetHighHalfAsBorrow() const401 	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
402 
403 private:
404 	// Issue 274, "Types cannot be declared in anonymous union",
405 	//   http://github.com/weidai11/cryptopp/issues/274
406 	//   Thanks to Martin Bonner at http://stackoverflow.com/a/39507183
407     struct half_words
408     {
409     #if (CRYPTOPP_LITTLE_ENDIAN)
410         word low;
411         word high;
412     #else
413         word high;
414         word low;
415     #endif
416    };
417    union
418    {
419    #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
420        dword m_whole;
421    #endif
422        half_words m_halfs;
423    };
424 };
425 
426 class Word
427 {
428 public:
Word()429 	Word() : m_whole(0) {}
Word(word value)430 	Word(word value) : m_whole(value) {}
Word(hword low,hword high)431 	Word(hword low, hword high) : m_whole(low | (word(high) << (WORD_BITS/2))) {}
432 
Multiply(hword a,hword b)433 	static Word Multiply(hword a, hword b)
434 	{
435 		Word r;
436 		r.m_whole = (word)a * b;
437 		return r;
438 	}
439 
operator -(Word a)440 	Word operator-(Word a)
441 	{
442 		Word r;
443 		r.m_whole = m_whole - a.m_whole;
444 		return r;
445 	}
446 
operator -(hword a)447 	Word operator-(hword a)
448 	{
449 		Word r;
450 		r.m_whole = m_whole - a;
451 		return r;
452 	}
453 
454 	// returns quotient, which must fit in a word
operator /(hword divisor)455 	hword operator/(hword divisor)
456 	{
457 		return hword(m_whole / divisor);
458 	}
459 
operator !() const460 	bool operator!() const
461 	{
462 		return !m_whole;
463 	}
464 
GetWhole() const465 	word GetWhole() const {return m_whole;}
GetLowHalf() const466 	hword GetLowHalf() const {return hword(m_whole);}
GetHighHalf() const467 	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
GetHighHalfAsBorrow() const468 	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
469 
470 private:
471 	word m_whole;
472 };
473 
474 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
475 template <class S, class D>
476 S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULLPTR)
477 {
478 	CRYPTOPP_UNUSED(dummy);
479 
480 	// Assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
481 	CRYPTOPP_ASSERT(A[2] < B1 || (A[2]==B1 && A[1] < B0));
482 
483 	// Profiling guided the flow below.
484 
485 	// estimate the quotient: do a 2 S by 1 S divide.
486 	S Q; bool pre = (S(B1+1) == 0);
487 	if (B1 > 0 && !pre)
488 		Q = D(A[1], A[2]) / S(B1+1);
489 	else if (pre)
490 		Q = A[2];
491 	else
492 		Q = D(A[0], A[1]) / B0;
493 
494 	// now subtract Q*B from A
495 	D p = D::Multiply(B0, Q);
496 	D u = (D) A[0] - p.GetLowHalf();
497 	A[0] = u.GetLowHalf();
498 	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
499 	A[1] = u.GetLowHalf();
500 	A[2] += u.GetHighHalf();
501 
502 	// Q <= actual quotient, so fix it
503 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
504 	{
505 		u = (D) A[0] - B0;
506 		A[0] = u.GetLowHalf();
507 		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
508 		A[1] = u.GetLowHalf();
509 		A[2] += u.GetHighHalf();
510 		Q++;
511 		CRYPTOPP_ASSERT(Q);	// shouldn't overflow
512 	}
513 
514 	return Q;
515 }
516 
517 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
518 template <class S, class D>
DivideFourWordsByTwo(S * T,const D & Al,const D & Ah,const D & B)519 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
520 {
521 	// Profiling guided the flow below.
522 
523 	if (!!B)
524 	{
525 		S Q[2];
526 		T[0] = Al.GetLowHalf();
527 		T[1] = Al.GetHighHalf();
528 		T[2] = Ah.GetLowHalf();
529 		T[3] = Ah.GetHighHalf();
530 		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
531 		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
532 		return D(Q[0], Q[1]);
533 	}
534 	else  // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
535 	{
536 		return D(Ah.GetLowHalf(), Ah.GetHighHalf());
537 	}
538 }
539 
540 // returns quotient, which must fit in a word
operator /(word a)541 inline word DWord::operator/(word a)
542 {
543 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
544 		return word(m_whole / a);
545 	#else
546 		hword r[4];
547 		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
548 	#endif
549 }
550 
operator %(word a)551 inline word DWord::operator%(word a)
552 {
553 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
554 		return word(m_whole % a);
555 	#else
556 		if (a < (word(1) << (WORD_BITS/2)))
557 		{
558 			hword h = hword(a);
559 			word r = m_halfs.high % h;
560 			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
561 			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
562 		}
563 		else
564 		{
565 			hword r[4];
566 			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
567 			return Word(r[0], r[1]).GetWhole();
568 		}
569 	#endif
570 }
571 
572 // ********************************************************
573 
574 // Use some tricks to share assembly code between MSVC, GCC, Clang and Sun CC.
575 #if defined(__GNUC__)
576 	#define AddPrologue \
577 		int result;	\
578 		__asm__ __volatile__ \
579 		( \
580 			INTEL_NOPREFIX
581 	#define AddEpilogue \
582 			ATT_PREFIX \
583 					: "=a" (result)\
584 					: "d" (C), "a" (A), "D" (B), "c" (N) \
585 					: "%esi", "memory", "cc" \
586 		);\
587 		return result;
588 	#define MulPrologue \
589 		__asm__ __volatile__ \
590 		( \
591 			INTEL_NOPREFIX \
592 			AS1(	push	ebx) \
593 			AS2(	mov		ebx, edx)
594 	#define MulEpilogue \
595 			AS1(	pop		ebx) \
596 			ATT_PREFIX \
597 			: \
598 			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
599 			: "%esi", "memory", "cc" \
600 		);
601 	#define SquPrologue		MulPrologue
602 	#define SquEpilogue	\
603 			AS1(	pop		ebx) \
604 			ATT_PREFIX \
605 			: \
606 			: "d" (s_maskLow16), "c" (C), "a" (A) \
607 			: "%esi", "%edi", "memory", "cc" \
608 		);
609 	#define TopPrologue		MulPrologue
610 	#define TopEpilogue	\
611 			AS1(	pop		ebx) \
612 			ATT_PREFIX \
613 			: \
614 			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
615 			: "memory", "cc" \
616 		);
617 #else
618 	#define AddPrologue \
619 		__asm	push edi \
620 		__asm	push esi \
621 		__asm	mov		eax, [esp+12] \
622 		__asm	mov		edi, [esp+16]
623 	#define AddEpilogue \
624 		__asm	pop esi \
625 		__asm	pop edi \
626 		__asm	ret 8
627 	#define SaveEBX
628 	#define RestoreEBX
629 	#define SquPrologue					\
630 		AS2(	mov		eax, A)			\
631 		AS2(	mov		ecx, C)			\
632 		SaveEBX							\
633 		AS2(	lea		ebx, s_maskLow16)
634 	#define MulPrologue					\
635 		AS2(	mov		eax, A)			\
636 		AS2(	mov		edi, B)			\
637 		AS2(	mov		ecx, C)			\
638 		SaveEBX							\
639 		AS2(	lea		ebx, s_maskLow16)
640 	#define TopPrologue					\
641 		AS2(	mov		eax, A)			\
642 		AS2(	mov		edi, B)			\
643 		AS2(	mov		ecx, C)			\
644 		AS2(	mov		esi, L)			\
645 		SaveEBX							\
646 		AS2(	lea		ebx, s_maskLow16)
647 	#define SquEpilogue		RestoreEBX
648 	#define MulEpilogue		RestoreEBX
649 	#define TopEpilogue		RestoreEBX
650 #endif
651 
652 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
653 extern "C" {
654 int Baseline_Add(size_t N, word *C, const word *A, const word *B);
655 int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
656 }
657 #elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
Baseline_Add(size_t N,word * C,const word * A,const word * B)658 int Baseline_Add(size_t N, word *C, const word *A, const word *B)
659 {
660 	word result;
661 	__asm__ __volatile__
662 	(
663 	INTEL_NOPREFIX
664 	AS1(	neg		%1)
665 	ASJ(	jz,		1, f)
666 	AS2(	mov		%0,[%3+8*%1])
667 	AS2(	add		%0,[%4+8*%1])
668 	AS2(	mov		[%2+8*%1],%0)
669 	ASL(0)
670 	AS2(	mov		%0,[%3+8*%1+8])
671 	AS2(	adc		%0,[%4+8*%1+8])
672 	AS2(	mov		[%2+8*%1+8],%0)
673 	AS2(	lea		%1,[%1+2])
674 	ASJ(	jrcxz,	1, f)
675 	AS2(	mov		%0,[%3+8*%1])
676 	AS2(	adc		%0,[%4+8*%1])
677 	AS2(	mov		[%2+8*%1],%0)
678 	ASJ(	jmp,	0, b)
679 	ASL(1)
680 	AS2(	mov		%0, 0)
681 	AS2(	adc		%0, %0)
682 	ATT_NOPREFIX
683 	: "=&r" (result), "+c" (N)
684 	: "r" (C+N), "r" (A+N), "r" (B+N)
685 	: "memory", "cc"
686 	);
687 	return (int)result;
688 }
689 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)690 int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
691 {
692 	word result;
693 	__asm__ __volatile__
694 	(
695 	INTEL_NOPREFIX
696 	AS1(	neg		%1)
697 	ASJ(	jz,		1, f)
698 	AS2(	mov		%0,[%3+8*%1])
699 	AS2(	sub		%0,[%4+8*%1])
700 	AS2(	mov		[%2+8*%1],%0)
701 	ASL(0)
702 	AS2(	mov		%0,[%3+8*%1+8])
703 	AS2(	sbb		%0,[%4+8*%1+8])
704 	AS2(	mov		[%2+8*%1+8],%0)
705 	AS2(	lea		%1,[%1+2])
706 	ASJ(	jrcxz,	1, f)
707 	AS2(	mov		%0,[%3+8*%1])
708 	AS2(	sbb		%0,[%4+8*%1])
709 	AS2(	mov		[%2+8*%1],%0)
710 	ASJ(	jmp,	0, b)
711 	ASL(1)
712 	AS2(	mov		%0, 0)
713 	AS2(	adc		%0, %0)
714 	ATT_NOPREFIX
715 	: "=&r" (result), "+c" (N)
716 	: "r" (C+N), "r" (A+N), "r" (B+N)
717 	: "memory", "cc"
718 	);
719 	return (int)result;
720 }
721 #elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
Baseline_Add(size_t N,word * C,const word * A,const word * B)722 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
723 {
724 	AddPrologue
725 
726 	// now: eax = A, edi = B, edx = C, ecx = N
727 	AS2(	lea		eax, [eax+4*ecx])
728 	AS2(	lea		edi, [edi+4*ecx])
729 	AS2(	lea		edx, [edx+4*ecx])
730 
731 	AS1(	neg		ecx)				// ecx is negative index
732 	AS2(	test	ecx, 2)				// this clears carry flag
733 	ASJ(	jz,		0, f)
734 	AS2(	sub		ecx, 2)
735 	ASJ(	jmp,	1, f)
736 
737 	ASL(0)
738 	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
739 	AS2(	mov		esi,[eax+4*ecx])
740 	AS2(	adc		esi,[edi+4*ecx])
741 	AS2(	mov		[edx+4*ecx],esi)
742 	AS2(	mov		esi,[eax+4*ecx+4])
743 	AS2(	adc		esi,[edi+4*ecx+4])
744 	AS2(	mov		[edx+4*ecx+4],esi)
745 	ASL(1)
746 	AS2(	mov		esi,[eax+4*ecx+8])
747 	AS2(	adc		esi,[edi+4*ecx+8])
748 	AS2(	mov		[edx+4*ecx+8],esi)
749 	AS2(	mov		esi,[eax+4*ecx+12])
750 	AS2(	adc		esi,[edi+4*ecx+12])
751 	AS2(	mov		[edx+4*ecx+12],esi)
752 
753 	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
754 	ASJ(	jmp,	0, b)
755 
756 	ASL(2)
757 	AS2(	mov		eax, 0)
758 	AS1(	setc	al)					// store carry into eax (return result register)
759 
760 	AddEpilogue
761 
762 	// http://github.com/weidai11/cryptopp/issues/340
763 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
764 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
765 }
766 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)767 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
768 {
769 	AddPrologue
770 
771 	// now: eax = A, edi = B, edx = C, ecx = N
772 	AS2(	lea		eax, [eax+4*ecx])
773 	AS2(	lea		edi, [edi+4*ecx])
774 	AS2(	lea		edx, [edx+4*ecx])
775 
776 	AS1(	neg		ecx)				// ecx is negative index
777 	AS2(	test	ecx, 2)				// this clears carry flag
778 	ASJ(	jz,		0, f)
779 	AS2(	sub		ecx, 2)
780 	ASJ(	jmp,	1, f)
781 
782 	ASL(0)
783 	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
784 	AS2(	mov		esi,[eax+4*ecx])
785 	AS2(	sbb		esi,[edi+4*ecx])
786 	AS2(	mov		[edx+4*ecx],esi)
787 	AS2(	mov		esi,[eax+4*ecx+4])
788 	AS2(	sbb		esi,[edi+4*ecx+4])
789 	AS2(	mov		[edx+4*ecx+4],esi)
790 	ASL(1)
791 	AS2(	mov		esi,[eax+4*ecx+8])
792 	AS2(	sbb		esi,[edi+4*ecx+8])
793 	AS2(	mov		[edx+4*ecx+8],esi)
794 	AS2(	mov		esi,[eax+4*ecx+12])
795 	AS2(	sbb		esi,[edi+4*ecx+12])
796 	AS2(	mov		[edx+4*ecx+12],esi)
797 
798 	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
799 	ASJ(	jmp,	0, b)
800 
801 	ASL(2)
802 	AS2(	mov		eax, 0)
803 	AS1(	setc	al)					// store carry into eax (return result register)
804 
805 	AddEpilogue
806 
807 	// http://github.com/weidai11/cryptopp/issues/340
808 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
809 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
810 }
811 
812 #if CRYPTOPP_INTEGER_SSE2
SSE2_Add(size_t N,word * C,const word * A,const word * B)813 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
814 {
815 	AddPrologue
816 
817 	// now: eax = A, edi = B, edx = C, ecx = N
818 	AS2(	lea		eax, [eax+4*ecx])
819 	AS2(	lea		edi, [edi+4*ecx])
820 	AS2(	lea		edx, [edx+4*ecx])
821 
822 	AS1(	neg		ecx)				// ecx is negative index
823 	AS2(	pxor    mm2, mm2)
824 	ASJ(	jz,		2, f)
825 	AS2(	test	ecx, 2)				// this clears carry flag
826 	ASJ(	jz,		0, f)
827 	AS2(	sub		ecx, 2)
828 	ASJ(	jmp,	1, f)
829 
830 	ASL(0)
831 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
832 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
833 	AS2(	paddq    mm0, mm1)
834 	AS2(	paddq	 mm2, mm0)
835 	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
836 	AS2(	psrlq    mm2, 32)
837 
838 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
839 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
840 	AS2(	paddq    mm0, mm1)
841 	AS2(	paddq	 mm2, mm0)
842 	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
843 	AS2(	psrlq    mm2, 32)
844 
845 	ASL(1)
846 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
847 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
848 	AS2(	paddq    mm0, mm1)
849 	AS2(	paddq	 mm2, mm0)
850 	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
851 	AS2(	psrlq    mm2, 32)
852 
853 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
854 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
855 	AS2(	paddq    mm0, mm1)
856 	AS2(	paddq	 mm2, mm0)
857 	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
858 	AS2(	psrlq    mm2, 32)
859 
860 	AS2(	add		ecx, 4)
861 	ASJ(	jnz,	0, b)
862 
863 	ASL(2)
864 	AS2(	movd	eax, mm2)
865 	AS1(	emms)
866 
867 	AddEpilogue
868 
869 	// http://github.com/weidai11/cryptopp/issues/340
870 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
871 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
872 }
SSE2_Sub(size_t N,word * C,const word * A,const word * B)873 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
874 {
875 	AddPrologue
876 
877 	// now: eax = A, edi = B, edx = C, ecx = N
878 	AS2(	lea		eax, [eax+4*ecx])
879 	AS2(	lea		edi, [edi+4*ecx])
880 	AS2(	lea		edx, [edx+4*ecx])
881 
882 	AS1(	neg		ecx)				// ecx is negative index
883 	AS2(	pxor    mm2, mm2)
884 	ASJ(	jz,		2, f)
885 	AS2(	test	ecx, 2)				// this clears carry flag
886 	ASJ(	jz,		0, f)
887 	AS2(	sub		ecx, 2)
888 	ASJ(	jmp,	1, f)
889 
890 	ASL(0)
891 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
892 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
893 	AS2(	psubq    mm0, mm1)
894 	AS2(	psubq	 mm0, mm2)
895 	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
896 	AS2(	psrlq    mm0, 63)
897 
898 	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])
899 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
900 	AS2(	psubq    mm2, mm1)
901 	AS2(	psubq	 mm2, mm0)
902 	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
903 	AS2(	psrlq    mm2, 63)
904 
905 	ASL(1)
906 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
907 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
908 	AS2(	psubq    mm0, mm1)
909 	AS2(	psubq	 mm0, mm2)
910 	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
911 	AS2(	psrlq    mm0, 63)
912 
913 	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
914 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
915 	AS2(	psubq    mm2, mm1)
916 	AS2(	psubq	 mm2, mm0)
917 	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
918 	AS2(	psrlq    mm2, 63)
919 
920 	AS2(	add		ecx, 4)
921 	ASJ(	jnz,	0, b)
922 
923 	ASL(2)
924 	AS2(	movd	eax, mm2)
925 	AS1(	emms)
926 
927 	AddEpilogue
928 
929 	// http://github.com/weidai11/cryptopp/issues/340
930 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
931 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
932 }
933 #endif	// CRYPTOPP_INTEGER_SSE2
934 #else   // CRYPTOPP_SSE2_ASM_AVAILABLE
Baseline_Add(size_t N,word * C,const word * A,const word * B)935 int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
936 {
937 	CRYPTOPP_ASSERT (N%2 == 0);
938 
939 	Declare2Words(u);
940 	AssignWord(u, 0);
941 	for (size_t i=0; i<N; i+=2)
942 	{
943 		AddWithCarry(u, A[i], B[i]);
944 		C[i] = LowWord(u);
945 		AddWithCarry(u, A[i+1], B[i+1]);
946 		C[i+1] = LowWord(u);
947 	}
948 	return int(GetCarry(u));
949 }
950 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)951 int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
952 {
953 	CRYPTOPP_ASSERT (N%2 == 0);
954 
955 	Declare2Words(u);
956 	AssignWord(u, 0);
957 	for (size_t i=0; i<N; i+=2)
958 	{
959 		SubtractWithBorrow(u, A[i], B[i]);
960 		C[i] = LowWord(u);
961 		SubtractWithBorrow(u, A[i+1], B[i+1]);
962 		C[i+1] = LowWord(u);
963 	}
964 	return int(GetBorrow(u));
965 }
966 #endif
967 
LinearMultiply(word * C,const word * AA,word B,size_t N)968 static word LinearMultiply(word *C, const word *AA, word B, size_t N)
969 {
970 	// http://github.com/weidai11/cryptopp/issues/188
971 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
972 
973 	word carry=0;
974 	for(unsigned i=0; i<N; i++)
975 	{
976 		Declare2Words(p);
977 		MultiplyWords(p, A[i], B);
978 		Acc2WordsBy1(p, carry);
979 		C[i] = LowWord(p);
980 		carry = HighWord(p);
981 	}
982 	return carry;
983 }
984 
985 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
986 
987 #define Mul_2 \
988 	Mul_Begin(2) \
989 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
990 	Mul_End(1, 1)
991 
992 #define Mul_4 \
993 	Mul_Begin(4) \
994 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
995 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
996 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
997 	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
998 	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
999 	Mul_End(5, 3)
1000 
1001 #define Mul_8 \
1002 	Mul_Begin(8) \
1003 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1004 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
1005 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1006 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1007 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1008 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1009 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1010 	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1011 	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1012 	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1013 	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1014 	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1015 	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
1016 	Mul_End(13, 7)
1017 
1018 #define Mul_16 \
1019 	Mul_Begin(16) \
1020 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1021 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
1022 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1023 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1024 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1025 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1026 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1027 	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1028 	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1029 	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1030 	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1031 	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1032 	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1033 	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1034 	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1035 	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1036 	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1037 	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1038 	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1039 	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1040 	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1041 	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1042 	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1043 	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1044 	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1045 	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1046 	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1047 	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1048 	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
1049 	Mul_End(29, 15)
1050 
1051 #define Squ_2 \
1052 	Squ_Begin(2) \
1053 	Squ_End(2)
1054 
1055 #define Squ_4 \
1056 	Squ_Begin(4) \
1057 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1058 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1059 	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
1060 	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
1061 	Squ_End(4)
1062 
1063 #define Squ_8 \
1064 	Squ_Begin(8) \
1065 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1066 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1067 	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
1068 	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
1069 	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
1070 	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
1071 	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
1072 	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
1073 	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
1074 	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
1075 	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
1076 	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
1077 	Squ_End(8)
1078 
1079 #define Squ_16 \
1080 	Squ_Begin(16) \
1081 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1082 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1083 	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
1084 	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
1085 	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
1086 	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
1087 	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
1088 	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
1089 	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
1090 	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
1091 	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
1092 	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
1093 	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
1094 	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
1095 	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
1096 	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
1097 	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
1098 	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
1099 	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
1100 	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
1101 	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
1102 	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
1103 	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
1104 	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
1105 	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
1106 	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
1107 	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
1108 	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
1109 	Squ_End(16)
1110 
1111 #define Bot_2 \
1112 	Mul_Begin(2) \
1113 	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
1114 	Bot_End(2)
1115 
1116 #define Bot_4 \
1117 	Mul_Begin(4) \
1118 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1119 	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
1120 	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
1121 	Bot_End(4)
1122 
1123 #define Bot_8 \
1124 	Mul_Begin(8) \
1125 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1126 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
1127 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1128 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1129 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1130 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1131 	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
1132 	Bot_End(8)
1133 
1134 #define Bot_16 \
1135 	Mul_Begin(16) \
1136 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1137 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
1138 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1139 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1140 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1141 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1142 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1143 	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1144 	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1145 	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1146 	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1147 	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1148 	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1149 	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1150 	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1151 	Bot_End(16)
1152 
1153 #endif
1154 
1155 #if 0
1156 #define Mul_Begin(n)				\
1157 	Declare2Words(p)				\
1158 	Declare2Words(c)				\
1159 	Declare2Words(d)				\
1160 	MultiplyWords(p, A[0], B[0])	\
1161 	AssignWord(c, LowWord(p))		\
1162 	AssignWord(d, HighWord(p))
1163 
1164 #define Mul_Acc(i, j)				\
1165 	MultiplyWords(p, A[i], B[j])	\
1166 	Acc2WordsBy1(c, LowWord(p))		\
1167 	Acc2WordsBy1(d, HighWord(p))
1168 
1169 #define Mul_SaveAcc(k, i, j) 		\
1170 	R[k] = LowWord(c);				\
1171 	Add2WordsBy1(c, d, HighWord(c))	\
1172 	MultiplyWords(p, A[i], B[j])	\
1173 	AssignWord(d, HighWord(p))		\
1174 	Acc2WordsBy1(c, LowWord(p))
1175 
1176 #define Mul_End(n)					\
1177 	R[2*n-3] = LowWord(c);			\
1178 	Acc2WordsBy1(d, HighWord(c))	\
1179 	MultiplyWords(p, A[n-1], B[n-1])\
1180 	Acc2WordsBy2(d, p)				\
1181 	R[2*n-2] = LowWord(d);			\
1182 	R[2*n-1] = HighWord(d);
1183 
1184 #define Bot_SaveAcc(k, i, j)		\
1185 	R[k] = LowWord(c);				\
1186 	word e = LowWord(d) + HighWord(c);	\
1187 	e += A[i] * B[j];
1188 
1189 #define Bot_Acc(i, j)	\
1190 	e += A[i] * B[j];
1191 
1192 #define Bot_End(n)		\
1193 	R[n-1] = e;
1194 #else
1195 #define Mul_Begin(n)				\
1196 	Declare2Words(p)				\
1197 	word c;	\
1198 	Declare2Words(d)				\
1199 	MultiplyWords(p, A[0], B[0])	\
1200 	c = LowWord(p);		\
1201 	AssignWord(d, HighWord(p))
1202 
1203 #define Mul_Acc(i, j)				\
1204 	MulAcc(c, d, A[i], B[j])
1205 
1206 #define Mul_SaveAcc(k, i, j) 		\
1207 	R[k] = c;				\
1208 	c = LowWord(d);	\
1209 	AssignWord(d, HighWord(d))	\
1210 	MulAcc(c, d, A[i], B[j])
1211 
1212 #define Mul_End(k, i)					\
1213 	R[k] = c;			\
1214 	MultiplyWords(p, A[i], B[i])	\
1215 	Acc2WordsBy2(p, d)				\
1216 	R[k+1] = LowWord(p);			\
1217 	R[k+2] = HighWord(p);
1218 
1219 #define Bot_SaveAcc(k, i, j)		\
1220 	R[k] = c;				\
1221 	c = LowWord(d);	\
1222 	c += A[i] * B[j];
1223 
1224 #define Bot_Acc(i, j)	\
1225 	c += A[i] * B[j];
1226 
1227 #define Bot_End(n)		\
1228 	R[n-1] = c;
1229 #endif
1230 
1231 #define Squ_Begin(n)				\
1232 	Declare2Words(p)				\
1233 	word c;				\
1234 	Declare2Words(d)				\
1235 	Declare2Words(e)				\
1236 	MultiplyWords(p, A[0], A[0])	\
1237 	R[0] = LowWord(p);				\
1238 	AssignWord(e, HighWord(p))		\
1239 	MultiplyWords(p, A[0], A[1])	\
1240 	c = LowWord(p);		\
1241 	AssignWord(d, HighWord(p))		\
1242 	Squ_NonDiag						\
1243 
1244 #define Squ_NonDiag				\
1245 	Double3Words(c, d)
1246 
1247 #define Squ_SaveAcc(k, i, j) 		\
1248 	Acc3WordsBy2(c, d, e)			\
1249 	R[k] = c;				\
1250 	MultiplyWords(p, A[i], A[j])	\
1251 	c = LowWord(p);		\
1252 	AssignWord(d, HighWord(p))		\
1253 
1254 #define Squ_Acc(i, j)				\
1255 	MulAcc(c, d, A[i], A[j])
1256 
1257 #define Squ_Diag(i)					\
1258 	Squ_NonDiag						\
1259 	MulAcc(c, d, A[i], A[i])
1260 
1261 #define Squ_End(n)					\
1262 	Acc3WordsBy2(c, d, e)			\
1263 	R[2*n-3] = c;			\
1264 	MultiplyWords(p, A[n-1], A[n-1])\
1265 	Acc2WordsBy2(p, e)				\
1266 	R[2*n-2] = LowWord(p);			\
1267 	R[2*n-1] = HighWord(p);
1268 
1269 
Baseline_Multiply2(word * R,const word * AA,const word * BB)1270 void Baseline_Multiply2(word *R, const word *AA, const word *BB)
1271 {
1272 	// http://github.com/weidai11/cryptopp/issues/188
1273 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1274 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1275 
1276 	Mul_2
1277 }
1278 
Baseline_Multiply4(word * R,const word * AA,const word * BB)1279 void Baseline_Multiply4(word *R, const word *AA, const word *BB)
1280 {
1281 	// http://github.com/weidai11/cryptopp/issues/188
1282 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1283 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1284 
1285 	Mul_4
1286 }
1287 
Baseline_Multiply8(word * R,const word * AA,const word * BB)1288 void Baseline_Multiply8(word *R, const word *AA, const word *BB)
1289 {
1290 	// http://github.com/weidai11/cryptopp/issues/188
1291 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1292 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1293 
1294 	Mul_8
1295 }
1296 
Baseline_Square2(word * R,const word * AA)1297 void Baseline_Square2(word *R, const word *AA)
1298 {
1299 	// http://github.com/weidai11/cryptopp/issues/188
1300 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1301 
1302 	Squ_2
1303 }
1304 
Baseline_Square4(word * R,const word * AA)1305 void Baseline_Square4(word *R, const word *AA)
1306 {
1307 	// http://github.com/weidai11/cryptopp/issues/188
1308 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1309 
1310 	Squ_4
1311 }
1312 
Baseline_Square8(word * R,const word * AA)1313 void Baseline_Square8(word *R, const word *AA)
1314 {
1315 	// http://github.com/weidai11/cryptopp/issues/188
1316 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1317 
1318 	Squ_8
1319 }
1320 
Baseline_MultiplyBottom2(word * R,const word * AA,const word * BB)1321 void Baseline_MultiplyBottom2(word *R, const word *AA, const word *BB)
1322 {
1323 	// http://github.com/weidai11/cryptopp/issues/188
1324 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1325 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1326 
1327 	Bot_2
1328 
1329 // http://github.com/weidai11/cryptopp/issues/340
1330 #if defined(TWO_64_BIT_WORDS)
1331 	CRYPTOPP_UNUSED(d0); CRYPTOPP_UNUSED(d1);
1332 #endif
1333 }
1334 
Baseline_MultiplyBottom4(word * R,const word * AA,const word * BB)1335 void Baseline_MultiplyBottom4(word *R, const word *AA, const word *BB)
1336 {
1337 	// http://github.com/weidai11/cryptopp/issues/188
1338 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1339 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1340 
1341 	Bot_4
1342 }
1343 
Baseline_MultiplyBottom8(word * R,const word * AA,const word * BB)1344 void Baseline_MultiplyBottom8(word *R, const word *AA, const word *BB)
1345 {
1346 	// http://github.com/weidai11/cryptopp/issues/188
1347 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1348 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1349 
1350 	Bot_8
1351 }
1352 
1353 #define Top_Begin(n)				\
1354 	Declare2Words(p)				\
1355 	word c;	\
1356 	Declare2Words(d)				\
1357 	MultiplyWords(p, A[0], B[n-2]);\
1358 	AssignWord(d, HighWord(p));
1359 
1360 #define Top_Acc(i, j)	\
1361 	MultiplyWords(p, A[i], B[j]);\
1362 	Acc2WordsBy1(d, HighWord(p));
1363 
1364 #define Top_SaveAcc0(i, j) 		\
1365 	c = LowWord(d);	\
1366 	AssignWord(d, HighWord(d))	\
1367 	MulAcc(c, d, A[i], B[j])
1368 
1369 #define Top_SaveAcc1(i, j) 		\
1370 	c = L<c; \
1371 	Acc2WordsBy1(d, c);	\
1372 	c = LowWord(d);	\
1373 	AssignWord(d, HighWord(d))	\
1374 	MulAcc(c, d, A[i], B[j])
1375 
Baseline_MultiplyTop2(word * R,const word * A,const word * B,word L)1376 void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1377 {
1378 	CRYPTOPP_UNUSED(L);
1379 	word T[4];
1380 	Baseline_Multiply2(T, A, B);
1381 	R[0] = T[2];
1382 	R[1] = T[3];
1383 }
1384 
Baseline_MultiplyTop4(word * R,const word * AA,const word * BB,word L)1385 void Baseline_MultiplyTop4(word *R, const word *AA, const word *BB, word L)
1386 {
1387 	// http://github.com/weidai11/cryptopp/issues/188
1388 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1389 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1390 
1391 	Top_Begin(4)
1392 	Top_Acc(1, 1) Top_Acc(2, 0)  \
1393 	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1394 	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
1395 	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1396 	Mul_End(1, 3)
1397 }
1398 
Baseline_MultiplyTop8(word * R,const word * AA,const word * BB,word L)1399 void Baseline_MultiplyTop8(word *R, const word *AA, const word *BB, word L)
1400 {
1401 	// http://github.com/weidai11/cryptopp/issues/188
1402 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1403 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1404 
1405 	Top_Begin(8)
1406 	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1407 	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1408 	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1409 	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1410 	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1411 	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1412 	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1413 	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1414 	Mul_End(5, 7)
1415 }
1416 
1417 #if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
Baseline_Multiply16(word * R,const word * AA,const word * BB)1418 void Baseline_Multiply16(word *R, const word *AA, const word *BB)
1419 {
1420 	// http://github.com/weidai11/cryptopp/issues/188
1421 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1422 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1423 
1424 	Mul_16
1425 }
1426 
Baseline_Square16(word * R,const word * AA)1427 void Baseline_Square16(word *R, const word *AA)
1428 {
1429 	// http://github.com/weidai11/cryptopp/issues/188
1430 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1431 
1432 	Squ_16
1433 }
1434 
Baseline_MultiplyBottom16(word * R,const word * AA,const word * BB)1435 void Baseline_MultiplyBottom16(word *R, const word *AA, const word *BB)
1436 {
1437 	// http://github.com/weidai11/cryptopp/issues/188
1438 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1439 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1440 
1441 	Bot_16
1442 }
1443 
Baseline_MultiplyTop16(word * R,const word * AA,const word * BB,word L)1444 void Baseline_MultiplyTop16(word *R, const word *AA, const word *BB, word L)
1445 {
1446 	// http://github.com/weidai11/cryptopp/issues/188
1447 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1448 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1449 
1450 	Top_Begin(16)
1451 	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1452 	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1453 	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1454 	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1455 	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1456 	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1457 	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1458 	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1459 	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1460 	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1461 	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1462 	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1463 	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1464 	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1465 	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1466 	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1467 	Mul_End(13, 15)
1468 }
1469 #endif
1470 
1471 // ********************************************************
1472 
1473 #if CRYPTOPP_INTEGER_SSE2
1474 
1475 CRYPTOPP_ALIGN_DATA(16)
1476 CRYPTOPP_TABLE
1477 const word32 s_maskLow16[4] = {
1478 	0xffff,0xffff,0xffff,0xffff
1479 };
1480 
1481 #undef Mul_Begin
1482 #undef Mul_Acc
1483 #undef Top_Begin
1484 #undef Top_Acc
1485 #undef Squ_Acc
1486 #undef Squ_NonDiag
1487 #undef Squ_Diag
1488 #undef Squ_SaveAcc
1489 #undef Squ_Begin
1490 #undef Mul_SaveAcc
1491 #undef Bot_Acc
1492 #undef Bot_SaveAcc
1493 #undef Bot_End
1494 #undef Squ_End
1495 #undef Mul_End
1496 
1497 #define SSE2_FinalSave(k)			\
1498 	AS2(	psllq		xmm5, 16)	\
1499 	AS2(	paddq		xmm4, xmm5)	\
1500 	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)
1501 
1502 #define SSE2_SaveShift(k)			\
1503 	AS2(	movq		xmm0, xmm6)	\
1504 	AS2(	punpckhqdq	xmm6, xmm0)	\
1505 	AS2(	movq		xmm1, xmm7)	\
1506 	AS2(	punpckhqdq	xmm7, xmm1)	\
1507 	AS2(	paddd		xmm6, xmm0)	\
1508 	AS2(	pslldq		xmm6, 4)	\
1509 	AS2(	paddd		xmm7, xmm1)	\
1510 	AS2(	paddd		xmm4, xmm6)	\
1511 	AS2(	pslldq		xmm7, 4)	\
1512 	AS2(	movq		xmm6, xmm4)	\
1513 	AS2(	paddd		xmm5, xmm7)	\
1514 	AS2(	movq		xmm7, xmm5)	\
1515 	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1516 	AS2(	psrlq		xmm6, 16)	\
1517 	AS2(	paddq		xmm6, xmm7)	\
1518 	AS2(	punpckhqdq	xmm4, xmm0)	\
1519 	AS2(	punpckhqdq	xmm5, xmm0)	\
1520 	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
1521 	AS2(	psrlq		xmm6, 3*16)	\
1522 	AS2(	paddd		xmm4, xmm6)	\
1523 
1524 #define Squ_SSE2_SaveShift(k)			\
1525 	AS2(	movq		xmm0, xmm6)	\
1526 	AS2(	punpckhqdq	xmm6, xmm0)	\
1527 	AS2(	movq		xmm1, xmm7)	\
1528 	AS2(	punpckhqdq	xmm7, xmm1)	\
1529 	AS2(	paddd		xmm6, xmm0)	\
1530 	AS2(	pslldq		xmm6, 4)	\
1531 	AS2(	paddd		xmm7, xmm1)	\
1532 	AS2(	paddd		xmm4, xmm6)	\
1533 	AS2(	pslldq		xmm7, 4)	\
1534 	AS2(	movhlps		xmm6, xmm4)	\
1535 	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1536 	AS2(	paddd		xmm5, xmm7)	\
1537 	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
1538 	AS2(	psrlq		xmm4, 16)	\
1539 	AS2(	paddq		xmm4, xmm5)	\
1540 	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
1541 	AS2(	psrlq		xmm4, 3*16)	\
1542 	AS2(	paddd		xmm4, xmm6)	\
1543 	AS2(	movq		QWORD PTR [esp+4], xmm4)\
1544 
1545 #define SSE2_FirstMultiply(i)				\
1546 	AS2(	movdqa		xmm7, [esi+(i)*16])\
1547 	AS2(	movdqa		xmm5, [edi-(i)*16])\
1548 	AS2(	pmuludq		xmm5, xmm7)		\
1549 	AS2(	movdqa		xmm4, [ebx])\
1550 	AS2(	movdqa		xmm6, xmm4)		\
1551 	AS2(	pand		xmm4, xmm5)		\
1552 	AS2(	psrld		xmm5, 16)		\
1553 	AS2(	pmuludq		xmm7, [edx-(i)*16])\
1554 	AS2(	pand		xmm6, xmm7)		\
1555 	AS2(	psrld		xmm7, 16)
1556 
1557 #define Squ_Begin(n)							\
1558 	SquPrologue									\
1559 	AS2(	mov		esi, esp)\
1560 	AS2(	and		esp, 0xfffffff0)\
1561 	AS2(	lea		edi, [esp-32*n])\
1562 	AS2(	sub		esp, 32*n+16)\
1563 	AS1(	push	esi)\
1564 	AS2(	mov		esi, edi)					\
1565 	AS2(	xor		edx, edx)					\
1566 	ASL(1)										\
1567 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1568 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1569 	AS2(	movdqa	[edi+2*edx], xmm0)		\
1570 	AS2(	psrlq	xmm0, 32)					\
1571 	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
1572 	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
1573 	AS2(	psrlq	xmm1, 32)					\
1574 	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
1575 	AS2(	add		edx, 16)					\
1576 	AS2(	cmp		edx, 8*(n))					\
1577 	ASJ(	jne,	1, b)						\
1578 	AS2(	lea		edx, [edi+16*n])\
1579 	SSE2_FirstMultiply(0)							\
1580 
1581 #define Squ_Acc(i)								\
1582 	ASL(LSqu##i)								\
1583 	AS2(	movdqa		xmm1, [esi+(i)*16])	\
1584 	AS2(	movdqa		xmm0, [edi-(i)*16])	\
1585 	AS2(	movdqa		xmm2, [ebx])	\
1586 	AS2(	pmuludq		xmm0, xmm1)				\
1587 	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
1588 	AS2(	movdqa		xmm3, xmm2)			\
1589 	AS2(	pand		xmm2, xmm0)			\
1590 	AS2(	psrld		xmm0, 16)			\
1591 	AS2(	paddd		xmm4, xmm2)			\
1592 	AS2(	paddd		xmm5, xmm0)			\
1593 	AS2(	pand		xmm3, xmm1)			\
1594 	AS2(	psrld		xmm1, 16)			\
1595 	AS2(	paddd		xmm6, xmm3)			\
1596 	AS2(	paddd		xmm7, xmm1)		\
1597 
1598 #define Squ_Acc1(i)
1599 #define Squ_Acc2(i)		ASC(call, LSqu##i)
1600 #define Squ_Acc3(i)		Squ_Acc2(i)
1601 #define Squ_Acc4(i)		Squ_Acc2(i)
1602 #define Squ_Acc5(i)		Squ_Acc2(i)
1603 #define Squ_Acc6(i)		Squ_Acc2(i)
1604 #define Squ_Acc7(i)		Squ_Acc2(i)
1605 #define Squ_Acc8(i)		Squ_Acc2(i)
1606 
1607 #define SSE2_End(E, n)					\
1608 	SSE2_SaveShift(2*(n)-3)			\
1609 	AS2(	movdqa		xmm7, [esi+16])	\
1610 	AS2(	movdqa		xmm0, [edi])	\
1611 	AS2(	pmuludq		xmm0, xmm7)				\
1612 	AS2(	movdqa		xmm2, [ebx])		\
1613 	AS2(	pmuludq		xmm7, [edx])	\
1614 	AS2(	movdqa		xmm6, xmm2)				\
1615 	AS2(	pand		xmm2, xmm0)				\
1616 	AS2(	psrld		xmm0, 16)				\
1617 	AS2(	paddd		xmm4, xmm2)				\
1618 	AS2(	paddd		xmm5, xmm0)				\
1619 	AS2(	pand		xmm6, xmm7)				\
1620 	AS2(	psrld		xmm7, 16)	\
1621 	SSE2_SaveShift(2*(n)-2)			\
1622 	SSE2_FinalSave(2*(n)-1)			\
1623 	AS1(	pop		esp)\
1624 	E
1625 
1626 #define Squ_End(n)		SSE2_End(SquEpilogue, n)
1627 #define Mul_End(n)		SSE2_End(MulEpilogue, n)
1628 #define Top_End(n)		SSE2_End(TopEpilogue, n)
1629 
1630 #define Squ_Column1(k, i)	\
1631 	Squ_SSE2_SaveShift(k)					\
1632 	AS2(	add			esi, 16)	\
1633 	SSE2_FirstMultiply(1)\
1634 	Squ_Acc##i(i)	\
1635 	AS2(	paddd		xmm4, xmm4)		\
1636 	AS2(	paddd		xmm5, xmm5)		\
1637 	AS2(	movdqa		xmm3, [esi])				\
1638 	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
1639 	AS2(	pmuludq		xmm1, xmm3)		\
1640 	AS2(	pmuludq		xmm3, xmm3)		\
1641 	AS2(	movdqa		xmm0, [ebx])\
1642 	AS2(	movdqa		xmm2, xmm0)		\
1643 	AS2(	pand		xmm0, xmm1)		\
1644 	AS2(	psrld		xmm1, 16)		\
1645 	AS2(	paddd		xmm6, xmm0)		\
1646 	AS2(	paddd		xmm7, xmm1)		\
1647 	AS2(	pand		xmm2, xmm3)		\
1648 	AS2(	psrld		xmm3, 16)		\
1649 	AS2(	paddd		xmm6, xmm6)		\
1650 	AS2(	paddd		xmm7, xmm7)		\
1651 	AS2(	paddd		xmm4, xmm2)		\
1652 	AS2(	paddd		xmm5, xmm3)		\
1653 	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1654 	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1655 	AS2(	paddd		xmm4, xmm0)\
1656 	AS2(	paddd		xmm5, xmm1)\
1657 
1658 #define Squ_Column0(k, i)	\
1659 	Squ_SSE2_SaveShift(k)					\
1660 	AS2(	add			edi, 16)	\
1661 	AS2(	add			edx, 16)	\
1662 	SSE2_FirstMultiply(1)\
1663 	Squ_Acc##i(i)	\
1664 	AS2(	paddd		xmm6, xmm6)		\
1665 	AS2(	paddd		xmm7, xmm7)		\
1666 	AS2(	paddd		xmm4, xmm4)		\
1667 	AS2(	paddd		xmm5, xmm5)		\
1668 	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1669 	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1670 	AS2(	paddd		xmm4, xmm0)\
1671 	AS2(	paddd		xmm5, xmm1)\
1672 
1673 #define SSE2_MulAdd45						\
1674 	AS2(	movdqa		xmm7, [esi])	\
1675 	AS2(	movdqa		xmm0, [edi])	\
1676 	AS2(	pmuludq		xmm0, xmm7)				\
1677 	AS2(	movdqa		xmm2, [ebx])		\
1678 	AS2(	pmuludq		xmm7, [edx])	\
1679 	AS2(	movdqa		xmm6, xmm2)				\
1680 	AS2(	pand		xmm2, xmm0)				\
1681 	AS2(	psrld		xmm0, 16)				\
1682 	AS2(	paddd		xmm4, xmm2)				\
1683 	AS2(	paddd		xmm5, xmm0)				\
1684 	AS2(	pand		xmm6, xmm7)				\
1685 	AS2(	psrld		xmm7, 16)
1686 
1687 #define Mul_Begin(n)							\
1688 	MulPrologue									\
1689 	AS2(	mov		esi, esp)\
1690 	AS2(	and		esp, 0xfffffff0)\
1691 	AS2(	sub		esp, 48*n+16)\
1692 	AS1(	push	esi)\
1693 	AS2(	xor		edx, edx)					\
1694 	ASL(1)										\
1695 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1696 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1697 	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1698 	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1699 	AS2(	psrlq	xmm0, 32)					\
1700 	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1701 	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1702 	AS2(	psrlq	xmm1, 32)					\
1703 	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1704 	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1705 	AS2(	psrlq	xmm2, 32)					\
1706 	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1707 	AS2(	add		edx, 16)					\
1708 	AS2(	cmp		edx, 8*(n))					\
1709 	ASJ(	jne,	1, b)						\
1710 	AS2(	lea		edi, [esp+20])\
1711 	AS2(	lea		edx, [esp+20+16*n])\
1712 	AS2(	lea		esi, [esp+20+32*n])\
1713 	SSE2_FirstMultiply(0)							\
1714 
1715 #define Mul_Acc(i)								\
1716 	ASL(LMul##i)										\
1717 	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1718 	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1719 	AS2(	movdqa		xmm2, [ebx])	\
1720 	AS2(	pmuludq		xmm0, xmm1)				\
1721 	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1722 	AS2(	movdqa		xmm3, xmm2)			\
1723 	AS2(	pand		xmm2, xmm0)			\
1724 	AS2(	psrld		xmm0, 16)			\
1725 	AS2(	paddd		xmm4, xmm2)			\
1726 	AS2(	paddd		xmm5, xmm0)			\
1727 	AS2(	pand		xmm3, xmm1)			\
1728 	AS2(	psrld		xmm1, 16)			\
1729 	AS2(	paddd		xmm6, xmm3)			\
1730 	AS2(	paddd		xmm7, xmm1)		\
1731 
1732 #define Mul_Acc1(i)
1733 #define Mul_Acc2(i)		ASC(call, LMul##i)
1734 #define Mul_Acc3(i)		Mul_Acc2(i)
1735 #define Mul_Acc4(i)		Mul_Acc2(i)
1736 #define Mul_Acc5(i)		Mul_Acc2(i)
1737 #define Mul_Acc6(i)		Mul_Acc2(i)
1738 #define Mul_Acc7(i)		Mul_Acc2(i)
1739 #define Mul_Acc8(i)		Mul_Acc2(i)
1740 #define Mul_Acc9(i)		Mul_Acc2(i)
1741 #define Mul_Acc10(i)	Mul_Acc2(i)
1742 #define Mul_Acc11(i)	Mul_Acc2(i)
1743 #define Mul_Acc12(i)	Mul_Acc2(i)
1744 #define Mul_Acc13(i)	Mul_Acc2(i)
1745 #define Mul_Acc14(i)	Mul_Acc2(i)
1746 #define Mul_Acc15(i)	Mul_Acc2(i)
1747 #define Mul_Acc16(i)	Mul_Acc2(i)
1748 
1749 #define Mul_Column1(k, i)	\
1750 	SSE2_SaveShift(k)					\
1751 	AS2(	add			esi, 16)	\
1752 	SSE2_MulAdd45\
1753 	Mul_Acc##i(i)	\
1754 
1755 #define Mul_Column0(k, i)	\
1756 	SSE2_SaveShift(k)					\
1757 	AS2(	add			edi, 16)	\
1758 	AS2(	add			edx, 16)	\
1759 	SSE2_MulAdd45\
1760 	Mul_Acc##i(i)	\
1761 
1762 #define Bot_Acc(i)							\
1763 	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1764 	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1765 	AS2(	pmuludq		xmm0, xmm1)				\
1766 	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
1767 	AS2(	paddq		xmm4, xmm0)				\
1768 	AS2(	paddd		xmm6, xmm1)
1769 
1770 #define Bot_SaveAcc(k)					\
1771 	SSE2_SaveShift(k)							\
1772 	AS2(	add			edi, 16)	\
1773 	AS2(	add			edx, 16)	\
1774 	AS2(	movdqa		xmm6, [esi])	\
1775 	AS2(	movdqa		xmm0, [edi])	\
1776 	AS2(	pmuludq		xmm0, xmm6)				\
1777 	AS2(	paddq		xmm4, xmm0)				\
1778 	AS2(	psllq		xmm5, 16)				\
1779 	AS2(	paddq		xmm4, xmm5)				\
1780 	AS2(	pmuludq		xmm6, [edx])
1781 
1782 #define Bot_End(n)							\
1783 	AS2(	movhlps		xmm7, xmm6)			\
1784 	AS2(	paddd		xmm6, xmm7)			\
1785 	AS2(	psllq		xmm6, 32)			\
1786 	AS2(	paddd		xmm4, xmm6)			\
1787 	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
1788 	AS1(	pop		esp)\
1789 	MulEpilogue
1790 
1791 #define Top_Begin(n)							\
1792 	TopPrologue									\
1793 	AS2(	mov		edx, esp)\
1794 	AS2(	and		esp, 0xfffffff0)\
1795 	AS2(	sub		esp, 48*n+16)\
1796 	AS1(	push	edx)\
1797 	AS2(	xor		edx, edx)					\
1798 	ASL(1)										\
1799 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1800 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1801 	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1802 	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1803 	AS2(	psrlq	xmm0, 32)					\
1804 	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1805 	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1806 	AS2(	psrlq	xmm1, 32)					\
1807 	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1808 	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1809 	AS2(	psrlq	xmm2, 32)					\
1810 	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1811 	AS2(	add		edx, 16)					\
1812 	AS2(	cmp		edx, 8*(n))					\
1813 	ASJ(	jne,	1, b)						\
1814 	AS2(	mov		eax, esi)					\
1815 	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
1816 	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
1817 	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
1818 	AS2(	pxor	xmm4, xmm4)\
1819 	AS2(	pxor	xmm5, xmm5)
1820 
1821 #define Top_Acc(i)							\
1822 	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
1823 	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1824 	AS2(	psrlq		xmm0, 48)				\
1825 	AS2(	paddd		xmm5, xmm0)\
1826 
1827 #define Top_Column0(i)	\
1828 	AS2(	psllq		xmm5, 32)				\
1829 	AS2(	add			edi, 16)	\
1830 	AS2(	add			edx, 16)	\
1831 	SSE2_MulAdd45\
1832 	Mul_Acc##i(i)	\
1833 
1834 #define Top_Column1(i)	\
1835 	SSE2_SaveShift(0)					\
1836 	AS2(	add			esi, 16)	\
1837 	SSE2_MulAdd45\
1838 	Mul_Acc##i(i)	\
1839 	AS2(	shr			eax, 16)	\
1840 	AS2(	movd		xmm0, eax)\
1841 	AS2(	movd		xmm1, [ecx+4])\
1842 	AS2(	psrld		xmm1, 16)\
1843 	AS2(	pcmpgtd		xmm1, xmm0)\
1844 	AS2(	psrld		xmm1, 31)\
1845 	AS2(	paddd		xmm4, xmm1)\
1846 
SSE2_Square4(word * C,const word * A)1847 void SSE2_Square4(word *C, const word *A)
1848 {
1849 	Squ_Begin(2)
1850 	Squ_Column0(0, 1)
1851 	Squ_End(2)
1852 }
1853 
SSE2_Square8(word * C,const word * A)1854 void SSE2_Square8(word *C, const word *A)
1855 {
1856 	Squ_Begin(4)
1857 #ifndef __GNUC__
1858 	ASJ(	jmp,	0, f)
1859 	Squ_Acc(2)
1860 	AS1(	ret) ASL(0)
1861 #endif
1862 	Squ_Column0(0, 1)
1863 	Squ_Column1(1, 1)
1864 	Squ_Column0(2, 2)
1865 	Squ_Column1(3, 1)
1866 	Squ_Column0(4, 1)
1867 	Squ_End(4)
1868 }
1869 
SSE2_Square16(word * C,const word * A)1870 void SSE2_Square16(word *C, const word *A)
1871 {
1872 	Squ_Begin(8)
1873 #ifndef __GNUC__
1874 	ASJ(	jmp,	0, f)
1875 	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1876 	AS1(	ret) ASL(0)
1877 #endif
1878 	Squ_Column0(0, 1)
1879 	Squ_Column1(1, 1)
1880 	Squ_Column0(2, 2)
1881 	Squ_Column1(3, 2)
1882 	Squ_Column0(4, 3)
1883 	Squ_Column1(5, 3)
1884 	Squ_Column0(6, 4)
1885 	Squ_Column1(7, 3)
1886 	Squ_Column0(8, 3)
1887 	Squ_Column1(9, 2)
1888 	Squ_Column0(10, 2)
1889 	Squ_Column1(11, 1)
1890 	Squ_Column0(12, 1)
1891 	Squ_End(8)
1892 }
1893 
SSE2_Square32(word * C,const word * A)1894 void SSE2_Square32(word *C, const word *A)
1895 {
1896 	Squ_Begin(16)
1897 	ASJ(	jmp,	0, f)
1898 	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1899 	AS1(	ret) ASL(0)
1900 	Squ_Column0(0, 1)
1901 	Squ_Column1(1, 1)
1902 	Squ_Column0(2, 2)
1903 	Squ_Column1(3, 2)
1904 	Squ_Column0(4, 3)
1905 	Squ_Column1(5, 3)
1906 	Squ_Column0(6, 4)
1907 	Squ_Column1(7, 4)
1908 	Squ_Column0(8, 5)
1909 	Squ_Column1(9, 5)
1910 	Squ_Column0(10, 6)
1911 	Squ_Column1(11, 6)
1912 	Squ_Column0(12, 7)
1913 	Squ_Column1(13, 7)
1914 	Squ_Column0(14, 8)
1915 	Squ_Column1(15, 7)
1916 	Squ_Column0(16, 7)
1917 	Squ_Column1(17, 6)
1918 	Squ_Column0(18, 6)
1919 	Squ_Column1(19, 5)
1920 	Squ_Column0(20, 5)
1921 	Squ_Column1(21, 4)
1922 	Squ_Column0(22, 4)
1923 	Squ_Column1(23, 3)
1924 	Squ_Column0(24, 3)
1925 	Squ_Column1(25, 2)
1926 	Squ_Column0(26, 2)
1927 	Squ_Column1(27, 1)
1928 	Squ_Column0(28, 1)
1929 	Squ_End(16)
1930 }
1931 
SSE2_Multiply4(word * C,const word * A,const word * B)1932 void SSE2_Multiply4(word *C, const word *A, const word *B)
1933 {
1934 	Mul_Begin(2)
1935 #ifndef __GNUC__
1936 	ASJ(	jmp,	0, f)
1937 	Mul_Acc(2)
1938 	AS1(	ret) ASL(0)
1939 #endif
1940 	Mul_Column0(0, 2)
1941 	Mul_End(2)
1942 }
1943 
SSE2_Multiply8(word * C,const word * A,const word * B)1944 void SSE2_Multiply8(word *C, const word *A, const word *B)
1945 {
1946 	Mul_Begin(4)
1947 #ifndef __GNUC__
1948 	ASJ(	jmp,	0, f)
1949 	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1950 	AS1(	ret) ASL(0)
1951 #endif
1952 	Mul_Column0(0, 2)
1953 	Mul_Column1(1, 3)
1954 	Mul_Column0(2, 4)
1955 	Mul_Column1(3, 3)
1956 	Mul_Column0(4, 2)
1957 	Mul_End(4)
1958 }
1959 
SSE2_Multiply16(word * C,const word * A,const word * B)1960 void SSE2_Multiply16(word *C, const word *A, const word *B)
1961 {
1962 	Mul_Begin(8)
1963 #ifndef __GNUC__
1964 	ASJ(	jmp,	0, f)
1965 	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1966 	AS1(	ret) ASL(0)
1967 #endif
1968 	Mul_Column0(0, 2)
1969 	Mul_Column1(1, 3)
1970 	Mul_Column0(2, 4)
1971 	Mul_Column1(3, 5)
1972 	Mul_Column0(4, 6)
1973 	Mul_Column1(5, 7)
1974 	Mul_Column0(6, 8)
1975 	Mul_Column1(7, 7)
1976 	Mul_Column0(8, 6)
1977 	Mul_Column1(9, 5)
1978 	Mul_Column0(10, 4)
1979 	Mul_Column1(11, 3)
1980 	Mul_Column0(12, 2)
1981 	Mul_End(8)
1982 }
1983 
SSE2_Multiply32(word * C,const word * A,const word * B)1984 void SSE2_Multiply32(word *C, const word *A, const word *B)
1985 {
1986 	Mul_Begin(16)
1987 	ASJ(	jmp,	0, f)
1988 	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1989 	AS1(	ret) ASL(0)
1990 	Mul_Column0(0, 2)
1991 	Mul_Column1(1, 3)
1992 	Mul_Column0(2, 4)
1993 	Mul_Column1(3, 5)
1994 	Mul_Column0(4, 6)
1995 	Mul_Column1(5, 7)
1996 	Mul_Column0(6, 8)
1997 	Mul_Column1(7, 9)
1998 	Mul_Column0(8, 10)
1999 	Mul_Column1(9, 11)
2000 	Mul_Column0(10, 12)
2001 	Mul_Column1(11, 13)
2002 	Mul_Column0(12, 14)
2003 	Mul_Column1(13, 15)
2004 	Mul_Column0(14, 16)
2005 	Mul_Column1(15, 15)
2006 	Mul_Column0(16, 14)
2007 	Mul_Column1(17, 13)
2008 	Mul_Column0(18, 12)
2009 	Mul_Column1(19, 11)
2010 	Mul_Column0(20, 10)
2011 	Mul_Column1(21, 9)
2012 	Mul_Column0(22, 8)
2013 	Mul_Column1(23, 7)
2014 	Mul_Column0(24, 6)
2015 	Mul_Column1(25, 5)
2016 	Mul_Column0(26, 4)
2017 	Mul_Column1(27, 3)
2018 	Mul_Column0(28, 2)
2019 	Mul_End(16)
2020 }
2021 
SSE2_MultiplyBottom4(word * C,const word * A,const word * B)2022 void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
2023 {
2024 	Mul_Begin(2)
2025 	Bot_SaveAcc(0) Bot_Acc(2)
2026 	Bot_End(2)
2027 }
2028 
SSE2_MultiplyBottom8(word * C,const word * A,const word * B)2029 void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
2030 {
2031 	Mul_Begin(4)
2032 #ifndef __GNUC__
2033 	ASJ(	jmp,	0, f)
2034 	Mul_Acc(3) Mul_Acc(2)
2035 	AS1(	ret) ASL(0)
2036 #endif
2037 	Mul_Column0(0, 2)
2038 	Mul_Column1(1, 3)
2039 	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2040 	Bot_End(4)
2041 }
2042 
SSE2_MultiplyBottom16(word * C,const word * A,const word * B)2043 void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
2044 {
2045 	Mul_Begin(8)
2046 #ifndef __GNUC__
2047 	ASJ(	jmp,	0, f)
2048 	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2049 	AS1(	ret) ASL(0)
2050 #endif
2051 	Mul_Column0(0, 2)
2052 	Mul_Column1(1, 3)
2053 	Mul_Column0(2, 4)
2054 	Mul_Column1(3, 5)
2055 	Mul_Column0(4, 6)
2056 	Mul_Column1(5, 7)
2057 	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2058 	Bot_End(8)
2059 }
2060 
SSE2_MultiplyBottom32(word * C,const word * A,const word * B)2061 void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
2062 {
2063 	Mul_Begin(16)
2064 #ifndef __GNUC__
2065 	ASJ(	jmp,	0, f)
2066 	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2067 	AS1(	ret) ASL(0)
2068 #endif
2069 	Mul_Column0(0, 2)
2070 	Mul_Column1(1, 3)
2071 	Mul_Column0(2, 4)
2072 	Mul_Column1(3, 5)
2073 	Mul_Column0(4, 6)
2074 	Mul_Column1(5, 7)
2075 	Mul_Column0(6, 8)
2076 	Mul_Column1(7, 9)
2077 	Mul_Column0(8, 10)
2078 	Mul_Column1(9, 11)
2079 	Mul_Column0(10, 12)
2080 	Mul_Column1(11, 13)
2081 	Mul_Column0(12, 14)
2082 	Mul_Column1(13, 15)
2083 	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2084 	Bot_End(16)
2085 }
2086 
SSE2_MultiplyTop8(word * C,const word * A,const word * B,word L)2087 void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
2088 {
2089 	Top_Begin(4)
2090 	Top_Acc(3) Top_Acc(2) Top_Acc(1)
2091 #ifndef __GNUC__
2092 	ASJ(	jmp,	0, f)
2093 	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2094 	AS1(	ret) ASL(0)
2095 #endif
2096 	Top_Column0(4)
2097 	Top_Column1(3)
2098 	Mul_Column0(0, 2)
2099 	Top_End(2)
2100 }
2101 
SSE2_MultiplyTop16(word * C,const word * A,const word * B,word L)2102 void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
2103 {
2104 	Top_Begin(8)
2105 	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
2106 #ifndef __GNUC__
2107 	ASJ(	jmp,	0, f)
2108 	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2109 	AS1(	ret) ASL(0)
2110 #endif
2111 	Top_Column0(8)
2112 	Top_Column1(7)
2113 	Mul_Column0(0, 6)
2114 	Mul_Column1(1, 5)
2115 	Mul_Column0(2, 4)
2116 	Mul_Column1(3, 3)
2117 	Mul_Column0(4, 2)
2118 	Top_End(4)
2119 }
2120 
SSE2_MultiplyTop32(word * C,const word * A,const word * B,word L)2121 void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
2122 {
2123 	Top_Begin(16)
2124 	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
2125 #ifndef __GNUC__
2126 	ASJ(	jmp,	0, f)
2127 	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2128 	AS1(	ret) ASL(0)
2129 #endif
2130 	Top_Column0(16)
2131 	Top_Column1(15)
2132 	Mul_Column0(0, 14)
2133 	Mul_Column1(1, 13)
2134 	Mul_Column0(2, 12)
2135 	Mul_Column1(3, 11)
2136 	Mul_Column0(4, 10)
2137 	Mul_Column1(5, 9)
2138 	Mul_Column0(6, 8)
2139 	Mul_Column1(7, 7)
2140 	Mul_Column0(8, 6)
2141 	Mul_Column1(9, 5)
2142 	Mul_Column0(10, 4)
2143 	Mul_Column1(11, 3)
2144 	Mul_Column0(12, 2)
2145 	Top_End(8)
2146 }
2147 
2148 #endif	// #if CRYPTOPP_INTEGER_SSE2
2149 
2150 // ********************************************************
2151 
2152 typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
2153 typedef void (* PMul)(word *C, const word *A, const word *B);
2154 typedef void (* PSqu)(word *C, const word *A);
2155 typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
2156 
2157 #if CRYPTOPP_INTEGER_SSE2
2158 static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
2159 static size_t s_recursionLimit = 8;
2160 #else
2161 static const size_t s_recursionLimit = 16;
2162 #endif  // CRYPTOPP_INTEGER_SSE2
2163 
2164 static PMul s_pMul[9], s_pBot[9];
2165 static PSqu s_pSqu[9];
2166 static PMulTop s_pTop[9];
2167 
SetFunctionPointers()2168 void SetFunctionPointers()
2169 {
2170 	s_pMul[0] = &Baseline_Multiply2;
2171 	s_pBot[0] = &Baseline_MultiplyBottom2;
2172 	s_pSqu[0] = &Baseline_Square2;
2173 	s_pTop[0] = &Baseline_MultiplyTop2;
2174 	s_pTop[1] = &Baseline_MultiplyTop4;
2175 
2176 #if CRYPTOPP_INTEGER_SSE2
2177 	if (HasSSE2())
2178 	{
2179 		if (IsP4())
2180 		{
2181 			s_pAdd = &SSE2_Add;
2182 			s_pSub = &SSE2_Sub;
2183 		}
2184 
2185 		s_recursionLimit = 32;
2186 
2187 		s_pMul[1] = &SSE2_Multiply4;
2188 		s_pMul[2] = &SSE2_Multiply8;
2189 		s_pMul[4] = &SSE2_Multiply16;
2190 		s_pMul[8] = &SSE2_Multiply32;
2191 
2192 		s_pBot[1] = &SSE2_MultiplyBottom4;
2193 		s_pBot[2] = &SSE2_MultiplyBottom8;
2194 		s_pBot[4] = &SSE2_MultiplyBottom16;
2195 		s_pBot[8] = &SSE2_MultiplyBottom32;
2196 
2197 		s_pSqu[1] = &SSE2_Square4;
2198 		s_pSqu[2] = &SSE2_Square8;
2199 		s_pSqu[4] = &SSE2_Square16;
2200 		s_pSqu[8] = &SSE2_Square32;
2201 
2202 		s_pTop[2] = &SSE2_MultiplyTop8;
2203 		s_pTop[4] = &SSE2_MultiplyTop16;
2204 		s_pTop[8] = &SSE2_MultiplyTop32;
2205 	}
2206 	else
2207 #endif  // CRYPTOPP_INTEGER_SSE2
2208 	{
2209 		s_pMul[1] = &Baseline_Multiply4;
2210 		s_pMul[2] = &Baseline_Multiply8;
2211 
2212 		s_pBot[1] = &Baseline_MultiplyBottom4;
2213 		s_pBot[2] = &Baseline_MultiplyBottom8;
2214 
2215 		s_pSqu[1] = &Baseline_Square4;
2216 		s_pSqu[2] = &Baseline_Square8;
2217 
2218 		s_pTop[2] = &Baseline_MultiplyTop8;
2219 
2220 #if	!CRYPTOPP_INTEGER_SSE2
2221 		s_pMul[4] = &Baseline_Multiply16;
2222 		s_pBot[4] = &Baseline_MultiplyBottom16;
2223 		s_pSqu[4] = &Baseline_Square16;
2224 		s_pTop[4] = &Baseline_MultiplyTop16;
2225 #endif  // !CRYPTOPP_INTEGER_SSE2
2226 	}
2227 }
2228 
Add(word * C,const word * A,const word * B,size_t N)2229 inline int Add(word *C, const word *A, const word *B, size_t N)
2230 {
2231 #if CRYPTOPP_INTEGER_SSE2
2232 	return s_pAdd(N, C, A, B);
2233 #else
2234 	return Baseline_Add(N, C, A, B);
2235 #endif  // CRYPTOPP_INTEGER_SSE2
2236 }
2237 
Subtract(word * C,const word * A,const word * B,size_t N)2238 inline int Subtract(word *C, const word *A, const word *B, size_t N)
2239 {
2240 #if CRYPTOPP_INTEGER_SSE2
2241 	return s_pSub(N, C, A, B);
2242 #else
2243 	return Baseline_Sub(N, C, A, B);
2244 #endif  // CRYPTOPP_INTEGER_SSE2
2245 }
2246 
2247 // ********************************************************
2248 
2249 
2250 #define A0		A
2251 #define A1		(A+N2)
2252 #define B0		B
2253 #define B1		(B+N2)
2254 
2255 #define T0		T
2256 #define T1		(T+N2)
2257 #define T2		(T+N)
2258 #define T3		(T+N+N2)
2259 
2260 #define R0		R
2261 #define R1		(R+N2)
2262 #define R2		(R+N)
2263 #define R3		(R+N+N2)
2264 
2265 // R[2*N] - result = A*B
2266 // T[2*N] - temporary work space
2267 // A[N] --- multiplier
2268 // B[N] --- multiplicant
2269 
RecursiveMultiply(word * R,word * T,const word * A,const word * B,size_t N)2270 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
2271 {
2272 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2273 
2274 	if (N <= s_recursionLimit)
2275 		s_pMul[N/4](R, A, B);
2276 	else
2277 	{
2278 		const size_t N2 = N/2;
2279 
2280 		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2281 		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2282 
2283 		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2284 		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2285 
2286 		RecursiveMultiply(R2, T2, A1, B1, N2);
2287 		RecursiveMultiply(T0, T2, R0, R1, N2);
2288 		RecursiveMultiply(R0, T2, A0, B0, N2);
2289 
2290 		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2291 
2292 		int c2 = Add(R2, R2, R1, N2);
2293 		int c3 = c2;
2294 		c2 += Add(R1, R2, R0, N2);
2295 		c3 += Add(R2, R2, R3, N2);
2296 
2297 		if (AN2 == BN2)
2298 			c3 -= Subtract(R1, R1, T0, N);
2299 		else
2300 			c3 += Add(R1, R1, T0, N);
2301 
2302 		c3 += Increment(R2, N2, c2);
2303 		CRYPTOPP_ASSERT (c3 >= 0 && c3 <= 2);
2304 		Increment(R3, N2, c3);
2305 	}
2306 }
2307 
2308 // R[2*N] - result = A*A
2309 // T[2*N] - temporary work space
2310 // A[N] --- number to be squared
2311 
RecursiveSquare(word * R,word * T,const word * A,size_t N)2312 void RecursiveSquare(word *R, word *T, const word *A, size_t N)
2313 {
2314 	CRYPTOPP_ASSERT(N && N%2==0);
2315 
2316 	if (N <= s_recursionLimit)
2317 		s_pSqu[N/4](R, A);
2318 	else
2319 	{
2320 		const size_t N2 = N/2;
2321 
2322 		RecursiveSquare(R0, T2, A0, N2);
2323 		RecursiveSquare(R2, T2, A1, N2);
2324 		RecursiveMultiply(T0, T2, A0, A1, N2);
2325 
2326 		int carry = Add(R1, R1, T0, N);
2327 		carry += Add(R1, R1, T0, N);
2328 		Increment(R3, N2, carry);
2329 	}
2330 }
2331 
2332 // R[N] - bottom half of A*B
2333 // T[3*N/2] - temporary work space
2334 // A[N] - multiplier
2335 // B[N] - multiplicant
2336 
RecursiveMultiplyBottom(word * R,word * T,const word * A,const word * B,size_t N)2337 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2338 {
2339 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2340 
2341 	if (N <= s_recursionLimit)
2342 		s_pBot[N/4](R, A, B);
2343 	else
2344 	{
2345 		const size_t N2 = N/2;
2346 
2347 		RecursiveMultiply(R, T, A0, B0, N2);
2348 		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2349 		Add(R1, R1, T0, N2);
2350 		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2351 		Add(R1, R1, T0, N2);
2352 	}
2353 }
2354 
2355 // R[N] --- upper half of A*B
2356 // T[2*N] - temporary work space
2357 // L[N] --- lower half of A*B
2358 // A[N] --- multiplier
2359 // B[N] --- multiplicant
2360 
MultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,size_t N)2361 void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
2362 {
2363 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2364 
2365 	if (N <= s_recursionLimit)
2366 		s_pTop[N/4](R, A, B, L[N-1]);
2367 	else
2368 	{
2369 		const size_t N2 = N/2;
2370 
2371 		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2372 		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2373 
2374 		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2375 		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2376 
2377 		RecursiveMultiply(T0, T2, R0, R1, N2);
2378 		RecursiveMultiply(R0, T2, A1, B1, N2);
2379 
2380 		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
2381 
2382 		int t, c3;
2383 		int c2 = Subtract(T2, L+N2, L, N2);
2384 
2385 		if (AN2 == BN2)
2386 		{
2387 			c2 -= Add(T2, T2, T0, N2);
2388 			t = (Compare(T2, R0, N2) == -1);
2389 			c3 = t - Subtract(T2, T2, T1, N2);
2390 		}
2391 		else
2392 		{
2393 			c2 += Subtract(T2, T2, T0, N2);
2394 			t = (Compare(T2, R0, N2) == -1);
2395 			c3 = t + Add(T2, T2, T1, N2);
2396 		}
2397 
2398 		c2 += t;
2399 		if (c2 >= 0)
2400 			c3 += Increment(T2, N2, c2);
2401 		else
2402 			c3 -= Decrement(T2, N2, -c2);
2403 		c3 += Add(R0, T2, R1, N2);
2404 
2405 		CRYPTOPP_ASSERT (c3 >= 0 && c3 <= 2);
2406 		Increment(R1, N2, c3);
2407 	}
2408 }
2409 
Multiply(word * R,word * T,const word * A,const word * B,size_t N)2410 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
2411 {
2412 	RecursiveMultiply(R, T, A, B, N);
2413 }
2414 
Square(word * R,word * T,const word * A,size_t N)2415 inline void Square(word *R, word *T, const word *A, size_t N)
2416 {
2417 	RecursiveSquare(R, T, A, N);
2418 }
2419 
MultiplyBottom(word * R,word * T,const word * A,const word * B,size_t N)2420 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2421 {
2422 	RecursiveMultiplyBottom(R, T, A, B, N);
2423 }
2424 
2425 // R[NA+NB] - result = A*B
2426 // T[NA+NB] - temporary work space
2427 // A[NA] ---- multiplier
2428 // B[NB] ---- multiplicant
2429 
AsymmetricMultiply(word * R,word * T,const word * A,size_t NA,const word * B,size_t NB)2430 void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
2431 {
2432 	if (NA == NB)
2433 	{
2434 		// Profiling guided the flow below.
2435 		if (A != B)
2436 			Multiply(R, T, A, B, NA);
2437 		else
2438 			Square(R, T, A, NA);
2439 
2440 		return;
2441 	}
2442 
2443 	if (NA > NB)
2444 	{
2445 		std::swap(A, B);
2446 		std::swap(NA, NB);
2447 	}
2448 
2449 	CRYPTOPP_ASSERT(NB % NA == 0);
2450 
2451 	if (NA==2 && !A[1])
2452 	{
2453 		// Profiling guided the flow below.
2454 		switch (A[0])
2455 		{
2456 		default:
2457 			R[NB] = LinearMultiply(R, B, A[0], NB);
2458 			R[NB+1] = 0;
2459 			return;
2460 		case 0:
2461 			SetWords(R, 0, NB+2);
2462 			return;
2463 		case 1:
2464 			CopyWords(R, B, NB);
2465 			R[NB] = R[NB+1] = 0;
2466 			return;
2467 		}
2468 	}
2469 
2470 	size_t i;
2471 	if ((NB/NA)%2 == 0)
2472 	{
2473 		Multiply(R, T, A, B, NA);
2474 		CopyWords(T+2*NA, R+NA, NA);
2475 
2476 		for (i=2*NA; i<NB; i+=2*NA)
2477 			Multiply(T+NA+i, T, A, B+i, NA);
2478 		for (i=NA; i<NB; i+=2*NA)
2479 			Multiply(R+i, T, A, B+i, NA);
2480 	}
2481 	else
2482 	{
2483 		for (i=0; i<NB; i+=2*NA)
2484 			Multiply(R+i, T, A, B+i, NA);
2485 		for (i=NA; i<NB; i+=2*NA)
2486 			Multiply(T+NA+i, T, A, B+i, NA);
2487 	}
2488 
2489 	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2490 		Increment(R+NB, NA);
2491 }
2492 
2493 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2494 // T[3*N/2] - temporary work space
2495 // A[N] ----- an odd number as input
2496 
RecursiveInverseModPower2(word * R,word * T,const word * A,size_t N)2497 void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
2498 {
2499 	// Profiling guided the flow below.
2500 	if (N!=2)
2501 	{
2502 		const size_t N2 = N/2;
2503 		RecursiveInverseModPower2(R0, T0, A0, N2);
2504 		T0[0] = 1;
2505 		SetWords(T0+1, 0, N2-1);
2506 		MultiplyTop(R1, T1, T0, R0, A0, N2);
2507 		MultiplyBottom(T0, T1, R0, A1, N2);
2508 		Add(T0, R1, T0, N2);
2509 		TwosComplement(T0, N2);
2510 		MultiplyBottom(R1, T1, R0, T0, N2);
2511 	}
2512 	else
2513 	{
2514 		T[0] = AtomicInverseModPower2(A[0]);
2515 		T[1] = 0;
2516 		s_pBot[0](T+2, T, A);
2517 		TwosComplement(T+2, 2);
2518 		Increment(T+2, 2, 2);
2519 		s_pBot[0](R, T, T+2);
2520 	}
2521 }
2522 
2523 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2524 // T[3*N] - temporary work space
2525 // X[2*N] - number to be reduced
2526 // M[N] --- modulus
2527 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2528 
MontgomeryReduce(word * R,word * T,word * X,const word * M,const word * U,size_t N)2529 void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
2530 {
2531 #if 1
2532 	MultiplyBottom(R, T, X, U, N);
2533 	MultiplyTop(T, T+N, X, R, M, N);
2534 	word borrow = Subtract(T, X+N, T, N);
2535 	// defend against timing attack by doing this Add even when not needed
2536 	word carry = Add(T+N, T, M, N);
2537 	CRYPTOPP_ASSERT(carry | !borrow);
2538 	CRYPTOPP_UNUSED(carry), CRYPTOPP_UNUSED(borrow);
2539 	CopyWords(R, T + ((0-borrow) & N), N);
2540 #elif 0
2541 	const word u = 0-U[0];
2542 	Declare2Words(p)
2543 	for (size_t i=0; i<N; i++)
2544 	{
2545 		const word t = u * X[i];
2546 		word c = 0;
2547 		for (size_t j=0; j<N; j+=2)
2548 		{
2549 			MultiplyWords(p, t, M[j]);
2550 			Acc2WordsBy1(p, X[i+j]);
2551 			Acc2WordsBy1(p, c);
2552 			X[i+j] = LowWord(p);
2553 			c = HighWord(p);
2554 			MultiplyWords(p, t, M[j+1]);
2555 			Acc2WordsBy1(p, X[i+j+1]);
2556 			Acc2WordsBy1(p, c);
2557 			X[i+j+1] = LowWord(p);
2558 			c = HighWord(p);
2559 		}
2560 
2561 		if (Increment(X+N+i, N-i, c))
2562 			while (!Subtract(X+N, X+N, M, N)) {}
2563 	}
2564 
2565 	memcpy(R, X+N, N*WORD_SIZE);
2566 #else
2567 	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
2568 	for (size_t i=0; i<N; i++)
2569 	{
2570 		__m64 t = _mm_cvtsi32_si64(X[i]);
2571 		t = _mm_mul_su32(t, u);
2572 		__m64 c = _mm_setzero_si64();
2573 		for (size_t j=0; j<N; j+=2)
2574 		{
2575 			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
2576 			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
2577 			c = _mm_add_si64(c, p);
2578 			X[i+j] = _mm_cvtsi64_si32(c);
2579 			c = _mm_srli_si64(c, 32);
2580 			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
2581 			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
2582 			c = _mm_add_si64(c, p);
2583 			X[i+j+1] = _mm_cvtsi64_si32(c);
2584 			c = _mm_srli_si64(c, 32);
2585 		}
2586 
2587 		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
2588 			while (!Subtract(X+N, X+N, M, N)) {}
2589 	}
2590 
2591 	memcpy(R, X+N, N*WORD_SIZE);
2592 	_mm_empty();
2593 #endif
2594 }
2595 
2596 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2597 // T[2*N] - temporary work space
2598 // X[2*N] - number to be reduced
2599 // M[N] --- modulus
2600 // U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2601 // V[N] --- 2**(WORD_BITS*3*N/2) mod M
2602 
HalfMontgomeryReduce(word * R,word * T,const word * X,const word * M,const word * U,const word * V,size_t N)2603 void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
2604 {
2605 	CRYPTOPP_ASSERT(N%2==0 && N>=4);
2606 
2607 #define M0		M
2608 #define M1		(M+N2)
2609 #define V0		V
2610 #define V1		(V+N2)
2611 
2612 #define X0		X
2613 #define X1		(X+N2)
2614 #define X2		(X+N)
2615 #define X3		(X+N+N2)
2616 
2617 	const size_t N2 = N/2;
2618 	Multiply(T0, T2, V0, X3, N2);
2619 	int c2 = Add(T0, T0, X0, N);
2620 	MultiplyBottom(T3, T2, T0, U, N2);
2621 	MultiplyTop(T2, R, T0, T3, M0, N2);
2622 	c2 -= Subtract(T2, T1, T2, N2);
2623 	Multiply(T0, R, T3, M1, N2);
2624 	c2 -= Subtract(T0, T2, T0, N2);
2625 	int c3 = -(int)Subtract(T1, X2, T1, N2);
2626 	Multiply(R0, T2, V1, X3, N2);
2627 	c3 += Add(R, R, T, N);
2628 
2629 	if (c2>0)
2630 		c3 += Increment(R1, N2);
2631 	else if (c2<0)
2632 		c3 -= Decrement(R1, N2, -c2);
2633 
2634 	CRYPTOPP_ASSERT(c3>=-1 && c3<=1);
2635 	if (c3>0)
2636 		Subtract(R, R, M, N);
2637 	else if (c3<0)
2638 		Add(R, R, M, N);
2639 
2640 #undef M0
2641 #undef M1
2642 #undef V0
2643 #undef V1
2644 
2645 #undef X0
2646 #undef X1
2647 #undef X2
2648 #undef X3
2649 }
2650 
2651 #undef A0
2652 #undef A1
2653 #undef B0
2654 #undef B1
2655 
2656 #undef T0
2657 #undef T1
2658 #undef T2
2659 #undef T3
2660 
2661 #undef R0
2662 #undef R1
2663 #undef R2
2664 #undef R3
2665 
2666 /*
2667 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2668 static word SubatomicDivide(word *A, word B0, word B1)
2669 {
2670 	// Assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2671 	CRYPTOPP_ASSERT(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2672 
2673 	// estimate the quotient: do a 2 word by 1 word divide
2674 	word Q;
2675 	if (B1+1 == 0)
2676 		Q = A[2];
2677 	else
2678 		Q = DWord(A[1], A[2]).DividedBy(B1+1);
2679 
2680 	// now subtract Q*B from A
2681 	DWord p = DWord::Multiply(B0, Q);
2682 	DWord u = (DWord) A[0] - p.GetLowHalf();
2683 	A[0] = u.GetLowHalf();
2684 	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
2685 	A[1] = u.GetLowHalf();
2686 	A[2] += u.GetHighHalf();
2687 
2688 	// Q <= actual quotient, so fix it
2689 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2690 	{
2691 		u = (DWord) A[0] - B0;
2692 		A[0] = u.GetLowHalf();
2693 		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
2694 		A[1] = u.GetLowHalf();
2695 		A[2] += u.GetHighHalf();
2696 		Q++;
2697 		CRYPTOPP_ASSERT(Q);	// shouldn't overflow
2698 	}
2699 
2700 	return Q;
2701 }
2702 
2703 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2704 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2705 {
2706 	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2707 	{
2708 		Q[0] = A[2];
2709 		Q[1] = A[3];
2710 	}
2711 	else
2712 	{
2713 		word T[4];
2714 		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2715 		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2716 		Q[0] = SubatomicDivide(T, B[0], B[1]);
2717 
2718 #if defined(CRYPTOPP_DEBUG)
2719 		// multiply quotient and divisor and add remainder, make sure it equals dividend
2720 		CRYPTOPP_ASSERT(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2721 		word P[4];
2722 		LowLevel::Multiply2(P, Q, B);
2723 		Add(P, P, T, 4);
2724 		CRYPTOPP_ASSERT(memcmp(P, A, 4*WORD_SIZE)==0);
2725 #endif
2726 	}
2727 }
2728 */
2729 
AtomicDivide(word * Q,const word * A,const word * B)2730 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2731 {
2732 	word T[4];
2733 	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
2734 	Q[0] = q.GetLowHalf();
2735 	Q[1] = q.GetHighHalf();
2736 
2737 #if defined(CRYPTOPP_DEBUG)
2738 	if (B[0] || B[1])
2739 	{
2740 		// multiply quotient and divisor and add remainder, make sure it equals dividend
2741 		CRYPTOPP_ASSERT(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2742 		word P[4];
2743 		s_pMul[0](P, Q, B);
2744 		Add(P, P, T, 4);
2745 		CRYPTOPP_ASSERT(memcmp(P, A, 4*WORD_SIZE)==0);
2746 	}
2747 #endif
2748 }
2749 
2750 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
CorrectQuotientEstimate(word * R,word * T,word * Q,const word * B,size_t N)2751 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
2752 {
2753 	CRYPTOPP_ASSERT(N && N%2==0);
2754 
2755 	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
2756 
2757 	word borrow = Subtract(R, R, T, N+2);
2758 	CRYPTOPP_ASSERT(!borrow && !R[N+1]);
2759 	CRYPTOPP_UNUSED(borrow);
2760 
2761 	while (R[N] || Compare(R, B, N) >= 0)
2762 	{
2763 		R[N] -= Subtract(R, R, B, N);
2764 		Q[1] += (++Q[0]==0);
2765 		CRYPTOPP_ASSERT(Q[0] || Q[1]); // no overflow
2766 	}
2767 }
2768 
2769 // R[NB] -------- remainder = A%B
2770 // Q[NA-NB+2] --- quotient	= A/B
2771 // T[NA+3*(NB+2)] - temp work space
2772 // A[NA] -------- dividend
2773 // B[NB] -------- divisor
2774 
Divide(word * R,word * Q,word * T,const word * A,size_t NA,const word * B,size_t NB)2775 void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
2776 {
2777 	CRYPTOPP_ASSERT(NA && NB && NA%2==0 && NB%2==0);
2778 	CRYPTOPP_ASSERT(B[NB-1] || B[NB-2]);
2779 	CRYPTOPP_ASSERT(NB <= NA);
2780 
2781 	// set up temporary work space
2782 	word *const TA=T;
2783 	word *const TB=T+NA+2;
2784 	word *const TP=T+NA+2+NB;
2785 
2786 	// copy B into TB and normalize it so that TB has highest bit set to 1
2787 	unsigned shiftWords = (B[NB-1]==0);
2788 	TB[0] = TB[NB-1] = 0;
2789 	CopyWords(TB+shiftWords, B, NB-shiftWords);
2790 	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2791 	CRYPTOPP_ASSERT(shiftBits < WORD_BITS);
2792 	ShiftWordsLeftByBits(TB, NB, shiftBits);
2793 
2794 	// copy A into TA and normalize it
2795 	TA[0] = TA[NA] = TA[NA+1] = 0;
2796 	CopyWords(TA+shiftWords, A, NA);
2797 	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2798 
2799 	if (TA[NA+1]==0 && TA[NA] <= 1)
2800 	{
2801 		Q[NA-NB+1] = Q[NA-NB] = 0;
2802 		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2803 		{
2804 			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2805 			++Q[NA-NB];
2806 		}
2807 	}
2808 	else
2809 	{
2810 		NA+=2;
2811 		CRYPTOPP_ASSERT(Compare(TA+NA-NB, TB, NB) < 0);
2812 	}
2813 
2814 	word BT[2];
2815 	BT[0] = TB[NB-2] + 1;
2816 	BT[1] = TB[NB-1] + (BT[0]==0);
2817 
2818 	// start reducing TA mod TB, 2 words at a time
2819 	for (size_t i=NA-2; i>=NB; i-=2)
2820 	{
2821 		AtomicDivide(Q+i-NB, TA+i-2, BT);
2822 		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2823 	}
2824 
2825 	// copy TA into R, and denormalize it
2826 	CopyWords(R, TA+shiftWords, NB);
2827 	ShiftWordsRightByBits(R, NB, shiftBits);
2828 }
2829 
EvenWordCount(const word * X,size_t N)2830 static inline size_t EvenWordCount(const word *X, size_t N)
2831 {
2832 	while (N && X[N-2]==0 && X[N-1]==0)
2833 		N-=2;
2834 	return N;
2835 }
2836 
2837 // return k
2838 // R[N] --- result = A^(-1) * 2^k mod M
2839 // T[4*N] - temporary work space
2840 // A[NA] -- number to take inverse of
2841 // M[N] --- modulus
2842 
AlmostInverse(word * R,word * T,const word * A,size_t NA,const word * M,size_t N)2843 unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
2844 {
2845 	CRYPTOPP_ASSERT(NA<=N && N && N%2==0);
2846 
2847 	word *b = T;
2848 	word *c = T+N;
2849 	word *f = T+2*N;
2850 	word *g = T+3*N;
2851 	size_t bcLen=2, fgLen=EvenWordCount(M, N);
2852 	unsigned int k=0;
2853 	bool s=false;
2854 
2855 	SetWords(T, 0, 3*N);
2856 	b[0]=1;
2857 	CopyWords(f, A, NA);
2858 	CopyWords(g, M, N);
2859 
2860 	while (1)
2861 	{
2862 		word t=f[0];
2863 		while (!t)
2864 		{
2865 			if (EvenWordCount(f, fgLen)==0)
2866 			{
2867 				SetWords(R, 0, N);
2868 				return 0;
2869 			}
2870 
2871 			ShiftWordsRightByWords(f, fgLen, 1);
2872 			bcLen += 2 * (c[bcLen-1] != 0);
2873 			CRYPTOPP_ASSERT(bcLen <= N);
2874 			ShiftWordsLeftByWords(c, bcLen, 1);
2875 			k+=WORD_BITS;
2876 			t=f[0];
2877 		}
2878 
2879 		// t must be non-0; otherwise undefined.
2880 		unsigned int i = TrailingZeros(t);
2881 		t >>= i;
2882 		k += i;
2883 
2884 		if (t==1 && f[1]==0 && EvenWordCount(f+2, fgLen-2)==0)
2885 		{
2886 			if (s)
2887 				Subtract(R, M, b, N);
2888 			else
2889 				CopyWords(R, b, N);
2890 			return k;
2891 		}
2892 
2893 		ShiftWordsRightByBits(f, fgLen, i);
2894 		t = ShiftWordsLeftByBits(c, bcLen, i);
2895 		c[bcLen] += t;
2896 		bcLen += 2 * (t!=0);
2897 		CRYPTOPP_ASSERT(bcLen <= N);
2898 
2899 		bool swap = Compare(f, g, fgLen)==-1;
2900 		ConditionalSwapPointers(swap, f, g);
2901 		ConditionalSwapPointers(swap, b, c);
2902 		s ^= swap;
2903 
2904 		fgLen -= 2 * !(f[fgLen-2] | f[fgLen-1]);
2905 
2906 		Subtract(f, f, g, fgLen);
2907 		t = Add(b, b, c, bcLen);
2908 		b[bcLen] += t;
2909 		bcLen += 2*t;
2910 		CRYPTOPP_ASSERT(bcLen <= N);
2911 	}
2912 }
2913 
2914 // R[N] - result = A/(2^k) mod M
2915 // A[N] - input
2916 // M[N] - modulus
2917 
DivideByPower2Mod(word * R,const word * A,size_t k,const word * M,size_t N)2918 void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2919 {
2920 	CopyWords(R, A, N);
2921 
2922 	while (k--)
2923 	{
2924 		if (R[0]%2==0)
2925 			ShiftWordsRightByBits(R, N, 1);
2926 		else
2927 		{
2928 			word carry = Add(R, R, M, N);
2929 			ShiftWordsRightByBits(R, N, 1);
2930 			R[N-1] += carry<<(WORD_BITS-1);
2931 		}
2932 	}
2933 }
2934 
2935 // R[N] - result = A*(2^k) mod M
2936 // A[N] - input
2937 // M[N] - modulus
2938 
MultiplyByPower2Mod(word * R,const word * A,size_t k,const word * M,size_t N)2939 void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2940 {
2941 	CopyWords(R, A, N);
2942 
2943 	while (k--)
2944 		if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2945 			Subtract(R, R, M, N);
2946 }
2947 
2948 // ******************************************************************
2949 
2950 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2951 
RoundupSize(size_t n)2952 static inline size_t RoundupSize(size_t n)
2953 {
2954 	if (n<=8)
2955 		return RoundupSizeTable[n];
2956 	else if (n<=16)
2957 		return 16;
2958 	else if (n<=32)
2959 		return 32;
2960 	else if (n<=64)
2961 		return 64;
2962 	else
2963 		return size_t(1) << BitPrecision(n-1);
2964 }
2965 
Integer()2966 Integer::Integer()
2967 	: reg(2), sign(POSITIVE)
2968 {
2969 	reg[0] = reg[1] = 0;
2970 }
2971 
Integer(const Integer & t)2972 Integer::Integer(const Integer& t)
2973 	: reg(RoundupSize(t.WordCount())), sign(t.sign)
2974 {
2975 	CopyWords(reg, t.reg, reg.size());
2976 }
2977 
Integer(Sign s,lword value)2978 Integer::Integer(Sign s, lword value)
2979 	: reg(2), sign(s)
2980 {
2981 	reg[0] = word(value);
2982 	reg[1] = word(SafeRightShift<WORD_BITS>(value));
2983 }
2984 
Integer(signed long value)2985 Integer::Integer(signed long value)
2986 	: reg(2)
2987 {
2988 	if (value >= 0)
2989 		sign = POSITIVE;
2990 	else
2991 	{
2992 		sign = NEGATIVE;
2993 		value = -value;
2994 	}
2995 	reg[0] = word(value);
2996 	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
2997 }
2998 
Integer(Sign s,word high,word low)2999 Integer::Integer(Sign s, word high, word low)
3000 	: reg(2), sign(s)
3001 {
3002 	reg[0] = low;
3003 	reg[1] = high;
3004 }
3005 
IsConvertableToLong() const3006 bool Integer::IsConvertableToLong() const
3007 {
3008 	if (ByteCount() > sizeof(long))
3009 		return false;
3010 
3011 	unsigned long value = (unsigned long)reg[0];
3012 	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
3013 
3014 	if (sign==POSITIVE)
3015 		return (signed long)value >= 0;
3016 	else
3017 		return -(signed long)value < 0;
3018 }
3019 
ConvertToLong() const3020 signed long Integer::ConvertToLong() const
3021 {
3022 	CRYPTOPP_ASSERT(IsConvertableToLong());
3023 
3024 	unsigned long value = (unsigned long)reg[0];
3025 	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
3026 	return sign==POSITIVE ? value : -(signed long)value;
3027 }
3028 
Integer(BufferedTransformation & encodedInteger,size_t byteCount,Signedness s,ByteOrder o)3029 Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s, ByteOrder o)
3030 {
3031 	CRYPTOPP_ASSERT(o == BIG_ENDIAN_ORDER || o == LITTLE_ENDIAN_ORDER);
3032 
3033 	if (o != LITTLE_ENDIAN_ORDER)
3034 	{
3035 		Decode(encodedInteger, byteCount, s);
3036 	}
3037 	else
3038 	{
3039 		SecByteBlock block(byteCount);
3040 		encodedInteger.Get(block, block.size());
3041 		std::reverse(block.begin(), block.begin()+block.size());
3042 
3043 		Decode(block.begin(), block.size(), s);
3044 	}
3045 }
3046 
Integer(const byte * encodedInteger,size_t byteCount,Signedness s,ByteOrder o)3047 Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s, ByteOrder o)
3048 {
3049 	CRYPTOPP_ASSERT(encodedInteger && byteCount); // NULL buffer
3050 	CRYPTOPP_ASSERT(o == BIG_ENDIAN_ORDER || o == LITTLE_ENDIAN_ORDER);
3051 
3052 	if (o != LITTLE_ENDIAN_ORDER)
3053 	{
3054 		Decode(encodedInteger, byteCount, s);
3055 	}
3056 	else
3057 	{
3058 		SecByteBlock block(byteCount);
3059 #if (_MSC_VER >= 1500)
3060 		std::reverse_copy(encodedInteger, encodedInteger+byteCount,
3061 			stdext::make_checked_array_iterator(block.begin(), block.size()));
3062 #else
3063 		std::reverse_copy(encodedInteger, encodedInteger+byteCount, block.begin());
3064 #endif
3065 		Decode(block.begin(), block.size(), s);
3066 		return;
3067 	}
3068 }
3069 
Integer(BufferedTransformation & bt)3070 Integer::Integer(BufferedTransformation &bt)
3071 {
3072 	// Make explicit call to avoid virtual-dispatch findings in ctor
3073 	Integer::BERDecode(bt);
3074 }
3075 
Integer(RandomNumberGenerator & rng,size_t bitcount)3076 Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
3077 {
3078 	Randomize(rng, bitcount);
3079 }
3080 
Integer(RandomNumberGenerator & rng,const Integer & min,const Integer & max,RandomNumberType rnType,const Integer & equiv,const Integer & mod)3081 Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3082 {
3083 	if (!Randomize(rng, min, max, rnType, equiv, mod))
3084 		throw Integer::RandomNumberNotFound();
3085 }
3086 
Power2(size_t e)3087 Integer Integer::Power2(size_t e)
3088 {
3089 	Integer r((word)0, BitsToWords(e+1));
3090 	r.SetBit(e);
3091 	return r;
3092 }
3093 
operator !() const3094 bool Integer::operator!() const
3095 {
3096 	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
3097 }
3098 
operator =(const Integer & t)3099 Integer& Integer::operator=(const Integer& t)
3100 {
3101 	if (this != &t)
3102 	{
3103 		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
3104 			reg.New(RoundupSize(t.WordCount()));
3105 		CopyWords(reg, t.reg, reg.size());
3106 		sign = t.sign;
3107 	}
3108 	return *this;
3109 }
3110 
GetBit(size_t n) const3111 bool Integer::GetBit(size_t n) const
3112 {
3113 	// Profiling guided the flow below.
3114 	if (n/WORD_BITS < reg.size())
3115 		return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3116 	else
3117 		return 0;
3118 }
3119 
SetBit(size_t n,bool value)3120 void Integer::SetBit(size_t n, bool value)
3121 {
3122 	if (value)
3123 	{
3124 		reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
3125 		reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
3126 	}
3127 	else
3128 	{
3129 		if (n/WORD_BITS < reg.size())
3130 			reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
3131 	}
3132 }
3133 
GetByte(size_t n) const3134 byte Integer::GetByte(size_t n) const
3135 {
3136 	// Profiling guided the flow below.
3137 	if (n/WORD_SIZE < reg.size())
3138 		return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3139 	else
3140 		return 0;
3141 }
3142 
SetByte(size_t n,byte value)3143 void Integer::SetByte(size_t n, byte value)
3144 {
3145 	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
3146 	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
3147 	reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
3148 }
3149 
GetBits(size_t i,size_t n) const3150 lword Integer::GetBits(size_t i, size_t n) const
3151 {
3152 	lword v = 0;
3153 	CRYPTOPP_ASSERT(n <= sizeof(v)*8);
3154 	for (unsigned int j=0; j<n; j++)
3155 		v |= lword(GetBit(i+j)) << j;
3156 	return v;
3157 }
3158 
operator -() const3159 Integer Integer::operator-() const
3160 {
3161 	Integer result(*this);
3162 	result.Negate();
3163 	return result;
3164 }
3165 
AbsoluteValue() const3166 Integer Integer::AbsoluteValue() const
3167 {
3168 	Integer result(*this);
3169 	result.sign = POSITIVE;
3170 	return result;
3171 }
3172 
swap(Integer & a)3173 void Integer::swap(Integer &a)
3174 {
3175 	reg.swap(a.reg);
3176 	std::swap(sign, a.sign);
3177 }
3178 
Integer(word value,size_t length)3179 Integer::Integer(word value, size_t length)
3180 	: reg(RoundupSize(length)), sign(POSITIVE)
3181 {
3182 	reg[0] = value;
3183 	SetWords(reg+1, 0, reg.size()-1);
3184 }
3185 
3186 template <class T>
StringToInteger(const T * str,ByteOrder order)3187 static Integer StringToInteger(const T *str, ByteOrder order)
3188 {
3189 	CRYPTOPP_ASSERT( order == BIG_ENDIAN_ORDER || order == LITTLE_ENDIAN_ORDER );
3190 
3191 	int radix, sign = 1;
3192 	// GCC workaround
3193 	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
3194 	unsigned int length;
3195 	for (length = 0; str[length] != 0; length++) {}
3196 
3197 	Integer v;
3198 
3199 	if (length == 0)
3200 		return Integer::Zero();
3201 
3202 	switch (str[length-1])
3203 	{
3204 	case 'h':
3205 	case 'H':
3206 		radix=16;
3207 		break;
3208 	case 'o':
3209 	case 'O':
3210 		radix=8;
3211 		break;
3212 	case 'b':
3213 	case 'B':
3214 		radix=2;
3215 		break;
3216 	default:
3217 		radix=10;
3218 	}
3219 
3220 	// 'str' is of length 1 or more
3221 	if (str[0] == '-')
3222 	{
3223 		sign = -1;
3224 		str += 1, length -= 1;
3225 	}
3226 
3227 	if (length > 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X'))
3228 	{
3229 		radix = 16;
3230 		str += 2, length -= 2;
3231 	}
3232 
3233 	if (order == BIG_ENDIAN_ORDER)
3234 	{
3235 		for (unsigned int i=0; i<length; i++)
3236 		{
3237 			int digit, ch = static_cast<int>(str[i]);
3238 
3239 			// Profiling guided the flow below.
3240 			if (ch >= '0' && ch <= '9')
3241 				digit = ch - '0';
3242 			else if (ch >= 'a' && ch <= 'f')
3243 				digit = ch - 'a' + 10;
3244 			else if (ch >= 'A' && ch <= 'F')
3245 				digit = ch - 'A' + 10;
3246 			else
3247 				digit = radix;
3248 
3249 			if (digit < radix)
3250 			{
3251 				v *= radix;
3252 				v += digit;
3253 			}
3254 		}
3255 	}
3256 	else if (radix == 16 && order == LITTLE_ENDIAN_ORDER)
3257 	{
3258 		// Nibble high, low and count
3259 		unsigned int nh = 0, nl = 0, nc = 0;
3260 		Integer position(Integer::One());
3261 
3262 		for (unsigned int i=0; i<length; i++)
3263 		{
3264 			int digit, ch = static_cast<int>(str[i]);
3265 
3266 			if (ch >= '0' && ch <= '9')
3267 				digit = ch - '0';
3268 			else if (ch >= 'a' && ch <= 'f')
3269 				digit = ch - 'a' + 10;
3270 			else if (ch >= 'A' && ch <= 'F')
3271 				digit = ch - 'A' + 10;
3272 			else
3273 				digit = radix;
3274 
3275 			if (digit < radix)
3276 			{
3277 				if (nc++ == 0)
3278 					nh = digit;
3279 				else
3280 					nl = digit;
3281 
3282 				if (nc == 2)
3283 				{
3284 					v += position * (nh << 4 | nl);
3285 					nc = 0, position <<= 8;
3286 				}
3287 			}
3288 		}
3289 
3290 		if (nc == 1)
3291 			v += nh * position;
3292 	}
3293 	else // LITTLE_ENDIAN_ORDER && radix != 16
3294 	{
3295 		for (int i=static_cast<int>(length)-1; i>=0; i--)
3296 		{
3297 			int digit, ch = static_cast<int>(str[i]);
3298 
3299 			if (ch >= '0' && ch <= '9')
3300 				digit = ch - '0';
3301 			else if (ch >= 'a' && ch <= 'f')
3302 				digit = ch - 'a' + 10;
3303 			else if (ch >= 'A' && ch <= 'F')
3304 				digit = ch - 'A' + 10;
3305 			else
3306 				digit = radix;
3307 
3308 			if (digit < radix)
3309 			{
3310 				v *= radix;
3311 				v += digit;
3312 			}
3313 		}
3314 	}
3315 
3316 	if (sign == -1)
3317 		v.Negate();
3318 
3319 	return v;
3320 }
3321 
Integer(const char * str,ByteOrder order)3322 Integer::Integer(const char *str, ByteOrder order)
3323 	: reg(2), sign(POSITIVE)
3324 {
3325 	*this = StringToInteger(str,order);
3326 }
3327 
Integer(const wchar_t * str,ByteOrder order)3328 Integer::Integer(const wchar_t *str, ByteOrder order)
3329 	: reg(2), sign(POSITIVE)
3330 {
3331 	*this = StringToInteger(str,order);
3332 }
3333 
WordCount() const3334 unsigned int Integer::WordCount() const
3335 {
3336 	return (unsigned int)CountWords(reg, reg.size());
3337 }
3338 
ByteCount() const3339 unsigned int Integer::ByteCount() const
3340 {
3341 	unsigned wordCount = WordCount();
3342 	if (wordCount)
3343 		return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
3344 	else
3345 		return 0;
3346 }
3347 
BitCount() const3348 unsigned int Integer::BitCount() const
3349 {
3350 	unsigned wordCount = WordCount();
3351 	if (wordCount)
3352 		return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
3353 	else
3354 		return 0;
3355 }
3356 
Decode(const byte * input,size_t inputLen,Signedness s)3357 void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
3358 {
3359 	CRYPTOPP_ASSERT(input && inputLen); // NULL buffer
3360 	StringStore store(input, inputLen);
3361 	Decode(store, inputLen, s);
3362 }
3363 
Decode(BufferedTransformation & bt,size_t inputLen,Signedness s)3364 void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
3365 {
3366 	CRYPTOPP_ASSERT(bt.MaxRetrievable() >= inputLen);
3367 	if (bt.MaxRetrievable() < inputLen)
3368 		throw InvalidArgument("Integer: input length is too small");
3369 
3370 	byte b;
3371 	bt.Peek(b);
3372 	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
3373 
3374 	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
3375 	{
3376 		bt.Skip(1);
3377 		inputLen--;
3378 		bt.Peek(b);
3379 	}
3380 
3381 	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
3382 	for (size_t i=inputLen; i > 0; i--)
3383 	{
3384 		(void)bt.Get(b);
3385 		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
3386 	}
3387 
3388 	if (sign == NEGATIVE)
3389 	{
3390 		for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
3391 			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
3392 		TwosComplement(reg, reg.size());
3393 	}
3394 }
3395 
MinEncodedSize(Signedness signedness) const3396 size_t Integer::MinEncodedSize(Signedness signedness) const
3397 {
3398 	// Profiling guided the flow below.
3399 	unsigned int outputLen = STDMAX(1U, ByteCount());
3400 	const bool pre = (signedness == UNSIGNED);
3401 	if (!pre && NotNegative() && (GetByte(outputLen-1) & 0x80))
3402 		outputLen++;
3403 	if (pre)
3404 		return outputLen;
3405 	if (IsNegative() && *this < -Power2(outputLen*8-1))
3406 		outputLen++;
3407 	return outputLen;
3408 }
3409 
3410 // PKCS12_PBKDF and other classes use undersized buffers
Encode(byte * output,size_t outputLen,Signedness signedness) const3411 void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
3412 {
3413 	CRYPTOPP_ASSERT(output && outputLen);            // NULL buffer
3414 	ArraySink sink(output, outputLen);
3415 	Encode(sink, outputLen, signedness);
3416 }
3417 
Encode(BufferedTransformation & bt,size_t outputLen,Signedness signedness) const3418 void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
3419 {
3420 	if (signedness == UNSIGNED || NotNegative())
3421 	{
3422 		for (size_t i=outputLen; i > 0; i--)
3423 			bt.Put(GetByte(i-1));
3424 	}
3425 	else
3426 	{
3427 		// take two's complement of *this
3428 		Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
3429 		temp.Encode(bt, outputLen, UNSIGNED);
3430 	}
3431 }
3432 
DEREncode(BufferedTransformation & bt) const3433 void Integer::DEREncode(BufferedTransformation &bt) const
3434 {
3435 	DERGeneralEncoder enc(bt, INTEGER);
3436 	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
3437 	enc.MessageEnd();
3438 }
3439 
BERDecode(const byte * input,size_t len)3440 void Integer::BERDecode(const byte *input, size_t len)
3441 {
3442 	CRYPTOPP_ASSERT(input && len); // NULL buffer
3443 	StringStore store(input, len);
3444 	BERDecode(store);
3445 }
3446 
BERDecode(BufferedTransformation & bt)3447 void Integer::BERDecode(BufferedTransformation &bt)
3448 {
3449 	BERGeneralDecoder dec(bt, INTEGER);
3450 	if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
3451 		BERDecodeError();
3452 	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
3453 	dec.MessageEnd();
3454 }
3455 
DEREncodeAsOctetString(BufferedTransformation & bt,size_t length) const3456 void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
3457 {
3458 	DERGeneralEncoder enc(bt, OCTET_STRING);
3459 	Encode(enc, length);
3460 	enc.MessageEnd();
3461 }
3462 
BERDecodeAsOctetString(BufferedTransformation & bt,size_t length)3463 void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
3464 {
3465 	BERGeneralDecoder dec(bt, OCTET_STRING);
3466 	if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
3467 		BERDecodeError();
3468 	Decode(dec, length);
3469 	dec.MessageEnd();
3470 }
3471 
OpenPGPEncode(byte * output,size_t bufferSize) const3472 size_t Integer::OpenPGPEncode(byte *output, size_t bufferSize) const
3473 {
3474 	CRYPTOPP_ASSERT(output && bufferSize);         // NULL buffer
3475 	CRYPTOPP_ASSERT(bufferSize >= 2+ByteCount());  // Undersized buffer
3476 	ArraySink sink(output, bufferSize);
3477 	return OpenPGPEncode(sink);
3478 }
3479 
OpenPGPEncode(BufferedTransformation & bt) const3480 size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
3481 {
3482 	word16 bitCount = word16(BitCount());
3483 	bt.PutWord16(bitCount);
3484 	size_t byteCount = BitsToBytes(bitCount);
3485 	Encode(bt, byteCount);
3486 	return 2 + byteCount;
3487 }
3488 
OpenPGPDecode(const byte * input,size_t len)3489 void Integer::OpenPGPDecode(const byte *input, size_t len)
3490 {
3491 	CRYPTOPP_ASSERT(input && len);  // NULL buffer
3492 	StringStore store(input, len);
3493 	OpenPGPDecode(store);
3494 }
3495 
OpenPGPDecode(BufferedTransformation & bt)3496 void Integer::OpenPGPDecode(BufferedTransformation &bt)
3497 {
3498 	word16 bitCount;
3499 	if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
3500 		throw OpenPGPDecodeErr();
3501 	Decode(bt, BitsToBytes(bitCount));
3502 }
3503 
Randomize(RandomNumberGenerator & rng,size_t nbits)3504 void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
3505 {
3506 	const size_t nbytes = nbits/8 + 1;
3507 	SecByteBlock buf(nbytes);
3508 	rng.GenerateBlock(buf, nbytes);
3509 	if (nbytes)
3510 		buf[0] = (byte)Crop(buf[0], nbits % 8);
3511 	Decode(buf, nbytes, UNSIGNED);
3512 }
3513 
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max)3514 void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
3515 {
3516 	if (min > max)
3517 		throw InvalidArgument("Integer: Min must be no greater than Max");
3518 
3519 	Integer range = max - min;
3520 	const unsigned int nbits = range.BitCount();
3521 
3522 	do
3523 	{
3524 		Randomize(rng, nbits);
3525 	}
3526 	while (*this > range);
3527 
3528 	*this += min;
3529 }
3530 
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max,RandomNumberType rnType,const Integer & equiv,const Integer & mod)3531 bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3532 {
3533 	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)
3534 		("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3535 }
3536 
3537 class KDF2_RNG : public RandomNumberGenerator
3538 {
3539 public:
KDF2_RNG(const byte * seed,size_t seedSize)3540 	KDF2_RNG(const byte *seed, size_t seedSize)
3541 		: m_counter(0), m_counterAndSeed(ClampSize(seedSize) + 4)
3542 	{
3543 		memcpy(m_counterAndSeed + 4, seed, ClampSize(seedSize));
3544 	}
3545 
GenerateBlock(byte * output,size_t size)3546 	void GenerateBlock(byte *output, size_t size)
3547 	{
3548 		CRYPTOPP_ASSERT(output && size); // NULL buffer
3549 		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
3550 		++m_counter;
3551 		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULLPTR, 0);
3552 	}
3553 
3554 	// UBsan finding, -Wstringop-overflow
ClampSize(size_t req) const3555 	inline size_t ClampSize(size_t req) const
3556 	{
3557 		// Clamp at 16 MB
3558 		if (req > 16U*1024*1024)
3559 			return 16U*1024*1024;
3560 		return req;
3561 	}
3562 
3563 private:
3564 	word32 m_counter;
3565 	SecByteBlock m_counterAndSeed;
3566 };
3567 
GenerateRandomNoThrow(RandomNumberGenerator & i_rng,const NameValuePairs & params)3568 bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3569 {
3570 	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3571 	Integer max;
3572 	if (!params.GetValue("Max", max))
3573 	{
3574 		int bitLength;
3575 		if (params.GetIntValue("BitLength", bitLength))
3576 			max = Integer::Power2(bitLength);
3577 		else
3578 			throw InvalidArgument("Integer: missing Max argument");
3579 	}
3580 	if (min > max)
3581 		throw InvalidArgument("Integer: Min must be no greater than Max");
3582 
3583 	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3584 	Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3585 
3586 	if (equiv.IsNegative() || equiv >= mod)
3587 		throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3588 
3589 	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3590 
3591 	member_ptr<KDF2_RNG> kdf2Rng;
3592 	ConstByteArrayParameter seed;
3593 	if (params.GetValue(Name::Seed(), seed))
3594 	{
3595 		ByteQueue bq;
3596 		DERSequenceEncoder seq(bq);
3597 		min.DEREncode(seq);
3598 		max.DEREncode(seq);
3599 		equiv.DEREncode(seq);
3600 		mod.DEREncode(seq);
3601 		DEREncodeUnsigned(seq, rnType);
3602 		DEREncodeOctetString(seq, seed.begin(), seed.size());
3603 		seq.MessageEnd();
3604 
3605 		SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
3606 		bq.Get(finalSeed, finalSeed.size());
3607 		kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3608 	}
3609 	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3610 
3611 	switch (rnType)
3612 	{
3613 		case ANY:
3614 			if (mod == One())
3615 				Randomize(rng, min, max);
3616 			else
3617 			{
3618 				Integer min1 = min + (equiv-min)%mod;
3619 				if (max < min1)
3620 					return false;
3621 				Randomize(rng, Zero(), (max - min1) / mod);
3622 				*this *= mod;
3623 				*this += min1;
3624 			}
3625 			return true;
3626 
3627 		case PRIME:
3628 		{
3629 			const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULLPTR);
3630 
3631 			int i;
3632 			i = 0;
3633 			while (1)
3634 			{
3635 				if (++i==16)
3636 				{
3637 					// check if there are any suitable primes in [min, max]
3638 					Integer first = min;
3639 					if (FirstPrime(first, max, equiv, mod, pSelector))
3640 					{
3641 						// if there is only one suitable prime, we're done
3642 						*this = first;
3643 						if (!FirstPrime(first, max, equiv, mod, pSelector))
3644 							return true;
3645 					}
3646 					else
3647 						return false;
3648 				}
3649 
3650 				Randomize(rng, min, max);
3651 				if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3652 					return true;
3653 			}
3654 		}
3655 
3656 		default:
3657 			throw InvalidArgument("Integer: invalid RandomNumberType argument");
3658 	}
3659 }
3660 
operator >>(std::istream & in,Integer & a)3661 std::istream& operator>>(std::istream& in, Integer &a)
3662 {
3663 	char c;
3664 	unsigned int length = 0;
3665 	SecBlock<char> str(length + 16);
3666 
3667 	std::ws(in);
3668 
3669 	do
3670 	{
3671 		in.read(&c, 1);
3672 		str[length++] = c;
3673 		if (length >= str.size())
3674 			str.Grow(length + 16);
3675 	}
3676 	while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3677 
3678 	if (in.gcount())
3679 		in.putback(c);
3680 	str[length-1] = '\0';
3681 	a = Integer(str);
3682 
3683 	return in;
3684 }
3685 
3686 // Ensure base 10 is default
FlagToBase(long f)3687 inline int FlagToBase(long f) {
3688 	return f == std::ios::hex ? 16 : (f == std::ios::oct ? 8 : 10);
3689 }
3690 
FlagToSuffix(long f)3691 inline char FlagToSuffix(long f) {
3692 	return f == std::ios::hex ? 'h' : (f == std::ios::oct ? 'o' : '.');
3693 }
3694 
3695 // Ensure base 10 is default
operator <<(std::ostream & out,const Integer & a)3696 std::ostream& operator<<(std::ostream& out, const Integer &a)
3697 {
3698 	// Get relevant conversion specifications from ostream.
3699 	const long f = out.flags() & std::ios::basefield;
3700 	const int base = FlagToBase(f);
3701 	const char suffix = FlagToSuffix(f);
3702 
3703 	Integer temp1=a, temp2;
3704 	if (a.IsNegative())
3705 	{
3706 		out << '-';
3707 		temp1.Negate();
3708 	}
3709 
3710 	if (!a)
3711 		out << '0';
3712 
3713 	static const char upper[]="0123456789ABCDEF";
3714 	static const char lower[]="0123456789abcdef";
3715 
3716 	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
3717 	unsigned int i=0;
3718 	SecBlock<char> s(a.BitCount() / (SaturatingSubtract1(BitPrecision(base),1U)) + 1);
3719 
3720 	while (!!temp1)
3721 	{
3722 		word digit;
3723 		Integer::Divide(digit, temp2, temp1, base);
3724 		s[i++]=vec[digit];
3725 		temp1.swap(temp2);
3726 	}
3727 
3728 	while (i--)
3729 	{
3730 		out << s[i];
3731 	}
3732 
3733 #ifdef CRYPTOPP_USE_STD_SHOWBASE
3734 	if (out.flags() & std::ios_base::showbase)
3735 		out << suffix;
3736 
3737 	return out;
3738 #else
3739  	return out << suffix;
3740 #endif
3741 }
3742 
operator ++()3743 Integer& Integer::operator++()
3744 {
3745 	if (NotNegative())
3746 	{
3747 		if (Increment(reg, reg.size()))
3748 		{
3749 			reg.CleanGrow(2*reg.size());
3750 			reg[reg.size()/2]=1;
3751 		}
3752 	}
3753 	else
3754 	{
3755 		word borrow = Decrement(reg, reg.size());
3756 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3757 
3758 		if (WordCount()==0)
3759 			*this = Zero();
3760 	}
3761 	return *this;
3762 }
3763 
operator --()3764 Integer& Integer::operator--()
3765 {
3766 	if (IsNegative())
3767 	{
3768 		if (Increment(reg, reg.size()))
3769 		{
3770 			reg.CleanGrow(2*reg.size());
3771 			reg[reg.size()/2]=1;
3772 		}
3773 	}
3774 	else
3775 	{
3776 		if (Decrement(reg, reg.size()))
3777 			*this = -One();
3778 	}
3779 	return *this;
3780 }
3781 
3782 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3783 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
And(const Integer & t) const3784 Integer Integer::And(const Integer& t) const
3785 {
3786 	if (this == &t)
3787 	{
3788 		return AbsoluteValue();
3789 	}
3790 	else if (reg.size() >= t.reg.size())
3791 	{
3792 		Integer result(t);
3793 		AndWords(result.reg, reg, t.reg.size());
3794 
3795 		result.sign = POSITIVE;
3796 		return result;
3797 	}
3798 	else // reg.size() < t.reg.size()
3799 	{
3800 		Integer result(*this);
3801 		AndWords(result.reg, t.reg, reg.size());
3802 
3803 		result.sign = POSITIVE;
3804 		return result;
3805 	}
3806 }
3807 
3808 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3809 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
Or(const Integer & t) const3810 Integer Integer::Or(const Integer& t) const
3811 {
3812 	if (this == &t)
3813 	{
3814 		return AbsoluteValue();
3815 	}
3816 	else if (reg.size() >= t.reg.size())
3817 	{
3818 		Integer result(*this);
3819 		OrWords(result.reg, t.reg, t.reg.size());
3820 
3821 		result.sign = POSITIVE;
3822 		return result;
3823 	}
3824 	else // reg.size() < t.reg.size()
3825 	{
3826 		Integer result(t);
3827 		OrWords(result.reg, reg, reg.size());
3828 
3829 		result.sign = POSITIVE;
3830 		return result;
3831 	}
3832 }
3833 
3834 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3835 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
Xor(const Integer & t) const3836 Integer Integer::Xor(const Integer& t) const
3837 {
3838 	if (this == &t)
3839 	{
3840 		return Integer::Zero();
3841 	}
3842 	else if (reg.size() >= t.reg.size())
3843 	{
3844 		Integer result(*this);
3845 		XorWords(result.reg, t.reg, t.reg.size());
3846 
3847 		result.sign = POSITIVE;
3848 		return result;
3849 	}
3850 	else // reg.size() < t.reg.size()
3851 	{
3852 		Integer result(t);
3853 		XorWords(result.reg, reg, reg.size());
3854 
3855 		result.sign = POSITIVE;
3856 		return result;
3857 	}
3858 }
3859 
PositiveAdd(Integer & sum,const Integer & a,const Integer & b)3860 void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3861 {
3862 	// Profiling guided the flow below.
3863 	int carry; const bool pre = (a.reg.size() == b.reg.size());
3864 	if (!pre && a.reg.size() > b.reg.size())
3865 	{
3866 		carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3867 		CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3868 		carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3869 	}
3870 	else if (pre)
3871 	{
3872 		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3873 	}
3874 	else
3875 	{
3876 		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3877 		CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3878 		carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3879 	}
3880 
3881 	if (carry)
3882 	{
3883 		sum.reg.CleanGrow(2*sum.reg.size());
3884 		sum.reg[sum.reg.size()/2] = 1;
3885 	}
3886 	sum.sign = Integer::POSITIVE;
3887 }
3888 
PositiveSubtract(Integer & diff,const Integer & a,const Integer & b)3889 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3890 {
3891 	unsigned aSize = a.WordCount();
3892 	aSize += aSize%2;
3893 	unsigned bSize = b.WordCount();
3894 	bSize += bSize%2;
3895 
3896 	// Profiling guided the flow below.
3897 	if (aSize > bSize)
3898 	{
3899 		word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3900 		CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3901 		borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3902 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3903 		diff.sign = Integer::POSITIVE;
3904 	}
3905 	else if (aSize == bSize)
3906 	{
3907 		if (Compare(a.reg, b.reg, aSize) >= 0)
3908 		{
3909 			Subtract(diff.reg, a.reg, b.reg, aSize);
3910 			diff.sign = Integer::POSITIVE;
3911 		}
3912 		else
3913 		{
3914 			Subtract(diff.reg, b.reg, a.reg, aSize);
3915 			diff.sign = Integer::NEGATIVE;
3916 		}
3917 	}
3918 	else
3919 	{
3920 		word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3921 		CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3922 		borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3923 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3924 		diff.sign = Integer::NEGATIVE;
3925 	}
3926 }
3927 
3928 // MSVC .NET 2003 workaround
STDMAX2(const T & a,const T & b)3929 template <class T> inline const T& STDMAX2(const T& a, const T& b)
3930 {
3931 	return a < b ? b : a;
3932 }
3933 
Plus(const Integer & b) const3934 Integer Integer::Plus(const Integer& b) const
3935 {
3936 	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
3937 	if (NotNegative())
3938 	{
3939 		if (b.NotNegative())
3940 			PositiveAdd(sum, *this, b);
3941 		else
3942 			PositiveSubtract(sum, *this, b);
3943 	}
3944 	else
3945 	{
3946 		if (b.NotNegative())
3947 			PositiveSubtract(sum, b, *this);
3948 		else
3949 		{
3950 			PositiveAdd(sum, *this, b);
3951 			sum.sign = Integer::NEGATIVE;
3952 		}
3953 	}
3954 	return sum;
3955 }
3956 
operator +=(const Integer & t)3957 Integer& Integer::operator+=(const Integer& t)
3958 {
3959 	reg.CleanGrow(t.reg.size());
3960 	if (NotNegative())
3961 	{
3962 		if (t.NotNegative())
3963 			PositiveAdd(*this, *this, t);
3964 		else
3965 			PositiveSubtract(*this, *this, t);
3966 	}
3967 	else
3968 	{
3969 		if (t.NotNegative())
3970 			PositiveSubtract(*this, t, *this);
3971 		else
3972 		{
3973 			PositiveAdd(*this, *this, t);
3974 			sign = Integer::NEGATIVE;
3975 		}
3976 	}
3977 	return *this;
3978 }
3979 
Minus(const Integer & b) const3980 Integer Integer::Minus(const Integer& b) const
3981 {
3982 	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
3983 	if (NotNegative())
3984 	{
3985 		if (b.NotNegative())
3986 			PositiveSubtract(diff, *this, b);
3987 		else
3988 			PositiveAdd(diff, *this, b);
3989 	}
3990 	else
3991 	{
3992 		if (b.NotNegative())
3993 		{
3994 			PositiveAdd(diff, *this, b);
3995 			diff.sign = Integer::NEGATIVE;
3996 		}
3997 		else
3998 			PositiveSubtract(diff, b, *this);
3999 	}
4000 	return diff;
4001 }
4002 
operator -=(const Integer & t)4003 Integer& Integer::operator-=(const Integer& t)
4004 {
4005 	reg.CleanGrow(t.reg.size());
4006 	if (NotNegative())
4007 	{
4008 		if (t.NotNegative())
4009 			PositiveSubtract(*this, *this, t);
4010 		else
4011 			PositiveAdd(*this, *this, t);
4012 	}
4013 	else
4014 	{
4015 		if (t.NotNegative())
4016 		{
4017 			PositiveAdd(*this, *this, t);
4018 			sign = Integer::NEGATIVE;
4019 		}
4020 		else
4021 			PositiveSubtract(*this, t, *this);
4022 	}
4023 	return *this;
4024 }
4025 
operator <<=(size_t n)4026 Integer& Integer::operator<<=(size_t n)
4027 {
4028 	const size_t wordCount = WordCount();
4029 	const size_t shiftWords = n / WORD_BITS;
4030 	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
4031 
4032 	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
4033 	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
4034 	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
4035 	return *this;
4036 }
4037 
operator >>=(size_t n)4038 Integer& Integer::operator>>=(size_t n)
4039 {
4040 	const size_t wordCount = WordCount();
4041 	const size_t shiftWords = n / WORD_BITS;
4042 	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
4043 
4044 	ShiftWordsRightByWords(reg, wordCount, shiftWords);
4045 	if (wordCount > shiftWords)
4046 		ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
4047 	if (IsNegative() && WordCount()==0)   // avoid -0
4048 		*this = Zero();
4049 	return *this;
4050 }
4051 
operator &=(const Integer & t)4052 Integer& Integer::operator&=(const Integer& t)
4053 {
4054 	if (this != &t)
4055 	{
4056 		const size_t size = STDMIN(reg.size(), t.reg.size());
4057 		reg.resize(size);
4058 		AndWords(reg, t.reg, size);
4059 	}
4060 	sign = POSITIVE;
4061 	return *this;
4062 }
4063 
operator |=(const Integer & t)4064 Integer& Integer::operator|=(const Integer& t)
4065 {
4066 	if (this != &t)
4067 	{
4068 		if (reg.size() >= t.reg.size())
4069 		{
4070 			OrWords(reg, t.reg, t.reg.size());
4071 		}
4072 		else  // reg.size() < t.reg.size()
4073 		{
4074 			const size_t head = reg.size();
4075 			const size_t tail = t.reg.size() - reg.size();
4076 			reg.resize(head+tail);
4077 			OrWords(reg, t.reg, head);
4078 			CopyWords(reg+head,t.reg+head,tail);
4079 		}
4080 	}
4081 	sign = POSITIVE;
4082 	return *this;
4083 }
4084 
operator ^=(const Integer & t)4085 Integer& Integer::operator^=(const Integer& t)
4086 {
4087 	if (this == &t)
4088 	{
4089 		*this = Zero();
4090 	}
4091 	else
4092 	{
4093 		if (reg.size() >= t.reg.size())
4094 		{
4095 			XorWords(reg, t.reg, t.reg.size());
4096 		}
4097 		else  // reg.size() < t.reg.size()
4098 		{
4099 			const size_t head = reg.size();
4100 			const size_t tail = t.reg.size() - reg.size();
4101 			reg.resize(head+tail);
4102 			XorWords(reg, t.reg, head);
4103 			CopyWords(reg+head,t.reg+head,tail);
4104 		}
4105 	}
4106 	sign = POSITIVE;
4107 	return *this;
4108 }
4109 
PositiveMultiply(Integer & product,const Integer & a,const Integer & b)4110 void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
4111 {
4112 	size_t aSize = RoundupSize(a.WordCount());
4113 	size_t bSize = RoundupSize(b.WordCount());
4114 
4115 	product.reg.CleanNew(RoundupSize(aSize+bSize));
4116 	product.sign = Integer::POSITIVE;
4117 
4118 	IntegerSecBlock workspace(aSize + bSize);
4119 	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
4120 }
4121 
Multiply(Integer & product,const Integer & a,const Integer & b)4122 void Multiply(Integer &product, const Integer &a, const Integer &b)
4123 {
4124 	PositiveMultiply(product, a, b);
4125 
4126 	if (a.NotNegative() != b.NotNegative())
4127 		product.Negate();
4128 }
4129 
Times(const Integer & b) const4130 Integer Integer::Times(const Integer &b) const
4131 {
4132 	Integer product;
4133 	Multiply(product, *this, b);
4134 	return product;
4135 }
4136 
4137 /*
4138 void PositiveDivide(Integer &remainder, Integer &quotient,
4139 				   const Integer &dividend, const Integer &divisor)
4140 {
4141 	remainder.reg.CleanNew(divisor.reg.size());
4142 	remainder.sign = Integer::POSITIVE;
4143 	quotient.reg.New(0);
4144 	quotient.sign = Integer::POSITIVE;
4145 	unsigned i=dividend.BitCount();
4146 	while (i--)
4147 	{
4148 		word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
4149 		remainder.reg[0] |= dividend[i];
4150 		if (overflow || remainder >= divisor)
4151 		{
4152 			Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
4153 			quotient.SetBit(i);
4154 		}
4155 	}
4156 }
4157 */
4158 
PositiveDivide(Integer & remainder,Integer & quotient,const Integer & a,const Integer & b)4159 void PositiveDivide(Integer &remainder, Integer &quotient,
4160 				   const Integer &a, const Integer &b)
4161 {
4162 	unsigned aSize = a.WordCount();
4163 	unsigned bSize = b.WordCount();
4164 
4165 	if (!bSize)
4166 		throw Integer::DivideByZero();
4167 
4168 	if (aSize < bSize)
4169 	{
4170 		remainder = a;
4171 		remainder.sign = Integer::POSITIVE;
4172 		quotient = Integer::Zero();
4173 		return;
4174 	}
4175 
4176 	aSize += aSize%2;	// round up to next even number
4177 	bSize += bSize%2;
4178 
4179 	remainder.reg.CleanNew(RoundupSize(bSize));
4180 	remainder.sign = Integer::POSITIVE;
4181 	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
4182 	quotient.sign = Integer::POSITIVE;
4183 
4184 	IntegerSecBlock T(aSize+3*(bSize+2));
4185 	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
4186 }
4187 
Divide(Integer & remainder,Integer & quotient,const Integer & dividend,const Integer & divisor)4188 void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
4189 {
4190 	PositiveDivide(remainder, quotient, dividend, divisor);
4191 
4192 	if (dividend.IsNegative())
4193 	{
4194 		quotient.Negate();
4195 		if (remainder.NotZero())
4196 		{
4197 			--quotient;
4198 			remainder = divisor.AbsoluteValue() - remainder;
4199 		}
4200 	}
4201 
4202 	if (divisor.IsNegative())
4203 		quotient.Negate();
4204 }
4205 
DivideByPowerOf2(Integer & r,Integer & q,const Integer & a,unsigned int n)4206 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
4207 {
4208 	q = a;
4209 	q >>= n;
4210 
4211 	const size_t wordCount = BitsToWords(n);
4212 	if (wordCount <= a.WordCount())
4213 	{
4214 		r.reg.resize(RoundupSize(wordCount));
4215 		CopyWords(r.reg, a.reg, wordCount);
4216 		SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
4217 		if (n % WORD_BITS != 0)
4218 			r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
4219 	}
4220 	else
4221 	{
4222 		r.reg.resize(RoundupSize(a.WordCount()));
4223 		CopyWords(r.reg, a.reg, r.reg.size());
4224 	}
4225 	r.sign = POSITIVE;
4226 
4227 	if (a.IsNegative() && r.NotZero())
4228 	{
4229 		--q;
4230 		r = Power2(n) - r;
4231 	}
4232 }
4233 
DividedBy(const Integer & b) const4234 Integer Integer::DividedBy(const Integer &b) const
4235 {
4236 	Integer remainder, quotient;
4237 	Integer::Divide(remainder, quotient, *this, b);
4238 	return quotient;
4239 }
4240 
Modulo(const Integer & b) const4241 Integer Integer::Modulo(const Integer &b) const
4242 {
4243 	Integer remainder, quotient;
4244 	Integer::Divide(remainder, quotient, *this, b);
4245 	return remainder;
4246 }
4247 
Divide(word & remainder,Integer & quotient,const Integer & dividend,word divisor)4248 void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
4249 {
4250 	if (!divisor)
4251 		throw Integer::DivideByZero();
4252 
4253 	// IsPowerOf2 uses BMI on x86 if available. There is a small
4254 	// but measurable improvement during decryption and signing.
4255 	if (IsPowerOf2(divisor))
4256 	{
4257 		quotient = dividend >> (BitPrecision(divisor)-1);
4258 		remainder = dividend.reg[0] & (divisor-1);
4259 		return;
4260 	}
4261 
4262 	unsigned int i = dividend.WordCount();
4263 	quotient.reg.CleanNew(RoundupSize(i));
4264 	remainder = 0;
4265 	while (i--)
4266 	{
4267 		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
4268 		remainder = DWord(dividend.reg[i], remainder) % divisor;
4269 	}
4270 
4271 	if (dividend.NotNegative())
4272 		quotient.sign = POSITIVE;
4273 	else
4274 	{
4275 		quotient.sign = NEGATIVE;
4276 		if (remainder)
4277 		{
4278 			--quotient;
4279 			remainder = divisor - remainder;
4280 		}
4281 	}
4282 }
4283 
DividedBy(word b) const4284 Integer Integer::DividedBy(word b) const
4285 {
4286 	word remainder;
4287 	Integer quotient;
4288 	Integer::Divide(remainder, quotient, *this, b);
4289 	return quotient;
4290 }
4291 
Modulo(word divisor) const4292 word Integer::Modulo(word divisor) const
4293 {
4294 	if (!divisor)
4295 		throw Integer::DivideByZero();
4296 
4297 	word remainder;
4298 
4299 	// Profiling guided the flow below.
4300 	if ((divisor & (divisor-1)) != 0)	// divisor is not a power of 2
4301 	{
4302 		// Profiling guided the flow below.
4303 		unsigned int i = WordCount();
4304 		if (divisor > 5)
4305 		{
4306 			remainder = 0;
4307 			while (i--)
4308 				remainder = DWord(reg[i], remainder) % divisor;
4309 		}
4310 		else
4311 		{
4312 			DWord sum(0, 0);
4313 			while (i--)
4314 				sum += reg[i];
4315 			remainder = sum % divisor;
4316 		}
4317 	}
4318 	else  // divisor is a power of 2
4319 	{
4320 		remainder = reg[0] & (divisor-1);
4321 	}
4322 
4323 	if (IsNegative() && remainder)
4324 		remainder = divisor - remainder;
4325 
4326 	return remainder;
4327 }
4328 
Negate()4329 void Integer::Negate()
4330 {
4331 	if (!!(*this))	// don't flip sign if *this==0
4332 		sign = Sign(1-sign);
4333 }
4334 
PositiveCompare(const Integer & t) const4335 int Integer::PositiveCompare(const Integer& t) const
4336 {
4337 	// Profiling guided the flow below.
4338 	const unsigned size = WordCount(), tSize = t.WordCount();
4339 	if (size != tSize)
4340 		return size > tSize ? 1 : -1;
4341 	else
4342 		return CryptoPP::Compare(reg, t.reg, size);
4343 }
4344 
Compare(const Integer & t) const4345 int Integer::Compare(const Integer& t) const
4346 {
4347 	if (NotNegative())
4348 	{
4349 		if (t.NotNegative())
4350 			return PositiveCompare(t);
4351 		else
4352 			return 1;
4353 	}
4354 	else
4355 	{
4356 		if (t.NotNegative())
4357 			return -1;
4358 		else
4359 			return -PositiveCompare(t);
4360 	}
4361 }
4362 
SquareRoot() const4363 Integer Integer::SquareRoot() const
4364 {
4365 	if (!IsPositive())
4366 		return Zero();
4367 
4368 	// overestimate square root
4369 	Integer x, y = Power2((BitCount()+1)/2);
4370 	CRYPTOPP_ASSERT(y*y >= *this);
4371 
4372 	do
4373 	{
4374 		x = y;
4375 		y = (x + *this/x) >> 1;
4376 	} while (y<x);
4377 
4378 	return x;
4379 }
4380 
IsSquare() const4381 bool Integer::IsSquare() const
4382 {
4383 	Integer r = SquareRoot();
4384 	return *this == r.Squared();
4385 }
4386 
IsUnit() const4387 bool Integer::IsUnit() const
4388 {
4389 	return (WordCount() == 1) && (reg[0] == 1);
4390 }
4391 
MultiplicativeInverse() const4392 Integer Integer::MultiplicativeInverse() const
4393 {
4394 	return IsUnit() ? *this : Zero();
4395 }
4396 
a_times_b_mod_c(const Integer & x,const Integer & y,const Integer & m)4397 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
4398 {
4399 	CRYPTOPP_ASSERT(m.NotZero());
4400 	if (m.IsZero())
4401 		throw Integer::DivideByZero();
4402 
4403 	return x*y%m;
4404 }
4405 
a_exp_b_mod_c(const Integer & x,const Integer & e,const Integer & m)4406 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
4407 {
4408 	CRYPTOPP_ASSERT(m.NotZero());
4409 	if (m.IsZero())
4410 		throw Integer::DivideByZero();
4411 
4412 	ModularArithmetic mr(m);
4413 	return mr.Exponentiate(x, e);
4414 }
4415 
Gcd(const Integer & a,const Integer & b)4416 Integer Integer::Gcd(const Integer &a, const Integer &b)
4417 {
4418 	return EuclideanDomainOf<Integer>().Gcd(a, b);
4419 }
4420 
InverseMod(const Integer & m) const4421 Integer Integer::InverseMod(const Integer &m) const
4422 {
4423 	CRYPTOPP_ASSERT(m.NotNegative());
4424 	CRYPTOPP_ASSERT(m.NotZero());
4425 
4426 	if (IsNegative())
4427 		return Modulo(m).InverseModNext(m);
4428 
4429 	// http://github.com/weidai11/cryptopp/issues/602
4430 	if (*this >= m)
4431 		return Modulo(m).InverseModNext(m);
4432 
4433 	return InverseModNext(m);
4434 }
4435 
InverseModNext(const Integer & m) const4436 Integer Integer::InverseModNext(const Integer &m) const
4437 {
4438 	CRYPTOPP_ASSERT(m.NotNegative());
4439 	CRYPTOPP_ASSERT(m.NotZero());
4440 
4441 	if (m.IsEven())
4442 	{
4443 		if (!m || IsEven())
4444 			return Zero();	// no inverse
4445 		if (*this == One())
4446 			return One();
4447 
4448 		Integer u = m.Modulo(*this).InverseModNext(*this);
4449 		return !u ? Zero() : (m*(*this-u)+1)/(*this);
4450 	}
4451 
4452 	// AlmostInverse requires a 4x workspace
4453 	IntegerSecBlock T(m.reg.size() * 4);
4454 	Integer r((word)0, m.reg.size());
4455 	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
4456 	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
4457 	return r;
4458 }
4459 
InverseMod(word mod) const4460 word Integer::InverseMod(word mod) const
4461 {
4462 	CRYPTOPP_ASSERT(mod != 0);
4463 
4464 	word g0 = mod, g1 = *this % mod;
4465 	word v0 = 0, v1 = 1;
4466 	word y;
4467 
4468 	while (g1)
4469 	{
4470 		if (g1 == 1)
4471 			return v1;
4472 		y = g0 / g1;
4473 		g0 = g0 % g1;
4474 		v0 += y * v1;
4475 
4476 		if (!g0)
4477 			break;
4478 		if (g0 == 1)
4479 			return mod-v0;
4480 		y = g1 / g0;
4481 		g1 = g1 % g0;
4482 		v1 += y * v0;
4483 	}
4484 	return 0;
4485 }
4486 
4487 // ********************************************************
4488 
ModularArithmetic(BufferedTransformation & bt)4489 ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
4490 {
4491 	BERSequenceDecoder seq(bt);
4492 	OID oid(seq);
4493 	if (oid != ASN1::prime_field())
4494 		BERDecodeError();
4495 	m_modulus.BERDecode(seq);
4496 	seq.MessageEnd();
4497 	m_result.reg.resize(m_modulus.reg.size());
4498 }
4499 
DEREncode(BufferedTransformation & bt) const4500 void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
4501 {
4502 	DERSequenceEncoder seq(bt);
4503 	ASN1::prime_field().DEREncode(seq);
4504 	m_modulus.DEREncode(seq);
4505 	seq.MessageEnd();
4506 }
4507 
DEREncodeElement(BufferedTransformation & out,const Element & a) const4508 void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
4509 {
4510 	a.DEREncodeAsOctetString(out, MaxElementByteLength());
4511 }
4512 
BERDecodeElement(BufferedTransformation & in,Element & a) const4513 void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
4514 {
4515 	a.BERDecodeAsOctetString(in, MaxElementByteLength());
4516 }
4517 
Half(const Integer & a) const4518 const Integer& ModularArithmetic::Half(const Integer &a) const
4519 {
4520 	if (a.reg.size()==m_modulus.reg.size())
4521 	{
4522 		CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
4523 		return m_result;
4524 	}
4525 	else
4526 		return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
4527 }
4528 
Add(const Integer & a,const Integer & b) const4529 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
4530 {
4531 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4532 	{
4533 		if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
4534 			|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
4535 		{
4536 			CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4537 		}
4538 		return m_result;
4539 	}
4540 	else
4541 	{
4542 		m_result1 = a+b;
4543 		if (m_result1 >= m_modulus)
4544 			m_result1 -= m_modulus;
4545 		return m_result1;
4546 	}
4547 }
4548 
Accumulate(Integer & a,const Integer & b) const4549 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
4550 {
4551 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4552 	{
4553 		if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
4554 			|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
4555 		{
4556 			CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
4557 		}
4558 	}
4559 	else
4560 	{
4561 		a+=b;
4562 		if (a>=m_modulus)
4563 			a-=m_modulus;
4564 	}
4565 
4566 	return a;
4567 }
4568 
Subtract(const Integer & a,const Integer & b) const4569 const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
4570 {
4571 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4572 	{
4573 		if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
4574 			CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4575 		return m_result;
4576 	}
4577 	else
4578 	{
4579 		m_result1 = a-b;
4580 		if (m_result1.IsNegative())
4581 			m_result1 += m_modulus;
4582 		return m_result1;
4583 	}
4584 }
4585 
Reduce(Integer & a,const Integer & b) const4586 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
4587 {
4588 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4589 	{
4590 		if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
4591 			CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
4592 	}
4593 	else
4594 	{
4595 		a-=b;
4596 		if (a.IsNegative())
4597 			a+=m_modulus;
4598 	}
4599 
4600 	return a;
4601 }
4602 
Inverse(const Integer & a) const4603 const Integer& ModularArithmetic::Inverse(const Integer &a) const
4604 {
4605 	if (!a)
4606 		return a;
4607 
4608 	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
4609 	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
4610 		Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
4611 
4612 	return m_result;
4613 }
4614 
CascadeExponentiate(const Integer & x,const Integer & e1,const Integer & y,const Integer & e2) const4615 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
4616 {
4617 	if (m_modulus.IsOdd())
4618 	{
4619 		MontgomeryRepresentation dr(m_modulus);
4620 		return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
4621 	}
4622 	else
4623 		return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
4624 }
4625 
SimultaneousExponentiate(Integer * results,const Integer & base,const Integer * exponents,unsigned int exponentsCount) const4626 void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
4627 {
4628 	if (m_modulus.IsOdd())
4629 	{
4630 		MontgomeryRepresentation dr(m_modulus);
4631 		dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
4632 		for (unsigned int i=0; i<exponentsCount; i++)
4633 			results[i] = dr.ConvertOut(results[i]);
4634 	}
4635 	else
4636 		AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
4637 }
4638 
MontgomeryRepresentation(const Integer & m)4639 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)	// modulus must be odd
4640 	: ModularArithmetic(m),
4641 	  m_u((word)0, m_modulus.reg.size()),
4642 	  m_workspace(5*m_modulus.reg.size())
4643 {
4644 	if (!m_modulus.IsOdd())
4645 		throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4646 
4647 	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
4648 }
4649 
Multiply(const Integer & a,const Integer & b) const4650 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4651 {
4652 	word *const T = m_workspace.begin();
4653 	word *const R = m_result.reg.begin();
4654 	const size_t N = m_modulus.reg.size();
4655 	CRYPTOPP_ASSERT(a.reg.size()<=N && b.reg.size()<=N);
4656 
4657 	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4658 	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
4659 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4660 	return m_result;
4661 }
4662 
Square(const Integer & a) const4663 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4664 {
4665 	word *const T = m_workspace.begin();
4666 	word *const R = m_result.reg.begin();
4667 	const size_t N = m_modulus.reg.size();
4668 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4669 
4670 	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4671 	SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
4672 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4673 	return m_result;
4674 }
4675 
ConvertOut(const Integer & a) const4676 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4677 {
4678 	word *const T = m_workspace.begin();
4679 	word *const R = m_result.reg.begin();
4680 	const size_t N = m_modulus.reg.size();
4681 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4682 
4683 	CopyWords(T, a.reg, a.reg.size());
4684 	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4685 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4686 	return m_result;
4687 }
4688 
MultiplicativeInverse(const Integer & a) const4689 const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4690 {
4691 //	  return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
4692 	word *const T = m_workspace.begin();
4693 	word *const R = m_result.reg.begin();
4694 	const size_t N = m_modulus.reg.size();
4695 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4696 
4697 	CopyWords(T, a.reg, a.reg.size());
4698 	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4699 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4700 	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
4701 
4702 //	cout << "k=" << k << " N*32=" << 32*N << endl;
4703 
4704 	if (k>N*WORD_BITS)
4705 		DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
4706 	else
4707 		MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
4708 
4709 	return m_result;
4710 }
4711 
4712 // Specialization declared in misc.h to allow us to print integers
4713 //  with additional control options, like arbirary bases and uppercase.
4714 template <> CRYPTOPP_DLL
IntToString(Integer value,unsigned int base)4715 std::string IntToString<Integer>(Integer value, unsigned int base)
4716 {
4717 	// Hack... set the high bit for uppercase. Set the next bit fo a suffix.
4718 	static const unsigned int BIT_32 = (1U << 31);
4719 	const bool UPPER = !!(base & BIT_32);
4720 	static const unsigned int BIT_31 = (1U << 30);
4721 	const bool BASE = !!(base & BIT_31);
4722 
4723 	const char CH = UPPER ? 'A' : 'a';
4724 	base &= ~(BIT_32|BIT_31);
4725 	CRYPTOPP_ASSERT(base >= 2 && base <= 32);
4726 
4727 	if (value == 0)
4728 		return "0";
4729 
4730 	bool negative = false, zero = false;
4731 	if (value.IsNegative())
4732 	{
4733 		negative = true;
4734 		value.Negate();
4735 	}
4736 
4737 	if (!value)
4738 		zero = true;
4739 
4740 	SecBlock<char> s(value.BitCount() / (SaturatingSubtract1(BitPrecision(base),1U)) + 1);
4741 	Integer temp;
4742 
4743 	unsigned int i=0;
4744 	while (!!value)
4745 	{
4746 		word digit;
4747 		Integer::Divide(digit, temp, value, word(base));
4748 		s[i++]=char((digit < 10 ? '0' : (CH - 10)) + digit);
4749 		value.swap(temp);
4750 	}
4751 
4752 	std::string result;
4753 	result.reserve(i+2);
4754 
4755 	if (negative)
4756 		result += '-';
4757 
4758 	if (zero)
4759 		result += '0';
4760 
4761 	while (i--)
4762 		result += s[i];
4763 
4764 	if (BASE)
4765 	{
4766 		if (base == 10)
4767 			result += '.';
4768 		else if (base == 16)
4769 			result += 'h';
4770 		else if (base == 8)
4771 			result += 'o';
4772 		else if (base == 2)
4773 			result += 'b';
4774 	}
4775 
4776 	return result;
4777 }
4778 
4779 // Specialization declared in misc.h to avoid Coverity findings.
4780 template <> CRYPTOPP_DLL
IntToString(word64 value,unsigned int base)4781 std::string IntToString<word64>(word64 value, unsigned int base)
4782 {
4783 	// Hack... set the high bit for uppercase.
4784 	static const unsigned int HIGH_BIT = (1U << 31);
4785 	const char CH = !!(base & HIGH_BIT) ? 'A' : 'a';
4786 	base &= ~HIGH_BIT;
4787 
4788 	CRYPTOPP_ASSERT(base >= 2);
4789 	if (value == 0)
4790 		return "0";
4791 
4792 	std::string result;
4793 	while (value > 0)
4794 	{
4795 		word64 digit = value % base;
4796 		result = char((digit < 10 ? '0' : (CH - 10)) + digit) + result;
4797 		value /= base;
4798 	}
4799 	return result;
4800 }
4801 
4802 #ifndef CRYPTOPP_NO_ASSIGN_TO_INTEGER
4803 // Allow the linker to discard Integer code if not needed.
4804 // Also see http://github.com/weidai11/cryptopp/issues/389.
AssignIntToInteger(const std::type_info & valueType,void * pInteger,const void * pInt)4805 bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
4806 {
4807 	if (valueType != typeid(Integer))
4808 		return false;
4809 	*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
4810 	return true;
4811 }
4812 #endif  // CRYPTOPP_NO_ASSIGN_TO_INTEGER
4813 
4814 // *************************** C++ Static Initialization ***************************
4815 
4816 class InitInteger
4817 {
4818 public:
InitInteger()4819 	InitInteger()
4820 	{
4821 		SetFunctionPointers();
4822 	}
4823 };
4824 
4825 // This is not really needed because each Integer can dynamically initialize
4826 // itself, but we take a peephole optimization and initialize the class once
4827 // if init priorities are available. Dynamic initialization will be used if
4828 // init priorities are not available.
4829 
4830 #if defined(HAVE_GCC_INIT_PRIORITY)
4831 	const InitInteger s_init __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 10))) = InitInteger();
4832 	const Integer g_zero __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 11))) = Integer(0L);
4833 	const Integer g_one __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 12))) = Integer(1L);
4834 	const Integer g_two __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 13))) = Integer(2L);
4835 #elif defined(HAVE_MSC_INIT_PRIORITY)
4836 	#pragma warning(disable: 4075)
4837 	#pragma init_seg(".CRT$XCU")
4838 	const InitInteger s_init;
4839 	const Integer g_zero(0L);
4840 	const Integer g_one(1L);
4841 	const Integer g_two(2L);
4842 	#pragma warning(default: 4075)
4843 #elif HAVE_XLC_INIT_PRIORITY
4844 	// XLC needs constant, not a define
4845 	#pragma priority(280)
4846 	const InitInteger s_init;
4847 	const Integer g_zero(0L);
4848 	const Integer g_one(1L);
4849 	const Integer g_two(2L);
4850 #else
4851 	const InitInteger s_init;
4852 #endif
4853 
4854 // ***************** Library code ********************
4855 
Zero()4856 const Integer &Integer::Zero()
4857 {
4858 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4859 	return g_zero;
4860 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4861 	static const Integer s_zero(0L);
4862 	return s_zero;
4863 #else  // Potential memory leak. Avoid if possible.
4864 	return Singleton<Integer, NewInteger<0L> >().Ref();
4865 #endif
4866 }
4867 
One()4868 const Integer &Integer::One()
4869 {
4870 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4871 	return g_one;
4872 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4873 	static const Integer s_one(1L);
4874 	return s_one;
4875 #else  // Potential memory leak. Avoid if possible.
4876 	return Singleton<Integer, NewInteger<1L> >().Ref();
4877 #endif
4878 }
4879 
Two()4880 const Integer &Integer::Two()
4881 {
4882 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4883 	return g_two;
4884 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4885 	static const Integer s_two(2L);
4886 	return s_two;
4887 #else  // Potential memory leak. Avoid if possible.
4888 	return Singleton<Integer, NewInteger<2L> >().Ref();
4889 #endif
4890 }
4891 
4892 NAMESPACE_END
4893 
4894 #endif  // CRYPTOPP_IMPORTS
4895