1 // integer.cpp - originally written and placed in the public domain by Wei Dai
2 // contains public domain code contributed by Alister Lee and Leonard Janke
3 
4 // Notes by JW: The Integer class needs to do two things. First, it needs
5 //  to set function pointers on some platforms, like X86 and X64. The
6 //  function pointers select a fast multiply and addition based on the cpu.
7 //  Second, it wants to create Integer::Zero(), Integer::One() and
8 //  Integer::Two().
9 // The function pointers are initialized in the InitializeInteger class by
10 //  calling SetFunctionPointers(). The call to SetFunctionPointers() is
11 //  guarded to run once using a double-checked pattern. We don't use C++
12 //  std::call_once due to bad interactions between libstdc++, glibc and
13 //  pthreads. The bad interactions were causing crashes for us on platforms
14 //  like Sparc and PowerPC. Since we are only setting function pointers we
15 //  don't have to worry about leaking memory. The worst case seems to be the
16 //  pointers gets written twice until the init flag is set and visible to
17 //  all threads.
18 // For Integer::Zero(), Integer::One() and Integer::Two(), we use one of three
19 //  strategies. First, if initialization priorities are available then we use
20 //  them. Initialization priorities are init_priority() on Linux and init_seg()
21 //  on Windows. OS X and several other platforms lack them. Initialization
22 //  priorities are platform specific but they are also the most trouble free
23 //  with determisitic destruction.
24 // Second, if C++11 dynamic initialization is available, then we use it. After
25 //  the std::call_once fiasco we moved to dynamic initialization to avoid
26 //  unknown troubles platforms that are tested less frequently. In addition
27 //  Microsoft platforms mostly do not provide dynamic initialization.
28 //  The MSDN docs claim they do but they don't in practice because we need
29 //  Visual Studio 2017 and Windows 10 or above.
30 // Third, we fall back to Wei's original code of a Singleton. Wei's original
31 //  code was much simpler. It simply used the Singleton pattern, but it always
32 //  produced memory findings on some platforms. The Singleton generates memory
33 //  findings because it uses a Create On First Use pattern (a dumb Nifty
34 //  Counter) and the compiler had to be smart enough to fold them to return
35 //  the same object. Unix and Linux compilers do a good job of folding objects,
36 //  but Microsoft compilers do a rather poor job for some versions of the
37 //  compiler.
38 // Another problem with the Singleton is resource destruction requires running
39 //  resource acquisition in reverse. For resources provided through the
40 //  Singletons, there is no way to express the dependency order to safely
41 //  destroy resources. (That's one of the problems C++11 dynamic
42 //  intitialization with concurrent execution is supposed to solve).
43 // The final problem with Singletons is resource/memory exhaustion in languages
44 //  like Java and .Net. Java and .Net load and unload a native DLL hundreds or
45 //  thousands of times during the life of a program. Each load produces a
46 //  memory leak and they can add up quickly. If they library is being used in
47 //  Java or .Net then Singleton must be avoided at all costs.
48 //
49 // The code below has a path cut-in for BMI2 using mulx and adcx instructions.
50 //  There was a modest speedup of approximately 0.03 ms in public key Integer
51 //  operations. We had to disable BMI2 for the moment because some OS X machines
52 //  were advertising BMI/BMI2 support but caused SIGILL's at runtime. Also see
53 //  https://github.com/weidai11/cryptopp/issues/850.
54 
55 #include "pch.h"
56 #include "config.h"
57 
58 #ifndef CRYPTOPP_IMPORTS
59 
60 #include "integer.h"
61 #include "secblock.h"
62 #include "modarith.h"
63 #include "nbtheory.h"
64 #include "smartptr.h"
65 #include "algparam.h"
66 #include "filters.h"
67 #include "stdcpp.h"
68 #include "asn.h"
69 #include "oids.h"
70 #include "words.h"
71 #include "pubkey.h"		// for P1363_KDF2
72 #include "sha.h"
73 #include "cpu.h"
74 #include "misc.h"
75 
76 #include <iostream>
77 
78 #if (_MSC_VER >= 1400) && !defined(_M_ARM)
79 	#include <intrin.h>
80 #endif
81 
82 #ifdef __DECCXX
83 	#include <c_asm.h>
84 #endif
85 
86 // "Error: The operand ___LKDB cannot be assigned to",
87 // http://github.com/weidai11/cryptopp/issues/188
88 #if (__SUNPRO_CC >= 0x5130)
89 # define MAYBE_CONST
90 # define MAYBE_UNCONST_CAST(x) const_cast<word*>(x)
91 #else
92 # define MAYBE_CONST const
93 # define MAYBE_UNCONST_CAST(x) x
94 #endif
95 
96 // "Inline assembly operands don't work with .intel_syntax",
97 //  http://llvm.org/bugs/show_bug.cgi?id=24232
98 #if CRYPTOPP_BOOL_X32 || defined(CRYPTOPP_DISABLE_MIXED_ASM)
99 # undef CRYPTOPP_X86_ASM_AVAILABLE
100 # undef CRYPTOPP_X32_ASM_AVAILABLE
101 # undef CRYPTOPP_X64_ASM_AVAILABLE
102 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
103 # undef CRYPTOPP_SSSE3_ASM_AVAILABLE
104 #else
105 # define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86))
106 #endif
107 
108 // ***************** C++ Static Initialization ********************
109 
110 NAMESPACE_BEGIN(CryptoPP)
111 
112 // Function body near the middle of the file
113 static void SetFunctionPointers();
114 
115 // Use a double-checked pattern. We are not leaking anything so it
116 // does not matter if a pointer is written twice during a race.
117 // Avoid std::call_once due to too many problems on platforms like
118 // Solaris and Sparc. Also see
119 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=66146 and
120 // http://github.com/weidai11/cryptopp/issues/707.
InitializeInteger()121 InitializeInteger::InitializeInteger()
122 {
123 	static bool s_flag;
124 	MEMORY_BARRIER();
125 	if (s_flag == false)
126 	{
127 		SetFunctionPointers();
128 		s_flag = true;
129 		MEMORY_BARRIER();
130 	}
131 }
132 
133 template <long i>
134 struct NewInteger
135 {
operator ()NewInteger136 	Integer * operator()() const
137 	{
138 		return new Integer(i);
139 	}
140 };
141 
142 // ***************** Library code ********************
143 
Compare(const word * A,const word * B,size_t N)144 inline static int Compare(const word *A, const word *B, size_t N)
145 {
146 	while (N--)
147 		if (A[N] > B[N])
148 			return 1;
149 		else if (A[N] < B[N])
150 			return -1;
151 
152 	return 0;
153 }
154 
Increment(word * A,size_t N,word B=1)155 inline static int Increment(word *A, size_t N, word B=1)
156 {
157 	CRYPTOPP_ASSERT(N);
158 	word t = A[0];
159 	A[0] = t+B;
160 	if (A[0] >= t)
161 		return 0;
162 	for (unsigned i=1; i<N; i++)
163 		if (++A[i])
164 			return 0;
165 	return 1;
166 }
167 
Decrement(word * A,size_t N,word B=1)168 inline static int Decrement(word *A, size_t N, word B=1)
169 {
170 	CRYPTOPP_ASSERT(N);
171 	word t = A[0];
172 	A[0] = t-B;
173 	if (A[0] <= t)
174 		return 0;
175 	for (unsigned i=1; i<N; i++)
176 		if (A[i]--)
177 			return 0;
178 	return 1;
179 }
180 
TwosComplement(word * A,size_t N)181 static void TwosComplement(word *A, size_t N)
182 {
183 	Decrement(A, N);
184 	for (unsigned i=0; i<N; i++)
185 		A[i] = ~A[i];
186 }
187 
AtomicInverseModPower2(word A)188 static word AtomicInverseModPower2(word A)
189 {
190 	CRYPTOPP_ASSERT(A%2==1);
191 
192 	word R=A%8;
193 
194 	for (unsigned i=3; i<WORD_BITS; i*=2)
195 		R = R*(2-R*A);
196 
197 	CRYPTOPP_ASSERT(R*A==1);
198 	return R;
199 }
200 
201 // ********************************************************
202 
203 #if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || ((defined(__aarch64__) || defined(__x86_64__)) && defined(CRYPTOPP_WORD128_AVAILABLE))
204 	#define TWO_64_BIT_WORDS 1
205 	#define Declare2Words(x)			word x##0, x##1;
206 	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
207 	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
208 	#define LowWord(a)					a##0
209 	#define HighWord(a)					a##1
210 	#ifdef _MSC_VER
211 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = _umul128(a, b, &p1);
212 		#ifndef __INTEL_COMPILER
213 			#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
214 		#endif
215 	#elif defined(__aarch32__) || defined(__aarch64__)
216 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; asm ("umulh %0,%1,%2" : "=r"(p1) : "r"(a), "r"(b));
217 	#elif defined(__DECCXX)
218 		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
219 	#elif defined(__x86_64__)
220 		#if defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5100
221 			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
222 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
223 		#elif defined(__BMI2__) && 0
224 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulxq %3, %0, %1" : "=r"(p0), "=r"(p1) : "d"(a), "r"(b));
225 			#define MulAcc(c, d, a, b)		asm ("mulxq %6, %3, %4; addq %3, %0; adcxq %4, %1; adcxq %7, %2;" : "+&r"(c), "+&r"(d##0), "+&r"(d##1), "=&r"(p0), "=&r"(p1) : "d"(a), "r"(b), "r"(W64LIT(0)) : "cc");
226 			#define Double3Words(c, d)		asm ("addq %0, %0; adcxq %1, %1; adcxq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
227 			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcxq %3, %1;" : "+&r"(a##0), "+r"(a##1) : "r"(b), "r"(W64LIT(0)) : "cc");
228 			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcxq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
229 			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcxq %6, %1; adcxq %7, %2;" : "+r"(c), "=&r"(e##0), "=&r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1), "r"(W64LIT(0)) : "cc");
230 		#else
231 			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
232 			#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
233 			#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
234 			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
235 			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
236 			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
237 		#endif
238 	#endif
239 	#define MultiplyWords(p, a, b)		MultiplyWordsLoHi(p##0, p##1, a, b)
240 	#ifndef Double3Words
241 		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
242 	#endif
243 	#ifndef Acc2WordsBy2
244 		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
245 	#endif
246 	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
247 	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
248 	#define GetCarry(u)					u##1
249 	#define GetBorrow(u)				u##1
250 #else
251 	#define Declare2Words(x)			dword x;
252 	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_IA64))
253 		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
254 	#else
255 		#define MultiplyWords(p, a, b)		p = (dword)a*b;
256 	#endif
257 	#define AssignWord(a, b)			a = b;
258 	#define Add2WordsBy1(a, b, c)		a = b + c;
259 	#define Acc2WordsBy2(a, b)			a += b;
260 	#define LowWord(a)					word(a)
261 	#define HighWord(a)					word(a>>WORD_BITS)
262 	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
263 	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
264 	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
265 	#define GetCarry(u)					HighWord(u)
266 	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
267 #endif
268 #ifndef MulAcc
269 	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
270 #endif
271 #ifndef Acc2WordsBy1
272 	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
273 #endif
274 #ifndef Acc3WordsBy2
275 	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
276 #endif
277 
278 class DWord
279 {
280 public:
281 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
DWord()282 	DWord() {std::memset(&m_whole, 0x00, sizeof(m_whole));}
283 #else
284 	DWord() {std::memset(&m_halfs, 0x00, sizeof(m_halfs));}
285 #endif
286 
287 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
DWord(word low)288 	explicit DWord(word low) : m_whole(low) { }
289 #else
DWord(word low)290 	explicit DWord(word low)
291 	{
292 		m_halfs.high = 0;
293 		m_halfs.low = low;
294 	}
295 #endif
296 
297 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
DWord(word low,word high)298 	DWord(word low, word high) : m_whole()
299 #else
300 	DWord(word low, word high) : m_halfs()
301 #endif
302 	{
303 #if defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)
304 #  if (CRYPTOPP_LITTLE_ENDIAN)
305 		const word t[2] = {low,high};
306 		memcpy(&m_whole, t, sizeof(m_whole));
307 #  else
308 		const word t[2] = {high,low};
309 		memcpy(&m_whole, t, sizeof(m_whole));
310 #  endif
311 #else
312 		m_halfs.low = low;
313 		m_halfs.high = high;
314 #endif
315 	}
316 
Multiply(word a,word b)317 	static DWord Multiply(word a, word b)
318 	{
319 		DWord r;
320 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
321 			r.m_whole = (dword)a * b;
322 		#elif defined(MultiplyWordsLoHi)
323 			MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
324 		#else
325 			CRYPTOPP_ASSERT(0);
326 		#endif
327 		return r;
328 	}
329 
MultiplyAndAdd(word a,word b,word c)330 	static DWord MultiplyAndAdd(word a, word b, word c)
331 	{
332 		DWord r = Multiply(a, b);
333 		return r += c;
334 	}
335 
operator +=(word a)336 	DWord & operator+=(word a)
337 	{
338 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
339 			m_whole = m_whole + a;
340 		#else
341 			m_halfs.low += a;
342 			m_halfs.high += (m_halfs.low < a);
343 		#endif
344 		return *this;
345 	}
346 
operator +(word a)347 	DWord operator+(word a)
348 	{
349 		DWord r;
350 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
351 			r.m_whole = m_whole + a;
352 		#else
353 			r.m_halfs.low = m_halfs.low + a;
354 			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
355 		#endif
356 		return r;
357 	}
358 
operator -(DWord a)359 	DWord operator-(DWord a)
360 	{
361 		DWord r;
362 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
363 			r.m_whole = m_whole - a.m_whole;
364 		#else
365 			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
366 			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
367 		#endif
368 		return r;
369 	}
370 
operator -(word a)371 	DWord operator-(word a)
372 	{
373 		DWord r;
374 		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
375 			r.m_whole = m_whole - a;
376 		#else
377 			r.m_halfs.low = m_halfs.low - a;
378 			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
379 		#endif
380 		return r;
381 	}
382 
383 	// returns quotient, which must fit in a word
384 	word operator/(word divisor);
385 
386 	word operator%(word a);
387 
operator !() const388 	bool operator!() const
389 	{
390 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
391 		return !m_whole;
392 	#else
393 		return !m_halfs.high && !m_halfs.low;
394 	#endif
395 	}
396 
397 	// TODO: When NATIVE_DWORD is in effect, we access high and low, which are inactive
398 	//  union members, and that's UB. Also see http://stackoverflow.com/q/11373203.
GetLowHalf() const399 	word GetLowHalf() const {return m_halfs.low;}
GetHighHalf() const400 	word GetHighHalf() const {return m_halfs.high;}
GetHighHalfAsBorrow() const401 	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
402 
403 private:
404 	// Issue 274, "Types cannot be declared in anonymous union",
405 	//   http://github.com/weidai11/cryptopp/issues/274
406 	//   Thanks to Martin Bonner at http://stackoverflow.com/a/39507183
407     struct half_words
408     {
409     #if (CRYPTOPP_LITTLE_ENDIAN)
410         word low;
411         word high;
412     #else
413         word high;
414         word low;
415     #endif
416    };
417    union
418    {
419    #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
420        dword m_whole;
421    #endif
422        half_words m_halfs;
423    };
424 };
425 
426 class Word
427 {
428 public:
Word()429 	Word() : m_whole(0) {}
Word(word value)430 	Word(word value) : m_whole(value) {}
Word(hword low,hword high)431 	Word(hword low, hword high) : m_whole(low | (word(high) << (WORD_BITS/2))) {}
432 
Multiply(hword a,hword b)433 	static Word Multiply(hword a, hword b)
434 	{
435 		Word r;
436 		r.m_whole = (word)a * b;
437 		return r;
438 	}
439 
operator -(Word a)440 	Word operator-(Word a)
441 	{
442 		Word r;
443 		r.m_whole = m_whole - a.m_whole;
444 		return r;
445 	}
446 
operator -(hword a)447 	Word operator-(hword a)
448 	{
449 		Word r;
450 		r.m_whole = m_whole - a;
451 		return r;
452 	}
453 
454 	// returns quotient, which must fit in a word
operator /(hword divisor)455 	hword operator/(hword divisor)
456 	{
457 		return hword(m_whole / divisor);
458 	}
459 
operator !() const460 	bool operator!() const
461 	{
462 		return !m_whole;
463 	}
464 
GetWhole() const465 	word GetWhole() const {return m_whole;}
GetLowHalf() const466 	hword GetLowHalf() const {return hword(m_whole);}
GetHighHalf() const467 	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
GetHighHalfAsBorrow() const468 	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
469 
470 private:
471 	word m_whole;
472 };
473 
474 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
475 template <class S, class D>
476 S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULLPTR)
477 {
478 	CRYPTOPP_UNUSED(dummy);
479 
480 	// Assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
481 	CRYPTOPP_ASSERT(A[2] < B1 || (A[2]==B1 && A[1] < B0));
482 
483 	// Profiling guided the flow below.
484 
485 	// estimate the quotient: do a 2 S by 1 S divide.
486 	S Q; bool pre = (S(B1+1) == 0);
487 	if (B1 > 0 && !pre)
488 		Q = D(A[1], A[2]) / S(B1+1);
489 	else if (pre)
490 		Q = A[2];
491 	else
492 		Q = D(A[0], A[1]) / B0;
493 
494 	// now subtract Q*B from A
495 	D p = D::Multiply(B0, Q);
496 	D u = (D) A[0] - p.GetLowHalf();
497 	A[0] = u.GetLowHalf();
498 	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
499 	A[1] = u.GetLowHalf();
500 	A[2] += u.GetHighHalf();
501 
502 	// Q <= actual quotient, so fix it
503 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
504 	{
505 		u = (D) A[0] - B0;
506 		A[0] = u.GetLowHalf();
507 		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
508 		A[1] = u.GetLowHalf();
509 		A[2] += u.GetHighHalf();
510 		Q++;
511 		CRYPTOPP_ASSERT(Q);	// shouldn't overflow
512 	}
513 
514 	return Q;
515 }
516 
517 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
518 template <class S, class D>
DivideFourWordsByTwo(S * T,const D & Al,const D & Ah,const D & B)519 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
520 {
521 	// Profiling guided the flow below.
522 
523 	if (!!B)
524 	{
525 		S Q[2];
526 		T[0] = Al.GetLowHalf();
527 		T[1] = Al.GetHighHalf();
528 		T[2] = Ah.GetLowHalf();
529 		T[3] = Ah.GetHighHalf();
530 		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
531 		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
532 		return D(Q[0], Q[1]);
533 	}
534 	else  // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
535 	{
536 		return D(Ah.GetLowHalf(), Ah.GetHighHalf());
537 	}
538 }
539 
540 // returns quotient, which must fit in a word
operator /(word a)541 inline word DWord::operator/(word a)
542 {
543 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
544 		return word(m_whole / a);
545 	#else
546 		hword r[4];
547 		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
548 	#endif
549 }
550 
operator %(word a)551 inline word DWord::operator%(word a)
552 {
553 	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
554 		return word(m_whole % a);
555 	#else
556 		if (a < (word(1) << (WORD_BITS/2)))
557 		{
558 			hword h = hword(a);
559 			word r = m_halfs.high % h;
560 			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
561 			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
562 		}
563 		else
564 		{
565 			hword r[4];
566 			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
567 			return Word(r[0], r[1]).GetWhole();
568 		}
569 	#endif
570 }
571 
572 // ********************************************************
573 
574 // Use some tricks to share assembly code between MSVC, GCC, Clang and Sun CC.
575 #if defined(__GNUC__)
576 	#define AddPrologue \
577 		int result;	\
578 		__asm__ __volatile__ \
579 		( \
580 			INTEL_NOPREFIX
581 	#define AddEpilogue \
582 			ATT_PREFIX \
583 					: "=a" (result)\
584 					: "d" (C), "a" (A), "D" (B), "c" (N) \
585 					: "%esi", "memory", "cc" \
586 		);\
587 		return result;
588 	#define MulPrologue \
589 		__asm__ __volatile__ \
590 		( \
591 			INTEL_NOPREFIX \
592 			AS1(	push	ebx) \
593 			AS2(	mov		ebx, edx)
594 	#define MulEpilogue \
595 			AS1(	pop		ebx) \
596 			ATT_PREFIX \
597 			: \
598 			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
599 			: "%esi", "memory", "cc" \
600 		);
601 	#define SquPrologue		MulPrologue
602 	#define SquEpilogue	\
603 			AS1(	pop		ebx) \
604 			ATT_PREFIX \
605 			: \
606 			: "d" (s_maskLow16), "c" (C), "a" (A) \
607 			: "%esi", "%edi", "memory", "cc" \
608 		);
609 	#define TopPrologue		MulPrologue
610 	#define TopEpilogue	\
611 			AS1(	pop		ebx) \
612 			ATT_PREFIX \
613 			: \
614 			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
615 			: "memory", "cc" \
616 		);
617 #else
618 	#define AddPrologue \
619 		__asm	push edi \
620 		__asm	push esi \
621 		__asm	mov		eax, [esp+12] \
622 		__asm	mov		edi, [esp+16]
623 	#define AddEpilogue \
624 		__asm	pop esi \
625 		__asm	pop edi \
626 		__asm	ret 8
627 	#define SaveEBX
628 	#define RestoreEBX
629 	#define SquPrologue					\
630 		AS2(	mov		eax, A)			\
631 		AS2(	mov		ecx, C)			\
632 		SaveEBX							\
633 		AS2(	lea		ebx, s_maskLow16)
634 	#define MulPrologue					\
635 		AS2(	mov		eax, A)			\
636 		AS2(	mov		edi, B)			\
637 		AS2(	mov		ecx, C)			\
638 		SaveEBX							\
639 		AS2(	lea		ebx, s_maskLow16)
640 	#define TopPrologue					\
641 		AS2(	mov		eax, A)			\
642 		AS2(	mov		edi, B)			\
643 		AS2(	mov		ecx, C)			\
644 		AS2(	mov		esi, L)			\
645 		SaveEBX							\
646 		AS2(	lea		ebx, s_maskLow16)
647 	#define SquEpilogue		RestoreEBX
648 	#define MulEpilogue		RestoreEBX
649 	#define TopEpilogue		RestoreEBX
650 #endif
651 
652 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
653 extern "C" {
654 int Baseline_Add(size_t N, word *C, const word *A, const word *B);
655 int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
656 }
657 #elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
Baseline_Add(size_t N,word * C,const word * A,const word * B)658 int Baseline_Add(size_t N, word *C, const word *A, const word *B)
659 {
660 	word result;
661 	__asm__ __volatile__
662 	(
663 	INTEL_NOPREFIX
664 	AS1(	neg		%1)
665 	ASJ(	jz,		1, f)
666 	AS2(	mov		%0,[%3+8*%1])
667 	AS2(	add		%0,[%4+8*%1])
668 	AS2(	mov		[%2+8*%1],%0)
669 	ASL(0)
670 	AS2(	mov		%0,[%3+8*%1+8])
671 	AS2(	adc		%0,[%4+8*%1+8])
672 	AS2(	mov		[%2+8*%1+8],%0)
673 	AS2(	lea		%1,[%1+2])
674 	ASJ(	jrcxz,	1, f)
675 	AS2(	mov		%0,[%3+8*%1])
676 	AS2(	adc		%0,[%4+8*%1])
677 	AS2(	mov		[%2+8*%1],%0)
678 	ASJ(	jmp,	0, b)
679 	ASL(1)
680 	AS2(	mov		%0, 0)
681 	AS2(	adc		%0, %0)
682 	ATT_NOPREFIX
683 	: "=&r" (result), "+c" (N)
684 	: "r" (C+N), "r" (A+N), "r" (B+N)
685 	: "memory", "cc"
686 	);
687 	return (int)result;
688 }
689 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)690 int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
691 {
692 	word result;
693 	__asm__ __volatile__
694 	(
695 	INTEL_NOPREFIX
696 	AS1(	neg		%1)
697 	ASJ(	jz,		1, f)
698 	AS2(	mov		%0,[%3+8*%1])
699 	AS2(	sub		%0,[%4+8*%1])
700 	AS2(	mov		[%2+8*%1],%0)
701 	ASL(0)
702 	AS2(	mov		%0,[%3+8*%1+8])
703 	AS2(	sbb		%0,[%4+8*%1+8])
704 	AS2(	mov		[%2+8*%1+8],%0)
705 	AS2(	lea		%1,[%1+2])
706 	ASJ(	jrcxz,	1, f)
707 	AS2(	mov		%0,[%3+8*%1])
708 	AS2(	sbb		%0,[%4+8*%1])
709 	AS2(	mov		[%2+8*%1],%0)
710 	ASJ(	jmp,	0, b)
711 	ASL(1)
712 	AS2(	mov		%0, 0)
713 	AS2(	adc		%0, %0)
714 	ATT_NOPREFIX
715 	: "=&r" (result), "+c" (N)
716 	: "r" (C+N), "r" (A+N), "r" (B+N)
717 	: "memory", "cc"
718 	);
719 	return (int)result;
720 }
721 #elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
Baseline_Add(size_t N,word * C,const word * A,const word * B)722 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
723 {
724 	AddPrologue
725 
726 	// now: eax = A, edi = B, edx = C, ecx = N
727 	AS2(	lea		eax, [eax+4*ecx])
728 	AS2(	lea		edi, [edi+4*ecx])
729 	AS2(	lea		edx, [edx+4*ecx])
730 
731 	AS1(	neg		ecx)				// ecx is negative index
732 	AS2(	test	ecx, 2)				// this clears carry flag
733 	ASJ(	jz,		0, f)
734 	AS2(	sub		ecx, 2)
735 	ASJ(	jmp,	1, f)
736 
737 	ASL(0)
738 	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
739 	AS2(	mov		esi,[eax+4*ecx])
740 	AS2(	adc		esi,[edi+4*ecx])
741 	AS2(	mov		[edx+4*ecx],esi)
742 	AS2(	mov		esi,[eax+4*ecx+4])
743 	AS2(	adc		esi,[edi+4*ecx+4])
744 	AS2(	mov		[edx+4*ecx+4],esi)
745 	ASL(1)
746 	AS2(	mov		esi,[eax+4*ecx+8])
747 	AS2(	adc		esi,[edi+4*ecx+8])
748 	AS2(	mov		[edx+4*ecx+8],esi)
749 	AS2(	mov		esi,[eax+4*ecx+12])
750 	AS2(	adc		esi,[edi+4*ecx+12])
751 	AS2(	mov		[edx+4*ecx+12],esi)
752 
753 	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
754 	ASJ(	jmp,	0, b)
755 
756 	ASL(2)
757 	AS2(	mov		eax, 0)
758 	AS1(	setc	al)					// store carry into eax (return result register)
759 
760 	AddEpilogue
761 
762 	// http://github.com/weidai11/cryptopp/issues/340
763 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
764 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
765 }
766 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)767 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
768 {
769 	AddPrologue
770 
771 	// now: eax = A, edi = B, edx = C, ecx = N
772 	AS2(	lea		eax, [eax+4*ecx])
773 	AS2(	lea		edi, [edi+4*ecx])
774 	AS2(	lea		edx, [edx+4*ecx])
775 
776 	AS1(	neg		ecx)				// ecx is negative index
777 	AS2(	test	ecx, 2)				// this clears carry flag
778 	ASJ(	jz,		0, f)
779 	AS2(	sub		ecx, 2)
780 	ASJ(	jmp,	1, f)
781 
782 	ASL(0)
783 	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
784 	AS2(	mov		esi,[eax+4*ecx])
785 	AS2(	sbb		esi,[edi+4*ecx])
786 	AS2(	mov		[edx+4*ecx],esi)
787 	AS2(	mov		esi,[eax+4*ecx+4])
788 	AS2(	sbb		esi,[edi+4*ecx+4])
789 	AS2(	mov		[edx+4*ecx+4],esi)
790 	ASL(1)
791 	AS2(	mov		esi,[eax+4*ecx+8])
792 	AS2(	sbb		esi,[edi+4*ecx+8])
793 	AS2(	mov		[edx+4*ecx+8],esi)
794 	AS2(	mov		esi,[eax+4*ecx+12])
795 	AS2(	sbb		esi,[edi+4*ecx+12])
796 	AS2(	mov		[edx+4*ecx+12],esi)
797 
798 	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
799 	ASJ(	jmp,	0, b)
800 
801 	ASL(2)
802 	AS2(	mov		eax, 0)
803 	AS1(	setc	al)					// store carry into eax (return result register)
804 
805 	AddEpilogue
806 
807 	// http://github.com/weidai11/cryptopp/issues/340
808 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
809 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
810 }
811 
812 #if CRYPTOPP_INTEGER_SSE2
SSE2_Add(size_t N,word * C,const word * A,const word * B)813 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
814 {
815 	AddPrologue
816 
817 	// now: eax = A, edi = B, edx = C, ecx = N
818 	AS2(	lea		eax, [eax+4*ecx])
819 	AS2(	lea		edi, [edi+4*ecx])
820 	AS2(	lea		edx, [edx+4*ecx])
821 
822 	AS1(	neg		ecx)				// ecx is negative index
823 	AS2(	pxor    mm2, mm2)
824 	ASJ(	jz,		2, f)
825 	AS2(	test	ecx, 2)				// this clears carry flag
826 	ASJ(	jz,		0, f)
827 	AS2(	sub		ecx, 2)
828 	ASJ(	jmp,	1, f)
829 
830 	ASL(0)
831 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
832 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
833 	AS2(	paddq    mm0, mm1)
834 	AS2(	paddq	 mm2, mm0)
835 	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
836 	AS2(	psrlq    mm2, 32)
837 
838 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
839 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
840 	AS2(	paddq    mm0, mm1)
841 	AS2(	paddq	 mm2, mm0)
842 	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
843 	AS2(	psrlq    mm2, 32)
844 
845 	ASL(1)
846 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
847 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
848 	AS2(	paddq    mm0, mm1)
849 	AS2(	paddq	 mm2, mm0)
850 	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
851 	AS2(	psrlq    mm2, 32)
852 
853 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
854 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
855 	AS2(	paddq    mm0, mm1)
856 	AS2(	paddq	 mm2, mm0)
857 	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
858 	AS2(	psrlq    mm2, 32)
859 
860 	AS2(	add		ecx, 4)
861 	ASJ(	jnz,	0, b)
862 
863 	ASL(2)
864 	AS2(	movd	eax, mm2)
865 	AS1(	emms)
866 
867 	AddEpilogue
868 
869 	// http://github.com/weidai11/cryptopp/issues/340
870 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
871 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
872 }
SSE2_Sub(size_t N,word * C,const word * A,const word * B)873 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
874 {
875 	AddPrologue
876 
877 	// now: eax = A, edi = B, edx = C, ecx = N
878 	AS2(	lea		eax, [eax+4*ecx])
879 	AS2(	lea		edi, [edi+4*ecx])
880 	AS2(	lea		edx, [edx+4*ecx])
881 
882 	AS1(	neg		ecx)				// ecx is negative index
883 	AS2(	pxor    mm2, mm2)
884 	ASJ(	jz,		2, f)
885 	AS2(	test	ecx, 2)				// this clears carry flag
886 	ASJ(	jz,		0, f)
887 	AS2(	sub		ecx, 2)
888 	ASJ(	jmp,	1, f)
889 
890 	ASL(0)
891 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
892 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
893 	AS2(	psubq    mm0, mm1)
894 	AS2(	psubq	 mm0, mm2)
895 	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
896 	AS2(	psrlq    mm0, 63)
897 
898 	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])
899 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
900 	AS2(	psubq    mm2, mm1)
901 	AS2(	psubq	 mm2, mm0)
902 	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
903 	AS2(	psrlq    mm2, 63)
904 
905 	ASL(1)
906 	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
907 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
908 	AS2(	psubq    mm0, mm1)
909 	AS2(	psubq	 mm0, mm2)
910 	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
911 	AS2(	psrlq    mm0, 63)
912 
913 	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
914 	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
915 	AS2(	psubq    mm2, mm1)
916 	AS2(	psubq	 mm2, mm0)
917 	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
918 	AS2(	psrlq    mm2, 63)
919 
920 	AS2(	add		ecx, 4)
921 	ASJ(	jnz,	0, b)
922 
923 	ASL(2)
924 	AS2(	movd	eax, mm2)
925 	AS1(	emms)
926 
927 	AddEpilogue
928 
929 	// http://github.com/weidai11/cryptopp/issues/340
930 	CRYPTOPP_UNUSED(A); CRYPTOPP_UNUSED(B);
931 	CRYPTOPP_UNUSED(C); CRYPTOPP_UNUSED(N);
932 }
933 #endif	// CRYPTOPP_INTEGER_SSE2
934 #else   // CRYPTOPP_SSE2_ASM_AVAILABLE
Baseline_Add(size_t N,word * C,const word * A,const word * B)935 int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
936 {
937 	CRYPTOPP_ASSERT (N%2 == 0);
938 
939 	Declare2Words(u);
940 	AssignWord(u, 0);
941 	for (size_t i=0; i<N; i+=2)
942 	{
943 		AddWithCarry(u, A[i], B[i]);
944 		C[i] = LowWord(u);
945 		AddWithCarry(u, A[i+1], B[i+1]);
946 		C[i+1] = LowWord(u);
947 	}
948 	return int(GetCarry(u));
949 }
950 
Baseline_Sub(size_t N,word * C,const word * A,const word * B)951 int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
952 {
953 	CRYPTOPP_ASSERT (N%2 == 0);
954 
955 	Declare2Words(u);
956 	AssignWord(u, 0);
957 	for (size_t i=0; i<N; i+=2)
958 	{
959 		SubtractWithBorrow(u, A[i], B[i]);
960 		C[i] = LowWord(u);
961 		SubtractWithBorrow(u, A[i+1], B[i+1]);
962 		C[i+1] = LowWord(u);
963 	}
964 	return int(GetBorrow(u));
965 }
966 #endif
967 
LinearMultiply(word * C,const word * AA,word B,size_t N)968 static word LinearMultiply(word *C, const word *AA, word B, size_t N)
969 {
970 	// http://github.com/weidai11/cryptopp/issues/188
971 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
972 
973 	word carry=0;
974 	for(unsigned i=0; i<N; i++)
975 	{
976 		Declare2Words(p);
977 		MultiplyWords(p, A[i], B);
978 		Acc2WordsBy1(p, carry);
979 		C[i] = LowWord(p);
980 		carry = HighWord(p);
981 	}
982 	return carry;
983 }
984 
985 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
986 
987 #define Mul_2 \
988 	Mul_Begin(2) \
989 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
990 	Mul_End(1, 1)
991 
992 #define Mul_4 \
993 	Mul_Begin(4) \
994 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
995 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
996 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
997 	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
998 	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
999 	Mul_End(5, 3)
1000 
1001 #define Mul_8 \
1002 	Mul_Begin(8) \
1003 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1004 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
1005 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1006 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1007 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1008 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1009 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1010 	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1011 	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1012 	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1013 	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1014 	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1015 	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
1016 	Mul_End(13, 7)
1017 
1018 #define Mul_16 \
1019 	Mul_Begin(16) \
1020 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1021 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
1022 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1023 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1024 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1025 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1026 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1027 	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1028 	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1029 	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1030 	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1031 	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1032 	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1033 	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1034 	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1035 	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1036 	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1037 	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1038 	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1039 	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1040 	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1041 	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1042 	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1043 	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1044 	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1045 	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1046 	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1047 	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1048 	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
1049 	Mul_End(29, 15)
1050 
1051 #define Squ_2 \
1052 	Squ_Begin(2) \
1053 	Squ_End(2)
1054 
1055 #define Squ_4 \
1056 	Squ_Begin(4) \
1057 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1058 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1059 	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
1060 	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
1061 	Squ_End(4)
1062 
1063 #define Squ_8 \
1064 	Squ_Begin(8) \
1065 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1066 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1067 	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
1068 	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
1069 	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
1070 	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
1071 	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
1072 	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
1073 	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
1074 	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
1075 	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
1076 	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
1077 	Squ_End(8)
1078 
1079 #define Squ_16 \
1080 	Squ_Begin(16) \
1081 	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
1082 	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
1083 	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
1084 	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
1085 	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
1086 	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
1087 	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
1088 	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
1089 	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
1090 	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
1091 	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
1092 	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
1093 	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
1094 	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
1095 	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
1096 	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
1097 	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
1098 	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
1099 	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
1100 	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
1101 	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
1102 	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
1103 	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
1104 	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
1105 	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
1106 	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
1107 	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
1108 	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
1109 	Squ_End(16)
1110 
1111 #define Bot_2 \
1112 	Mul_Begin(2) \
1113 	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
1114 	Bot_End(2)
1115 
1116 #define Bot_4 \
1117 	Mul_Begin(4) \
1118 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1119 	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
1120 	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
1121 	Bot_End(4)
1122 
1123 #define Bot_8 \
1124 	Mul_Begin(8) \
1125 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1126 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
1127 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1128 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1129 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1130 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1131 	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
1132 	Bot_End(8)
1133 
1134 #define Bot_16 \
1135 	Mul_Begin(16) \
1136 	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
1137 	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
1138 	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1139 	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
1140 	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
1141 	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
1142 	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1143 	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
1144 	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
1145 	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1146 	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1147 	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1148 	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1149 	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1150 	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1151 	Bot_End(16)
1152 
1153 #endif
1154 
1155 #if 0
1156 #define Mul_Begin(n)				\
1157 	Declare2Words(p)				\
1158 	Declare2Words(c)				\
1159 	Declare2Words(d)				\
1160 	MultiplyWords(p, A[0], B[0])	\
1161 	AssignWord(c, LowWord(p))		\
1162 	AssignWord(d, HighWord(p))
1163 
1164 #define Mul_Acc(i, j)				\
1165 	MultiplyWords(p, A[i], B[j])	\
1166 	Acc2WordsBy1(c, LowWord(p))		\
1167 	Acc2WordsBy1(d, HighWord(p))
1168 
1169 #define Mul_SaveAcc(k, i, j) 		\
1170 	R[k] = LowWord(c);				\
1171 	Add2WordsBy1(c, d, HighWord(c))	\
1172 	MultiplyWords(p, A[i], B[j])	\
1173 	AssignWord(d, HighWord(p))		\
1174 	Acc2WordsBy1(c, LowWord(p))
1175 
1176 #define Mul_End(n)					\
1177 	R[2*n-3] = LowWord(c);			\
1178 	Acc2WordsBy1(d, HighWord(c))	\
1179 	MultiplyWords(p, A[n-1], B[n-1])\
1180 	Acc2WordsBy2(d, p)				\
1181 	R[2*n-2] = LowWord(d);			\
1182 	R[2*n-1] = HighWord(d);
1183 
1184 #define Bot_SaveAcc(k, i, j)		\
1185 	R[k] = LowWord(c);				\
1186 	word e = LowWord(d) + HighWord(c);	\
1187 	e += A[i] * B[j];
1188 
1189 #define Bot_Acc(i, j)	\
1190 	e += A[i] * B[j];
1191 
1192 #define Bot_End(n)		\
1193 	R[n-1] = e;
1194 #else
1195 #define Mul_Begin(n)				\
1196 	Declare2Words(p)				\
1197 	word c;	\
1198 	Declare2Words(d)				\
1199 	MultiplyWords(p, A[0], B[0])	\
1200 	c = LowWord(p);		\
1201 	AssignWord(d, HighWord(p))
1202 
1203 #define Mul_Acc(i, j)				\
1204 	MulAcc(c, d, A[i], B[j])
1205 
1206 #define Mul_SaveAcc(k, i, j) 		\
1207 	R[k] = c;				\
1208 	c = LowWord(d);	\
1209 	AssignWord(d, HighWord(d))	\
1210 	MulAcc(c, d, A[i], B[j])
1211 
1212 #define Mul_End(k, i)					\
1213 	R[k] = c;			\
1214 	MultiplyWords(p, A[i], B[i])	\
1215 	Acc2WordsBy2(p, d)				\
1216 	R[k+1] = LowWord(p);			\
1217 	R[k+2] = HighWord(p);
1218 
1219 #define Bot_SaveAcc(k, i, j)		\
1220 	R[k] = c;				\
1221 	c = LowWord(d);	\
1222 	c += A[i] * B[j];
1223 
1224 #define Bot_Acc(i, j)	\
1225 	c += A[i] * B[j];
1226 
1227 #define Bot_End(n)		\
1228 	R[n-1] = c;
1229 #endif
1230 
1231 #define Squ_Begin(n)				\
1232 	Declare2Words(p)				\
1233 	word c;				\
1234 	Declare2Words(d)				\
1235 	Declare2Words(e)				\
1236 	MultiplyWords(p, A[0], A[0])	\
1237 	R[0] = LowWord(p);				\
1238 	AssignWord(e, HighWord(p))		\
1239 	MultiplyWords(p, A[0], A[1])	\
1240 	c = LowWord(p);		\
1241 	AssignWord(d, HighWord(p))		\
1242 	Squ_NonDiag						\
1243 
1244 #define Squ_NonDiag				\
1245 	Double3Words(c, d)
1246 
1247 #define Squ_SaveAcc(k, i, j) 		\
1248 	Acc3WordsBy2(c, d, e)			\
1249 	R[k] = c;				\
1250 	MultiplyWords(p, A[i], A[j])	\
1251 	c = LowWord(p);		\
1252 	AssignWord(d, HighWord(p))		\
1253 
1254 #define Squ_Acc(i, j)				\
1255 	MulAcc(c, d, A[i], A[j])
1256 
1257 #define Squ_Diag(i)					\
1258 	Squ_NonDiag						\
1259 	MulAcc(c, d, A[i], A[i])
1260 
1261 #define Squ_End(n)					\
1262 	Acc3WordsBy2(c, d, e)			\
1263 	R[2*n-3] = c;			\
1264 	MultiplyWords(p, A[n-1], A[n-1])\
1265 	Acc2WordsBy2(p, e)				\
1266 	R[2*n-2] = LowWord(p);			\
1267 	R[2*n-1] = HighWord(p);
1268 
1269 
Baseline_Multiply2(word * R,const word * AA,const word * BB)1270 void Baseline_Multiply2(word *R, const word *AA, const word *BB)
1271 {
1272 	// http://github.com/weidai11/cryptopp/issues/188
1273 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1274 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1275 
1276 	Mul_2
1277 }
1278 
Baseline_Multiply4(word * R,const word * AA,const word * BB)1279 void Baseline_Multiply4(word *R, const word *AA, const word *BB)
1280 {
1281 	// http://github.com/weidai11/cryptopp/issues/188
1282 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1283 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1284 
1285 	Mul_4
1286 }
1287 
Baseline_Multiply8(word * R,const word * AA,const word * BB)1288 void Baseline_Multiply8(word *R, const word *AA, const word *BB)
1289 {
1290 	// http://github.com/weidai11/cryptopp/issues/188
1291 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1292 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1293 
1294 	Mul_8
1295 }
1296 
Baseline_Square2(word * R,const word * AA)1297 void Baseline_Square2(word *R, const word *AA)
1298 {
1299 	// http://github.com/weidai11/cryptopp/issues/188
1300 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1301 
1302 	Squ_2
1303 }
1304 
Baseline_Square4(word * R,const word * AA)1305 void Baseline_Square4(word *R, const word *AA)
1306 {
1307 	// http://github.com/weidai11/cryptopp/issues/188
1308 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1309 
1310 	Squ_4
1311 }
1312 
Baseline_Square8(word * R,const word * AA)1313 void Baseline_Square8(word *R, const word *AA)
1314 {
1315 	// http://github.com/weidai11/cryptopp/issues/188
1316 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1317 
1318 	Squ_8
1319 }
1320 
Baseline_MultiplyBottom2(word * R,const word * AA,const word * BB)1321 void Baseline_MultiplyBottom2(word *R, const word *AA, const word *BB)
1322 {
1323 	// http://github.com/weidai11/cryptopp/issues/188
1324 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1325 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1326 
1327 	Bot_2
1328 
1329 // http://github.com/weidai11/cryptopp/issues/340
1330 #if defined(TWO_64_BIT_WORDS)
1331 	CRYPTOPP_UNUSED(d0); CRYPTOPP_UNUSED(d1);
1332 #endif
1333 }
1334 
Baseline_MultiplyBottom4(word * R,const word * AA,const word * BB)1335 void Baseline_MultiplyBottom4(word *R, const word *AA, const word *BB)
1336 {
1337 	// http://github.com/weidai11/cryptopp/issues/188
1338 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1339 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1340 
1341 	Bot_4
1342 }
1343 
Baseline_MultiplyBottom8(word * R,const word * AA,const word * BB)1344 void Baseline_MultiplyBottom8(word *R, const word *AA, const word *BB)
1345 {
1346 	// http://github.com/weidai11/cryptopp/issues/188
1347 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1348 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1349 
1350 	Bot_8
1351 }
1352 
1353 #define Top_Begin(n)				\
1354 	Declare2Words(p)				\
1355 	word c;	\
1356 	Declare2Words(d)				\
1357 	MultiplyWords(p, A[0], B[n-2]);\
1358 	AssignWord(d, HighWord(p));
1359 
1360 #define Top_Acc(i, j)	\
1361 	MultiplyWords(p, A[i], B[j]);\
1362 	Acc2WordsBy1(d, HighWord(p));
1363 
1364 #define Top_SaveAcc0(i, j) 		\
1365 	c = LowWord(d);	\
1366 	AssignWord(d, HighWord(d))	\
1367 	MulAcc(c, d, A[i], B[j])
1368 
1369 #define Top_SaveAcc1(i, j) 		\
1370 	c = L<c; \
1371 	Acc2WordsBy1(d, c);	\
1372 	c = LowWord(d);	\
1373 	AssignWord(d, HighWord(d))	\
1374 	MulAcc(c, d, A[i], B[j])
1375 
Baseline_MultiplyTop2(word * R,const word * A,const word * B,word L)1376 void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1377 {
1378 	CRYPTOPP_UNUSED(L);
1379 	word T[4];
1380 	Baseline_Multiply2(T, A, B);
1381 	R[0] = T[2];
1382 	R[1] = T[3];
1383 }
1384 
Baseline_MultiplyTop4(word * R,const word * AA,const word * BB,word L)1385 void Baseline_MultiplyTop4(word *R, const word *AA, const word *BB, word L)
1386 {
1387 	// http://github.com/weidai11/cryptopp/issues/188
1388 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1389 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1390 
1391 	Top_Begin(4)
1392 	Top_Acc(1, 1) Top_Acc(2, 0)  \
1393 	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1394 	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
1395 	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1396 	Mul_End(1, 3)
1397 }
1398 
Baseline_MultiplyTop8(word * R,const word * AA,const word * BB,word L)1399 void Baseline_MultiplyTop8(word *R, const word *AA, const word *BB, word L)
1400 {
1401 	// http://github.com/weidai11/cryptopp/issues/188
1402 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1403 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1404 
1405 	Top_Begin(8)
1406 	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1407 	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1408 	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1409 	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1410 	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1411 	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1412 	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1413 	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1414 	Mul_End(5, 7)
1415 }
1416 
1417 #if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
Baseline_Multiply16(word * R,const word * AA,const word * BB)1418 void Baseline_Multiply16(word *R, const word *AA, const word *BB)
1419 {
1420 	// http://github.com/weidai11/cryptopp/issues/188
1421 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1422 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1423 
1424 	Mul_16
1425 }
1426 
Baseline_Square16(word * R,const word * AA)1427 void Baseline_Square16(word *R, const word *AA)
1428 {
1429 	// http://github.com/weidai11/cryptopp/issues/188
1430 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1431 
1432 	Squ_16
1433 }
1434 
Baseline_MultiplyBottom16(word * R,const word * AA,const word * BB)1435 void Baseline_MultiplyBottom16(word *R, const word *AA, const word *BB)
1436 {
1437 	// http://github.com/weidai11/cryptopp/issues/188
1438 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1439 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1440 
1441 	Bot_16
1442 }
1443 
Baseline_MultiplyTop16(word * R,const word * AA,const word * BB,word L)1444 void Baseline_MultiplyTop16(word *R, const word *AA, const word *BB, word L)
1445 {
1446 	// http://github.com/weidai11/cryptopp/issues/188
1447 	MAYBE_CONST word* A = MAYBE_UNCONST_CAST(AA);
1448 	MAYBE_CONST word* B = MAYBE_UNCONST_CAST(BB);
1449 
1450 	Top_Begin(16)
1451 	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1452 	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1453 	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1454 	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1455 	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1456 	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1457 	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1458 	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1459 	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1460 	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1461 	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1462 	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1463 	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1464 	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1465 	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1466 	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1467 	Mul_End(13, 15)
1468 }
1469 #endif
1470 
1471 // ********************************************************
1472 
1473 #if CRYPTOPP_INTEGER_SSE2
1474 
1475 CRYPTOPP_ALIGN_DATA(16)
1476 CRYPTOPP_TABLE
1477 const word32 s_maskLow16[4] = {
1478 	0xffff,0xffff,0xffff,0xffff
1479 };
1480 
1481 #undef Mul_Begin
1482 #undef Mul_Acc
1483 #undef Top_Begin
1484 #undef Top_Acc
1485 #undef Squ_Acc
1486 #undef Squ_NonDiag
1487 #undef Squ_Diag
1488 #undef Squ_SaveAcc
1489 #undef Squ_Begin
1490 #undef Mul_SaveAcc
1491 #undef Bot_Acc
1492 #undef Bot_SaveAcc
1493 #undef Bot_End
1494 #undef Squ_End
1495 #undef Mul_End
1496 
1497 #define SSE2_FinalSave(k)			\
1498 	AS2(	psllq		xmm5, 16)	\
1499 	AS2(	paddq		xmm4, xmm5)	\
1500 	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)
1501 
1502 #define SSE2_SaveShift(k)			\
1503 	AS2(	movq		xmm0, xmm6)	\
1504 	AS2(	punpckhqdq	xmm6, xmm0)	\
1505 	AS2(	movq		xmm1, xmm7)	\
1506 	AS2(	punpckhqdq	xmm7, xmm1)	\
1507 	AS2(	paddd		xmm6, xmm0)	\
1508 	AS2(	pslldq		xmm6, 4)	\
1509 	AS2(	paddd		xmm7, xmm1)	\
1510 	AS2(	paddd		xmm4, xmm6)	\
1511 	AS2(	pslldq		xmm7, 4)	\
1512 	AS2(	movq		xmm6, xmm4)	\
1513 	AS2(	paddd		xmm5, xmm7)	\
1514 	AS2(	movq		xmm7, xmm5)	\
1515 	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1516 	AS2(	psrlq		xmm6, 16)	\
1517 	AS2(	paddq		xmm6, xmm7)	\
1518 	AS2(	punpckhqdq	xmm4, xmm0)	\
1519 	AS2(	punpckhqdq	xmm5, xmm0)	\
1520 	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
1521 	AS2(	psrlq		xmm6, 3*16)	\
1522 	AS2(	paddd		xmm4, xmm6)	\
1523 
1524 #define Squ_SSE2_SaveShift(k)			\
1525 	AS2(	movq		xmm0, xmm6)	\
1526 	AS2(	punpckhqdq	xmm6, xmm0)	\
1527 	AS2(	movq		xmm1, xmm7)	\
1528 	AS2(	punpckhqdq	xmm7, xmm1)	\
1529 	AS2(	paddd		xmm6, xmm0)	\
1530 	AS2(	pslldq		xmm6, 4)	\
1531 	AS2(	paddd		xmm7, xmm1)	\
1532 	AS2(	paddd		xmm4, xmm6)	\
1533 	AS2(	pslldq		xmm7, 4)	\
1534 	AS2(	movhlps		xmm6, xmm4)	\
1535 	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1536 	AS2(	paddd		xmm5, xmm7)	\
1537 	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
1538 	AS2(	psrlq		xmm4, 16)	\
1539 	AS2(	paddq		xmm4, xmm5)	\
1540 	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
1541 	AS2(	psrlq		xmm4, 3*16)	\
1542 	AS2(	paddd		xmm4, xmm6)	\
1543 	AS2(	movq		QWORD PTR [esp+4], xmm4)\
1544 
1545 #define SSE2_FirstMultiply(i)				\
1546 	AS2(	movdqa		xmm7, [esi+(i)*16])\
1547 	AS2(	movdqa		xmm5, [edi-(i)*16])\
1548 	AS2(	pmuludq		xmm5, xmm7)		\
1549 	AS2(	movdqa		xmm4, [ebx])\
1550 	AS2(	movdqa		xmm6, xmm4)		\
1551 	AS2(	pand		xmm4, xmm5)		\
1552 	AS2(	psrld		xmm5, 16)		\
1553 	AS2(	pmuludq		xmm7, [edx-(i)*16])\
1554 	AS2(	pand		xmm6, xmm7)		\
1555 	AS2(	psrld		xmm7, 16)
1556 
1557 #define Squ_Begin(n)							\
1558 	SquPrologue									\
1559 	AS2(	mov		esi, esp)\
1560 	AS2(	and		esp, 0xfffffff0)\
1561 	AS2(	lea		edi, [esp-32*n])\
1562 	AS2(	sub		esp, 32*n+16)\
1563 	AS1(	push	esi)\
1564 	AS2(	mov		esi, edi)					\
1565 	AS2(	xor		edx, edx)					\
1566 	ASL(1)										\
1567 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1568 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1569 	AS2(	movdqa	[edi+2*edx], xmm0)		\
1570 	AS2(	psrlq	xmm0, 32)					\
1571 	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
1572 	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
1573 	AS2(	psrlq	xmm1, 32)					\
1574 	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
1575 	AS2(	add		edx, 16)					\
1576 	AS2(	cmp		edx, 8*(n))					\
1577 	ASJ(	jne,	1, b)						\
1578 	AS2(	lea		edx, [edi+16*n])\
1579 	SSE2_FirstMultiply(0)							\
1580 
1581 #define Squ_Acc(i)								\
1582 	ASL(LSqu##i)								\
1583 	AS2(	movdqa		xmm1, [esi+(i)*16])	\
1584 	AS2(	movdqa		xmm0, [edi-(i)*16])	\
1585 	AS2(	movdqa		xmm2, [ebx])	\
1586 	AS2(	pmuludq		xmm0, xmm1)				\
1587 	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
1588 	AS2(	movdqa		xmm3, xmm2)			\
1589 	AS2(	pand		xmm2, xmm0)			\
1590 	AS2(	psrld		xmm0, 16)			\
1591 	AS2(	paddd		xmm4, xmm2)			\
1592 	AS2(	paddd		xmm5, xmm0)			\
1593 	AS2(	pand		xmm3, xmm1)			\
1594 	AS2(	psrld		xmm1, 16)			\
1595 	AS2(	paddd		xmm6, xmm3)			\
1596 	AS2(	paddd		xmm7, xmm1)		\
1597 
1598 #define Squ_Acc1(i)
1599 #define Squ_Acc2(i)		ASC(call, LSqu##i)
1600 #define Squ_Acc3(i)		Squ_Acc2(i)
1601 #define Squ_Acc4(i)		Squ_Acc2(i)
1602 #define Squ_Acc5(i)		Squ_Acc2(i)
1603 #define Squ_Acc6(i)		Squ_Acc2(i)
1604 #define Squ_Acc7(i)		Squ_Acc2(i)
1605 #define Squ_Acc8(i)		Squ_Acc2(i)
1606 
1607 #define SSE2_End(E, n)					\
1608 	SSE2_SaveShift(2*(n)-3)			\
1609 	AS2(	movdqa		xmm7, [esi+16])	\
1610 	AS2(	movdqa		xmm0, [edi])	\
1611 	AS2(	pmuludq		xmm0, xmm7)				\
1612 	AS2(	movdqa		xmm2, [ebx])		\
1613 	AS2(	pmuludq		xmm7, [edx])	\
1614 	AS2(	movdqa		xmm6, xmm2)				\
1615 	AS2(	pand		xmm2, xmm0)				\
1616 	AS2(	psrld		xmm0, 16)				\
1617 	AS2(	paddd		xmm4, xmm2)				\
1618 	AS2(	paddd		xmm5, xmm0)				\
1619 	AS2(	pand		xmm6, xmm7)				\
1620 	AS2(	psrld		xmm7, 16)	\
1621 	SSE2_SaveShift(2*(n)-2)			\
1622 	SSE2_FinalSave(2*(n)-1)			\
1623 	AS1(	pop		esp)\
1624 	E
1625 
1626 #define Squ_End(n)		SSE2_End(SquEpilogue, n)
1627 #define Mul_End(n)		SSE2_End(MulEpilogue, n)
1628 #define Top_End(n)		SSE2_End(TopEpilogue, n)
1629 
1630 #define Squ_Column1(k, i)	\
1631 	Squ_SSE2_SaveShift(k)					\
1632 	AS2(	add			esi, 16)	\
1633 	SSE2_FirstMultiply(1)\
1634 	Squ_Acc##i(i)	\
1635 	AS2(	paddd		xmm4, xmm4)		\
1636 	AS2(	paddd		xmm5, xmm5)		\
1637 	AS2(	movdqa		xmm3, [esi])				\
1638 	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
1639 	AS2(	pmuludq		xmm1, xmm3)		\
1640 	AS2(	pmuludq		xmm3, xmm3)		\
1641 	AS2(	movdqa		xmm0, [ebx])\
1642 	AS2(	movdqa		xmm2, xmm0)		\
1643 	AS2(	pand		xmm0, xmm1)		\
1644 	AS2(	psrld		xmm1, 16)		\
1645 	AS2(	paddd		xmm6, xmm0)		\
1646 	AS2(	paddd		xmm7, xmm1)		\
1647 	AS2(	pand		xmm2, xmm3)		\
1648 	AS2(	psrld		xmm3, 16)		\
1649 	AS2(	paddd		xmm6, xmm6)		\
1650 	AS2(	paddd		xmm7, xmm7)		\
1651 	AS2(	paddd		xmm4, xmm2)		\
1652 	AS2(	paddd		xmm5, xmm3)		\
1653 	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1654 	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1655 	AS2(	paddd		xmm4, xmm0)\
1656 	AS2(	paddd		xmm5, xmm1)\
1657 
1658 #define Squ_Column0(k, i)	\
1659 	Squ_SSE2_SaveShift(k)					\
1660 	AS2(	add			edi, 16)	\
1661 	AS2(	add			edx, 16)	\
1662 	SSE2_FirstMultiply(1)\
1663 	Squ_Acc##i(i)	\
1664 	AS2(	paddd		xmm6, xmm6)		\
1665 	AS2(	paddd		xmm7, xmm7)		\
1666 	AS2(	paddd		xmm4, xmm4)		\
1667 	AS2(	paddd		xmm5, xmm5)		\
1668 	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1669 	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1670 	AS2(	paddd		xmm4, xmm0)\
1671 	AS2(	paddd		xmm5, xmm1)\
1672 
1673 #define SSE2_MulAdd45						\
1674 	AS2(	movdqa		xmm7, [esi])	\
1675 	AS2(	movdqa		xmm0, [edi])	\
1676 	AS2(	pmuludq		xmm0, xmm7)				\
1677 	AS2(	movdqa		xmm2, [ebx])		\
1678 	AS2(	pmuludq		xmm7, [edx])	\
1679 	AS2(	movdqa		xmm6, xmm2)				\
1680 	AS2(	pand		xmm2, xmm0)				\
1681 	AS2(	psrld		xmm0, 16)				\
1682 	AS2(	paddd		xmm4, xmm2)				\
1683 	AS2(	paddd		xmm5, xmm0)				\
1684 	AS2(	pand		xmm6, xmm7)				\
1685 	AS2(	psrld		xmm7, 16)
1686 
1687 #define Mul_Begin(n)							\
1688 	MulPrologue									\
1689 	AS2(	mov		esi, esp)\
1690 	AS2(	and		esp, 0xfffffff0)\
1691 	AS2(	sub		esp, 48*n+16)\
1692 	AS1(	push	esi)\
1693 	AS2(	xor		edx, edx)					\
1694 	ASL(1)										\
1695 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1696 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1697 	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1698 	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1699 	AS2(	psrlq	xmm0, 32)					\
1700 	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1701 	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1702 	AS2(	psrlq	xmm1, 32)					\
1703 	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1704 	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1705 	AS2(	psrlq	xmm2, 32)					\
1706 	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1707 	AS2(	add		edx, 16)					\
1708 	AS2(	cmp		edx, 8*(n))					\
1709 	ASJ(	jne,	1, b)						\
1710 	AS2(	lea		edi, [esp+20])\
1711 	AS2(	lea		edx, [esp+20+16*n])\
1712 	AS2(	lea		esi, [esp+20+32*n])\
1713 	SSE2_FirstMultiply(0)							\
1714 
1715 #define Mul_Acc(i)								\
1716 	ASL(LMul##i)										\
1717 	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1718 	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1719 	AS2(	movdqa		xmm2, [ebx])	\
1720 	AS2(	pmuludq		xmm0, xmm1)				\
1721 	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1722 	AS2(	movdqa		xmm3, xmm2)			\
1723 	AS2(	pand		xmm2, xmm0)			\
1724 	AS2(	psrld		xmm0, 16)			\
1725 	AS2(	paddd		xmm4, xmm2)			\
1726 	AS2(	paddd		xmm5, xmm0)			\
1727 	AS2(	pand		xmm3, xmm1)			\
1728 	AS2(	psrld		xmm1, 16)			\
1729 	AS2(	paddd		xmm6, xmm3)			\
1730 	AS2(	paddd		xmm7, xmm1)		\
1731 
1732 #define Mul_Acc1(i)
1733 #define Mul_Acc2(i)		ASC(call, LMul##i)
1734 #define Mul_Acc3(i)		Mul_Acc2(i)
1735 #define Mul_Acc4(i)		Mul_Acc2(i)
1736 #define Mul_Acc5(i)		Mul_Acc2(i)
1737 #define Mul_Acc6(i)		Mul_Acc2(i)
1738 #define Mul_Acc7(i)		Mul_Acc2(i)
1739 #define Mul_Acc8(i)		Mul_Acc2(i)
1740 #define Mul_Acc9(i)		Mul_Acc2(i)
1741 #define Mul_Acc10(i)	Mul_Acc2(i)
1742 #define Mul_Acc11(i)	Mul_Acc2(i)
1743 #define Mul_Acc12(i)	Mul_Acc2(i)
1744 #define Mul_Acc13(i)	Mul_Acc2(i)
1745 #define Mul_Acc14(i)	Mul_Acc2(i)
1746 #define Mul_Acc15(i)	Mul_Acc2(i)
1747 #define Mul_Acc16(i)	Mul_Acc2(i)
1748 
1749 #define Mul_Column1(k, i)	\
1750 	SSE2_SaveShift(k)					\
1751 	AS2(	add			esi, 16)	\
1752 	SSE2_MulAdd45\
1753 	Mul_Acc##i(i)	\
1754 
1755 #define Mul_Column0(k, i)	\
1756 	SSE2_SaveShift(k)					\
1757 	AS2(	add			edi, 16)	\
1758 	AS2(	add			edx, 16)	\
1759 	SSE2_MulAdd45\
1760 	Mul_Acc##i(i)	\
1761 
1762 #define Bot_Acc(i)							\
1763 	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1764 	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1765 	AS2(	pmuludq		xmm0, xmm1)				\
1766 	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
1767 	AS2(	paddq		xmm4, xmm0)				\
1768 	AS2(	paddd		xmm6, xmm1)
1769 
1770 #define Bot_SaveAcc(k)					\
1771 	SSE2_SaveShift(k)							\
1772 	AS2(	add			edi, 16)	\
1773 	AS2(	add			edx, 16)	\
1774 	AS2(	movdqa		xmm6, [esi])	\
1775 	AS2(	movdqa		xmm0, [edi])	\
1776 	AS2(	pmuludq		xmm0, xmm6)				\
1777 	AS2(	paddq		xmm4, xmm0)				\
1778 	AS2(	psllq		xmm5, 16)				\
1779 	AS2(	paddq		xmm4, xmm5)				\
1780 	AS2(	pmuludq		xmm6, [edx])
1781 
1782 #define Bot_End(n)							\
1783 	AS2(	movhlps		xmm7, xmm6)			\
1784 	AS2(	paddd		xmm6, xmm7)			\
1785 	AS2(	psllq		xmm6, 32)			\
1786 	AS2(	paddd		xmm4, xmm6)			\
1787 	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
1788 	AS1(	pop		esp)\
1789 	MulEpilogue
1790 
1791 #define Top_Begin(n)							\
1792 	TopPrologue									\
1793 	AS2(	mov		edx, esp)\
1794 	AS2(	and		esp, 0xfffffff0)\
1795 	AS2(	sub		esp, 48*n+16)\
1796 	AS1(	push	edx)\
1797 	AS2(	xor		edx, edx)					\
1798 	ASL(1)										\
1799 	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1800 	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1801 	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1802 	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1803 	AS2(	psrlq	xmm0, 32)					\
1804 	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1805 	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1806 	AS2(	psrlq	xmm1, 32)					\
1807 	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1808 	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1809 	AS2(	psrlq	xmm2, 32)					\
1810 	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1811 	AS2(	add		edx, 16)					\
1812 	AS2(	cmp		edx, 8*(n))					\
1813 	ASJ(	jne,	1, b)						\
1814 	AS2(	mov		eax, esi)					\
1815 	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
1816 	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
1817 	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
1818 	AS2(	pxor	xmm4, xmm4)\
1819 	AS2(	pxor	xmm5, xmm5)
1820 
1821 #define Top_Acc(i)							\
1822 	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
1823 	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1824 	AS2(	psrlq		xmm0, 48)				\
1825 	AS2(	paddd		xmm5, xmm0)\
1826 
1827 #define Top_Column0(i)	\
1828 	AS2(	psllq		xmm5, 32)				\
1829 	AS2(	add			edi, 16)	\
1830 	AS2(	add			edx, 16)	\
1831 	SSE2_MulAdd45\
1832 	Mul_Acc##i(i)	\
1833 
1834 #define Top_Column1(i)	\
1835 	SSE2_SaveShift(0)					\
1836 	AS2(	add			esi, 16)	\
1837 	SSE2_MulAdd45\
1838 	Mul_Acc##i(i)	\
1839 	AS2(	shr			eax, 16)	\
1840 	AS2(	movd		xmm0, eax)\
1841 	AS2(	movd		xmm1, [ecx+4])\
1842 	AS2(	psrld		xmm1, 16)\
1843 	AS2(	pcmpgtd		xmm1, xmm0)\
1844 	AS2(	psrld		xmm1, 31)\
1845 	AS2(	paddd		xmm4, xmm1)\
1846 
SSE2_Square4(word * C,const word * A)1847 void SSE2_Square4(word *C, const word *A)
1848 {
1849 	Squ_Begin(2)
1850 	Squ_Column0(0, 1)
1851 	Squ_End(2)
1852 }
1853 
SSE2_Square8(word * C,const word * A)1854 void SSE2_Square8(word *C, const word *A)
1855 {
1856 	Squ_Begin(4)
1857 #ifndef __GNUC__
1858 	ASJ(	jmp,	0, f)
1859 	Squ_Acc(2)
1860 	AS1(	ret) ASL(0)
1861 #endif
1862 	Squ_Column0(0, 1)
1863 	Squ_Column1(1, 1)
1864 	Squ_Column0(2, 2)
1865 	Squ_Column1(3, 1)
1866 	Squ_Column0(4, 1)
1867 	Squ_End(4)
1868 }
1869 
SSE2_Square16(word * C,const word * A)1870 void SSE2_Square16(word *C, const word *A)
1871 {
1872 	Squ_Begin(8)
1873 #ifndef __GNUC__
1874 	ASJ(	jmp,	0, f)
1875 	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1876 	AS1(	ret) ASL(0)
1877 #endif
1878 	Squ_Column0(0, 1)
1879 	Squ_Column1(1, 1)
1880 	Squ_Column0(2, 2)
1881 	Squ_Column1(3, 2)
1882 	Squ_Column0(4, 3)
1883 	Squ_Column1(5, 3)
1884 	Squ_Column0(6, 4)
1885 	Squ_Column1(7, 3)
1886 	Squ_Column0(8, 3)
1887 	Squ_Column1(9, 2)
1888 	Squ_Column0(10, 2)
1889 	Squ_Column1(11, 1)
1890 	Squ_Column0(12, 1)
1891 	Squ_End(8)
1892 }
1893 
SSE2_Square32(word * C,const word * A)1894 void SSE2_Square32(word *C, const word *A)
1895 {
1896 	Squ_Begin(16)
1897 	ASJ(	jmp,	0, f)
1898 	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1899 	AS1(	ret) ASL(0)
1900 	Squ_Column0(0, 1)
1901 	Squ_Column1(1, 1)
1902 	Squ_Column0(2, 2)
1903 	Squ_Column1(3, 2)
1904 	Squ_Column0(4, 3)
1905 	Squ_Column1(5, 3)
1906 	Squ_Column0(6, 4)
1907 	Squ_Column1(7, 4)
1908 	Squ_Column0(8, 5)
1909 	Squ_Column1(9, 5)
1910 	Squ_Column0(10, 6)
1911 	Squ_Column1(11, 6)
1912 	Squ_Column0(12, 7)
1913 	Squ_Column1(13, 7)
1914 	Squ_Column0(14, 8)
1915 	Squ_Column1(15, 7)
1916 	Squ_Column0(16, 7)
1917 	Squ_Column1(17, 6)
1918 	Squ_Column0(18, 6)
1919 	Squ_Column1(19, 5)
1920 	Squ_Column0(20, 5)
1921 	Squ_Column1(21, 4)
1922 	Squ_Column0(22, 4)
1923 	Squ_Column1(23, 3)
1924 	Squ_Column0(24, 3)
1925 	Squ_Column1(25, 2)
1926 	Squ_Column0(26, 2)
1927 	Squ_Column1(27, 1)
1928 	Squ_Column0(28, 1)
1929 	Squ_End(16)
1930 }
1931 
SSE2_Multiply4(word * C,const word * A,const word * B)1932 void SSE2_Multiply4(word *C, const word *A, const word *B)
1933 {
1934 	Mul_Begin(2)
1935 #ifndef __GNUC__
1936 	ASJ(	jmp,	0, f)
1937 	Mul_Acc(2)
1938 	AS1(	ret) ASL(0)
1939 #endif
1940 	Mul_Column0(0, 2)
1941 	Mul_End(2)
1942 }
1943 
SSE2_Multiply8(word * C,const word * A,const word * B)1944 void SSE2_Multiply8(word *C, const word *A, const word *B)
1945 {
1946 	Mul_Begin(4)
1947 #ifndef __GNUC__
1948 	ASJ(	jmp,	0, f)
1949 	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1950 	AS1(	ret) ASL(0)
1951 #endif
1952 	Mul_Column0(0, 2)
1953 	Mul_Column1(1, 3)
1954 	Mul_Column0(2, 4)
1955 	Mul_Column1(3, 3)
1956 	Mul_Column0(4, 2)
1957 	Mul_End(4)
1958 }
1959 
SSE2_Multiply16(word * C,const word * A,const word * B)1960 void SSE2_Multiply16(word *C, const word *A, const word *B)
1961 {
1962 	Mul_Begin(8)
1963 #ifndef __GNUC__
1964 	ASJ(	jmp,	0, f)
1965 	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1966 	AS1(	ret) ASL(0)
1967 #endif
1968 	Mul_Column0(0, 2)
1969 	Mul_Column1(1, 3)
1970 	Mul_Column0(2, 4)
1971 	Mul_Column1(3, 5)
1972 	Mul_Column0(4, 6)
1973 	Mul_Column1(5, 7)
1974 	Mul_Column0(6, 8)
1975 	Mul_Column1(7, 7)
1976 	Mul_Column0(8, 6)
1977 	Mul_Column1(9, 5)
1978 	Mul_Column0(10, 4)
1979 	Mul_Column1(11, 3)
1980 	Mul_Column0(12, 2)
1981 	Mul_End(8)
1982 }
1983 
SSE2_Multiply32(word * C,const word * A,const word * B)1984 void SSE2_Multiply32(word *C, const word *A, const word *B)
1985 {
1986 	Mul_Begin(16)
1987 	ASJ(	jmp,	0, f)
1988 	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1989 	AS1(	ret) ASL(0)
1990 	Mul_Column0(0, 2)
1991 	Mul_Column1(1, 3)
1992 	Mul_Column0(2, 4)
1993 	Mul_Column1(3, 5)
1994 	Mul_Column0(4, 6)
1995 	Mul_Column1(5, 7)
1996 	Mul_Column0(6, 8)
1997 	Mul_Column1(7, 9)
1998 	Mul_Column0(8, 10)
1999 	Mul_Column1(9, 11)
2000 	Mul_Column0(10, 12)
2001 	Mul_Column1(11, 13)
2002 	Mul_Column0(12, 14)
2003 	Mul_Column1(13, 15)
2004 	Mul_Column0(14, 16)
2005 	Mul_Column1(15, 15)
2006 	Mul_Column0(16, 14)
2007 	Mul_Column1(17, 13)
2008 	Mul_Column0(18, 12)
2009 	Mul_Column1(19, 11)
2010 	Mul_Column0(20, 10)
2011 	Mul_Column1(21, 9)
2012 	Mul_Column0(22, 8)
2013 	Mul_Column1(23, 7)
2014 	Mul_Column0(24, 6)
2015 	Mul_Column1(25, 5)
2016 	Mul_Column0(26, 4)
2017 	Mul_Column1(27, 3)
2018 	Mul_Column0(28, 2)
2019 	Mul_End(16)
2020 }
2021 
SSE2_MultiplyBottom4(word * C,const word * A,const word * B)2022 void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
2023 {
2024 	Mul_Begin(2)
2025 	Bot_SaveAcc(0) Bot_Acc(2)
2026 	Bot_End(2)
2027 }
2028 
SSE2_MultiplyBottom8(word * C,const word * A,const word * B)2029 void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
2030 {
2031 	Mul_Begin(4)
2032 #ifndef __GNUC__
2033 	ASJ(	jmp,	0, f)
2034 	Mul_Acc(3) Mul_Acc(2)
2035 	AS1(	ret) ASL(0)
2036 #endif
2037 	Mul_Column0(0, 2)
2038 	Mul_Column1(1, 3)
2039 	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2040 	Bot_End(4)
2041 }
2042 
SSE2_MultiplyBottom16(word * C,const word * A,const word * B)2043 void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
2044 {
2045 	Mul_Begin(8)
2046 #ifndef __GNUC__
2047 	ASJ(	jmp,	0, f)
2048 	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2049 	AS1(	ret) ASL(0)
2050 #endif
2051 	Mul_Column0(0, 2)
2052 	Mul_Column1(1, 3)
2053 	Mul_Column0(2, 4)
2054 	Mul_Column1(3, 5)
2055 	Mul_Column0(4, 6)
2056 	Mul_Column1(5, 7)
2057 	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2058 	Bot_End(8)
2059 }
2060 
SSE2_MultiplyBottom32(word * C,const word * A,const word * B)2061 void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
2062 {
2063 	Mul_Begin(16)
2064 #ifndef __GNUC__
2065 	ASJ(	jmp,	0, f)
2066 	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2067 	AS1(	ret) ASL(0)
2068 #endif
2069 	Mul_Column0(0, 2)
2070 	Mul_Column1(1, 3)
2071 	Mul_Column0(2, 4)
2072 	Mul_Column1(3, 5)
2073 	Mul_Column0(4, 6)
2074 	Mul_Column1(5, 7)
2075 	Mul_Column0(6, 8)
2076 	Mul_Column1(7, 9)
2077 	Mul_Column0(8, 10)
2078 	Mul_Column1(9, 11)
2079 	Mul_Column0(10, 12)
2080 	Mul_Column1(11, 13)
2081 	Mul_Column0(12, 14)
2082 	Mul_Column1(13, 15)
2083 	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
2084 	Bot_End(16)
2085 }
2086 
SSE2_MultiplyTop8(word * C,const word * A,const word * B,word L)2087 void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
2088 {
2089 	Top_Begin(4)
2090 	Top_Acc(3) Top_Acc(2) Top_Acc(1)
2091 #ifndef __GNUC__
2092 	ASJ(	jmp,	0, f)
2093 	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2094 	AS1(	ret) ASL(0)
2095 #endif
2096 	Top_Column0(4)
2097 	Top_Column1(3)
2098 	Mul_Column0(0, 2)
2099 	Top_End(2)
2100 }
2101 
SSE2_MultiplyTop16(word * C,const word * A,const word * B,word L)2102 void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
2103 {
2104 	Top_Begin(8)
2105 	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
2106 #ifndef __GNUC__
2107 	ASJ(	jmp,	0, f)
2108 	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2109 	AS1(	ret) ASL(0)
2110 #endif
2111 	Top_Column0(8)
2112 	Top_Column1(7)
2113 	Mul_Column0(0, 6)
2114 	Mul_Column1(1, 5)
2115 	Mul_Column0(2, 4)
2116 	Mul_Column1(3, 3)
2117 	Mul_Column0(4, 2)
2118 	Top_End(4)
2119 }
2120 
SSE2_MultiplyTop32(word * C,const word * A,const word * B,word L)2121 void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
2122 {
2123 	Top_Begin(16)
2124 	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
2125 #ifndef __GNUC__
2126 	ASJ(	jmp,	0, f)
2127 	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
2128 	AS1(	ret) ASL(0)
2129 #endif
2130 	Top_Column0(16)
2131 	Top_Column1(15)
2132 	Mul_Column0(0, 14)
2133 	Mul_Column1(1, 13)
2134 	Mul_Column0(2, 12)
2135 	Mul_Column1(3, 11)
2136 	Mul_Column0(4, 10)
2137 	Mul_Column1(5, 9)
2138 	Mul_Column0(6, 8)
2139 	Mul_Column1(7, 7)
2140 	Mul_Column0(8, 6)
2141 	Mul_Column1(9, 5)
2142 	Mul_Column0(10, 4)
2143 	Mul_Column1(11, 3)
2144 	Mul_Column0(12, 2)
2145 	Top_End(8)
2146 }
2147 
2148 #endif	// #if CRYPTOPP_INTEGER_SSE2
2149 
2150 // ********************************************************
2151 
2152 typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
2153 typedef void (* PMul)(word *C, const word *A, const word *B);
2154 typedef void (* PSqu)(word *C, const word *A);
2155 typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
2156 
2157 #if CRYPTOPP_INTEGER_SSE2
2158 static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
2159 static size_t s_recursionLimit = 8;
2160 #else
2161 static const size_t s_recursionLimit = 16;
2162 #endif  // CRYPTOPP_INTEGER_SSE2
2163 
2164 static PMul s_pMul[9], s_pBot[9];
2165 static PSqu s_pSqu[9];
2166 static PMulTop s_pTop[9];
2167 
SetFunctionPointers()2168 void SetFunctionPointers()
2169 {
2170 	s_pMul[0] = &Baseline_Multiply2;
2171 	s_pBot[0] = &Baseline_MultiplyBottom2;
2172 	s_pSqu[0] = &Baseline_Square2;
2173 	s_pTop[0] = &Baseline_MultiplyTop2;
2174 	s_pTop[1] = &Baseline_MultiplyTop4;
2175 
2176 #if CRYPTOPP_INTEGER_SSE2
2177 	if (HasSSE2())
2178 	{
2179 		if (IsP4())
2180 		{
2181 			s_pAdd = &SSE2_Add;
2182 			s_pSub = &SSE2_Sub;
2183 		}
2184 
2185 		s_recursionLimit = 32;
2186 
2187 		s_pMul[1] = &SSE2_Multiply4;
2188 		s_pMul[2] = &SSE2_Multiply8;
2189 		s_pMul[4] = &SSE2_Multiply16;
2190 		s_pMul[8] = &SSE2_Multiply32;
2191 
2192 		s_pBot[1] = &SSE2_MultiplyBottom4;
2193 		s_pBot[2] = &SSE2_MultiplyBottom8;
2194 		s_pBot[4] = &SSE2_MultiplyBottom16;
2195 		s_pBot[8] = &SSE2_MultiplyBottom32;
2196 
2197 		s_pSqu[1] = &SSE2_Square4;
2198 		s_pSqu[2] = &SSE2_Square8;
2199 		s_pSqu[4] = &SSE2_Square16;
2200 		s_pSqu[8] = &SSE2_Square32;
2201 
2202 		s_pTop[2] = &SSE2_MultiplyTop8;
2203 		s_pTop[4] = &SSE2_MultiplyTop16;
2204 		s_pTop[8] = &SSE2_MultiplyTop32;
2205 	}
2206 	else
2207 #endif  // CRYPTOPP_INTEGER_SSE2
2208 	{
2209 		s_pMul[1] = &Baseline_Multiply4;
2210 		s_pMul[2] = &Baseline_Multiply8;
2211 
2212 		s_pBot[1] = &Baseline_MultiplyBottom4;
2213 		s_pBot[2] = &Baseline_MultiplyBottom8;
2214 
2215 		s_pSqu[1] = &Baseline_Square4;
2216 		s_pSqu[2] = &Baseline_Square8;
2217 
2218 		s_pTop[2] = &Baseline_MultiplyTop8;
2219 
2220 #if	!CRYPTOPP_INTEGER_SSE2
2221 		s_pMul[4] = &Baseline_Multiply16;
2222 		s_pBot[4] = &Baseline_MultiplyBottom16;
2223 		s_pSqu[4] = &Baseline_Square16;
2224 		s_pTop[4] = &Baseline_MultiplyTop16;
2225 #endif  // !CRYPTOPP_INTEGER_SSE2
2226 	}
2227 }
2228 
Add(word * C,const word * A,const word * B,size_t N)2229 inline int Add(word *C, const word *A, const word *B, size_t N)
2230 {
2231 #if CRYPTOPP_INTEGER_SSE2
2232 	return s_pAdd(N, C, A, B);
2233 #else
2234 	return Baseline_Add(N, C, A, B);
2235 #endif  // CRYPTOPP_INTEGER_SSE2
2236 }
2237 
Subtract(word * C,const word * A,const word * B,size_t N)2238 inline int Subtract(word *C, const word *A, const word *B, size_t N)
2239 {
2240 #if CRYPTOPP_INTEGER_SSE2
2241 	return s_pSub(N, C, A, B);
2242 #else
2243 	return Baseline_Sub(N, C, A, B);
2244 #endif  // CRYPTOPP_INTEGER_SSE2
2245 }
2246 
2247 // ********************************************************
2248 
2249 
2250 #define A0		A
2251 #define A1		(A+N2)
2252 #define B0		B
2253 #define B1		(B+N2)
2254 
2255 #define T0		T
2256 #define T1		(T+N2)
2257 #define T2		(T+N)
2258 #define T3		(T+N+N2)
2259 
2260 #define R0		R
2261 #define R1		(R+N2)
2262 #define R2		(R+N)
2263 #define R3		(R+N+N2)
2264 
2265 // R[2*N] - result = A*B
2266 // T[2*N] - temporary work space
2267 // A[N] --- multiplier
2268 // B[N] --- multiplicant
2269 
RecursiveMultiply(word * R,word * T,const word * A,const word * B,size_t N)2270 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
2271 {
2272 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2273 
2274 	if (N <= s_recursionLimit)
2275 		s_pMul[N/4](R, A, B);
2276 	else
2277 	{
2278 		const size_t N2 = N/2;
2279 
2280 		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2281 		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2282 
2283 		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2284 		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2285 
2286 		RecursiveMultiply(R2, T2, A1, B1, N2);
2287 		RecursiveMultiply(T0, T2, R0, R1, N2);
2288 		RecursiveMultiply(R0, T2, A0, B0, N2);
2289 
2290 		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2291 
2292 		int c2 = Add(R2, R2, R1, N2);
2293 		int c3 = c2;
2294 		c2 += Add(R1, R2, R0, N2);
2295 		c3 += Add(R2, R2, R3, N2);
2296 
2297 		if (AN2 == BN2)
2298 			c3 -= Subtract(R1, R1, T0, N);
2299 		else
2300 			c3 += Add(R1, R1, T0, N);
2301 
2302 		c3 += Increment(R2, N2, c2);
2303 		CRYPTOPP_ASSERT (c3 >= 0 && c3 <= 2);
2304 		Increment(R3, N2, c3);
2305 	}
2306 }
2307 
2308 // R[2*N] - result = A*A
2309 // T[2*N] - temporary work space
2310 // A[N] --- number to be squared
2311 
RecursiveSquare(word * R,word * T,const word * A,size_t N)2312 void RecursiveSquare(word *R, word *T, const word *A, size_t N)
2313 {
2314 	CRYPTOPP_ASSERT(N && N%2==0);
2315 
2316 	if (N <= s_recursionLimit)
2317 		s_pSqu[N/4](R, A);
2318 	else
2319 	{
2320 		const size_t N2 = N/2;
2321 
2322 		RecursiveSquare(R0, T2, A0, N2);
2323 		RecursiveSquare(R2, T2, A1, N2);
2324 		RecursiveMultiply(T0, T2, A0, A1, N2);
2325 
2326 		int carry = Add(R1, R1, T0, N);
2327 		carry += Add(R1, R1, T0, N);
2328 		Increment(R3, N2, carry);
2329 	}
2330 }
2331 
2332 // R[N] - bottom half of A*B
2333 // T[3*N/2] - temporary work space
2334 // A[N] - multiplier
2335 // B[N] - multiplicant
2336 
RecursiveMultiplyBottom(word * R,word * T,const word * A,const word * B,size_t N)2337 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2338 {
2339 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2340 
2341 	if (N <= s_recursionLimit)
2342 		s_pBot[N/4](R, A, B);
2343 	else
2344 	{
2345 		const size_t N2 = N/2;
2346 
2347 		RecursiveMultiply(R, T, A0, B0, N2);
2348 		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2349 		Add(R1, R1, T0, N2);
2350 		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2351 		Add(R1, R1, T0, N2);
2352 	}
2353 }
2354 
2355 // R[N] --- upper half of A*B
2356 // T[2*N] - temporary work space
2357 // L[N] --- lower half of A*B
2358 // A[N] --- multiplier
2359 // B[N] --- multiplicant
2360 
MultiplyTop(word * R,word * T,const word * L,const word * A,const word * B,size_t N)2361 void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
2362 {
2363 	CRYPTOPP_ASSERT(N>=2 && N%2==0);
2364 
2365 	if (N <= s_recursionLimit)
2366 		s_pTop[N/4](R, A, B, L[N-1]);
2367 	else
2368 	{
2369 		const size_t N2 = N/2;
2370 
2371 		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2372 		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2373 
2374 		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2375 		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2376 
2377 		RecursiveMultiply(T0, T2, R0, R1, N2);
2378 		RecursiveMultiply(R0, T2, A1, B1, N2);
2379 
2380 		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
2381 
2382 		int t, c3;
2383 		int c2 = Subtract(T2, L+N2, L, N2);
2384 
2385 		if (AN2 == BN2)
2386 		{
2387 			c2 -= Add(T2, T2, T0, N2);
2388 			t = (Compare(T2, R0, N2) == -1);
2389 			c3 = t - Subtract(T2, T2, T1, N2);
2390 		}
2391 		else
2392 		{
2393 			c2 += Subtract(T2, T2, T0, N2);
2394 			t = (Compare(T2, R0, N2) == -1);
2395 			c3 = t + Add(T2, T2, T1, N2);
2396 		}
2397 
2398 		c2 += t;
2399 		if (c2 >= 0)
2400 			c3 += Increment(T2, N2, c2);
2401 		else
2402 			c3 -= Decrement(T2, N2, -c2);
2403 		c3 += Add(R0, T2, R1, N2);
2404 
2405 		CRYPTOPP_ASSERT (c3 >= 0 && c3 <= 2);
2406 		Increment(R1, N2, c3);
2407 	}
2408 }
2409 
Multiply(word * R,word * T,const word * A,const word * B,size_t N)2410 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
2411 {
2412 	RecursiveMultiply(R, T, A, B, N);
2413 }
2414 
Square(word * R,word * T,const word * A,size_t N)2415 inline void Square(word *R, word *T, const word *A, size_t N)
2416 {
2417 	RecursiveSquare(R, T, A, N);
2418 }
2419 
MultiplyBottom(word * R,word * T,const word * A,const word * B,size_t N)2420 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2421 {
2422 	RecursiveMultiplyBottom(R, T, A, B, N);
2423 }
2424 
2425 // R[NA+NB] - result = A*B
2426 // T[NA+NB] - temporary work space
2427 // A[NA] ---- multiplier
2428 // B[NB] ---- multiplicant
2429 
AsymmetricMultiply(word * R,word * T,const word * A,size_t NA,const word * B,size_t NB)2430 void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
2431 {
2432 	if (NA == NB)
2433 	{
2434 		// Profiling guided the flow below.
2435 		if (A != B)
2436 			Multiply(R, T, A, B, NA);
2437 		else
2438 			Square(R, T, A, NA);
2439 
2440 		return;
2441 	}
2442 
2443 	if (NA > NB)
2444 	{
2445 		std::swap(A, B);
2446 		std::swap(NA, NB);
2447 	}
2448 
2449 	CRYPTOPP_ASSERT(NB % NA == 0);
2450 
2451 	if (NA==2 && !A[1])
2452 	{
2453 		// Profiling guided the flow below.
2454 		switch (A[0])
2455 		{
2456 		default:
2457 			R[NB] = LinearMultiply(R, B, A[0], NB);
2458 			R[NB+1] = 0;
2459 			return;
2460 		case 0:
2461 			SetWords(R, 0, NB+2);
2462 			return;
2463 		case 1:
2464 			CopyWords(R, B, NB);
2465 			R[NB] = R[NB+1] = 0;
2466 			return;
2467 		}
2468 	}
2469 
2470 	size_t i;
2471 	if ((NB/NA)%2 == 0)
2472 	{
2473 		Multiply(R, T, A, B, NA);
2474 		CopyWords(T+2*NA, R+NA, NA);
2475 
2476 		for (i=2*NA; i<NB; i+=2*NA)
2477 			Multiply(T+NA+i, T, A, B+i, NA);
2478 		for (i=NA; i<NB; i+=2*NA)
2479 			Multiply(R+i, T, A, B+i, NA);
2480 	}
2481 	else
2482 	{
2483 		for (i=0; i<NB; i+=2*NA)
2484 			Multiply(R+i, T, A, B+i, NA);
2485 		for (i=NA; i<NB; i+=2*NA)
2486 			Multiply(T+NA+i, T, A, B+i, NA);
2487 	}
2488 
2489 	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2490 		Increment(R+NB, NA);
2491 }
2492 
2493 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2494 // T[3*N/2] - temporary work space
2495 // A[N] ----- an odd number as input
2496 
RecursiveInverseModPower2(word * R,word * T,const word * A,size_t N)2497 void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
2498 {
2499 	// Profiling guided the flow below.
2500 	if (N!=2)
2501 	{
2502 		const size_t N2 = N/2;
2503 		RecursiveInverseModPower2(R0, T0, A0, N2);
2504 		T0[0] = 1;
2505 		SetWords(T0+1, 0, N2-1);
2506 		MultiplyTop(R1, T1, T0, R0, A0, N2);
2507 		MultiplyBottom(T0, T1, R0, A1, N2);
2508 		Add(T0, R1, T0, N2);
2509 		TwosComplement(T0, N2);
2510 		MultiplyBottom(R1, T1, R0, T0, N2);
2511 	}
2512 	else
2513 	{
2514 		T[0] = AtomicInverseModPower2(A[0]);
2515 		T[1] = 0;
2516 		s_pBot[0](T+2, T, A);
2517 		TwosComplement(T+2, 2);
2518 		Increment(T+2, 2, 2);
2519 		s_pBot[0](R, T, T+2);
2520 	}
2521 }
2522 
2523 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2524 // T[3*N] - temporary work space
2525 // X[2*N] - number to be reduced
2526 // M[N] --- modulus
2527 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2528 
MontgomeryReduce(word * R,word * T,word * X,const word * M,const word * U,size_t N)2529 void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
2530 {
2531 #if 1
2532 	MultiplyBottom(R, T, X, U, N);
2533 	MultiplyTop(T, T+N, X, R, M, N);
2534 	word borrow = Subtract(T, X+N, T, N);
2535 	// defend against timing attack by doing this Add even when not needed
2536 	word carry = Add(T+N, T, M, N);
2537 	CRYPTOPP_ASSERT(carry | !borrow);
2538 	CRYPTOPP_UNUSED(carry), CRYPTOPP_UNUSED(borrow);
2539 	CopyWords(R, T + ((0-borrow) & N), N);
2540 #elif 0
2541 	const word u = 0-U[0];
2542 	Declare2Words(p)
2543 	for (size_t i=0; i<N; i++)
2544 	{
2545 		const word t = u * X[i];
2546 		word c = 0;
2547 		for (size_t j=0; j<N; j+=2)
2548 		{
2549 			MultiplyWords(p, t, M[j]);
2550 			Acc2WordsBy1(p, X[i+j]);
2551 			Acc2WordsBy1(p, c);
2552 			X[i+j] = LowWord(p);
2553 			c = HighWord(p);
2554 			MultiplyWords(p, t, M[j+1]);
2555 			Acc2WordsBy1(p, X[i+j+1]);
2556 			Acc2WordsBy1(p, c);
2557 			X[i+j+1] = LowWord(p);
2558 			c = HighWord(p);
2559 		}
2560 
2561 		if (Increment(X+N+i, N-i, c))
2562 			while (!Subtract(X+N, X+N, M, N)) {}
2563 	}
2564 
2565 	memcpy(R, X+N, N*WORD_SIZE);
2566 #else
2567 	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
2568 	for (size_t i=0; i<N; i++)
2569 	{
2570 		__m64 t = _mm_cvtsi32_si64(X[i]);
2571 		t = _mm_mul_su32(t, u);
2572 		__m64 c = _mm_setzero_si64();
2573 		for (size_t j=0; j<N; j+=2)
2574 		{
2575 			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
2576 			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
2577 			c = _mm_add_si64(c, p);
2578 			X[i+j] = _mm_cvtsi64_si32(c);
2579 			c = _mm_srli_si64(c, 32);
2580 			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
2581 			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
2582 			c = _mm_add_si64(c, p);
2583 			X[i+j+1] = _mm_cvtsi64_si32(c);
2584 			c = _mm_srli_si64(c, 32);
2585 		}
2586 
2587 		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
2588 			while (!Subtract(X+N, X+N, M, N)) {}
2589 	}
2590 
2591 	memcpy(R, X+N, N*WORD_SIZE);
2592 	_mm_empty();
2593 #endif
2594 }
2595 
2596 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2597 // T[2*N] - temporary work space
2598 // X[2*N] - number to be reduced
2599 // M[N] --- modulus
2600 // U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2601 // V[N] --- 2**(WORD_BITS*3*N/2) mod M
2602 
HalfMontgomeryReduce(word * R,word * T,const word * X,const word * M,const word * U,const word * V,size_t N)2603 void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
2604 {
2605 	CRYPTOPP_ASSERT(N%2==0 && N>=4);
2606 
2607 #define M0		M
2608 #define M1		(M+N2)
2609 #define V0		V
2610 #define V1		(V+N2)
2611 
2612 #define X0		X
2613 #define X1		(X+N2)
2614 #define X2		(X+N)
2615 #define X3		(X+N+N2)
2616 
2617 	const size_t N2 = N/2;
2618 	Multiply(T0, T2, V0, X3, N2);
2619 	int c2 = Add(T0, T0, X0, N);
2620 	MultiplyBottom(T3, T2, T0, U, N2);
2621 	MultiplyTop(T2, R, T0, T3, M0, N2);
2622 	c2 -= Subtract(T2, T1, T2, N2);
2623 	Multiply(T0, R, T3, M1, N2);
2624 	c2 -= Subtract(T0, T2, T0, N2);
2625 	int c3 = -(int)Subtract(T1, X2, T1, N2);
2626 	Multiply(R0, T2, V1, X3, N2);
2627 	c3 += Add(R, R, T, N);
2628 
2629 	if (c2>0)
2630 		c3 += Increment(R1, N2);
2631 	else if (c2<0)
2632 		c3 -= Decrement(R1, N2, -c2);
2633 
2634 	CRYPTOPP_ASSERT(c3>=-1 && c3<=1);
2635 	if (c3>0)
2636 		Subtract(R, R, M, N);
2637 	else if (c3<0)
2638 		Add(R, R, M, N);
2639 
2640 #undef M0
2641 #undef M1
2642 #undef V0
2643 #undef V1
2644 
2645 #undef X0
2646 #undef X1
2647 #undef X2
2648 #undef X3
2649 }
2650 
2651 #undef A0
2652 #undef A1
2653 #undef B0
2654 #undef B1
2655 
2656 #undef T0
2657 #undef T1
2658 #undef T2
2659 #undef T3
2660 
2661 #undef R0
2662 #undef R1
2663 #undef R2
2664 #undef R3
2665 
2666 /*
2667 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2668 static word SubatomicDivide(word *A, word B0, word B1)
2669 {
2670 	// Assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2671 	CRYPTOPP_ASSERT(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2672 
2673 	// estimate the quotient: do a 2 word by 1 word divide
2674 	word Q;
2675 	if (B1+1 == 0)
2676 		Q = A[2];
2677 	else
2678 		Q = DWord(A[1], A[2]).DividedBy(B1+1);
2679 
2680 	// now subtract Q*B from A
2681 	DWord p = DWord::Multiply(B0, Q);
2682 	DWord u = (DWord) A[0] - p.GetLowHalf();
2683 	A[0] = u.GetLowHalf();
2684 	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
2685 	A[1] = u.GetLowHalf();
2686 	A[2] += u.GetHighHalf();
2687 
2688 	// Q <= actual quotient, so fix it
2689 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2690 	{
2691 		u = (DWord) A[0] - B0;
2692 		A[0] = u.GetLowHalf();
2693 		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
2694 		A[1] = u.GetLowHalf();
2695 		A[2] += u.GetHighHalf();
2696 		Q++;
2697 		CRYPTOPP_ASSERT(Q);	// shouldn't overflow
2698 	}
2699 
2700 	return Q;
2701 }
2702 
2703 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2704 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2705 {
2706 	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2707 	{
2708 		Q[0] = A[2];
2709 		Q[1] = A[3];
2710 	}
2711 	else
2712 	{
2713 		word T[4];
2714 		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2715 		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2716 		Q[0] = SubatomicDivide(T, B[0], B[1]);
2717 
2718 #if defined(CRYPTOPP_DEBUG)
2719 		// multiply quotient and divisor and add remainder, make sure it equals dividend
2720 		CRYPTOPP_ASSERT(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2721 		word P[4];
2722 		LowLevel::Multiply2(P, Q, B);
2723 		Add(P, P, T, 4);
2724 		CRYPTOPP_ASSERT(memcmp(P, A, 4*WORD_SIZE)==0);
2725 #endif
2726 	}
2727 }
2728 */
2729 
AtomicDivide(word * Q,const word * A,const word * B)2730 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2731 {
2732 	word T[4];
2733 	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
2734 	Q[0] = q.GetLowHalf();
2735 	Q[1] = q.GetHighHalf();
2736 
2737 #if defined(CRYPTOPP_DEBUG)
2738 	if (B[0] || B[1])
2739 	{
2740 		// multiply quotient and divisor and add remainder, make sure it equals dividend
2741 		CRYPTOPP_ASSERT(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2742 		word P[4];
2743 		s_pMul[0](P, Q, B);
2744 		Add(P, P, T, 4);
2745 		CRYPTOPP_ASSERT(memcmp(P, A, 4*WORD_SIZE)==0);
2746 	}
2747 #endif
2748 }
2749 
2750 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
CorrectQuotientEstimate(word * R,word * T,word * Q,const word * B,size_t N)2751 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
2752 {
2753 	CRYPTOPP_ASSERT(N && N%2==0);
2754 
2755 	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
2756 
2757 	word borrow = Subtract(R, R, T, N+2);
2758 	CRYPTOPP_ASSERT(!borrow && !R[N+1]);
2759 	CRYPTOPP_UNUSED(borrow);
2760 
2761 	while (R[N] || Compare(R, B, N) >= 0)
2762 	{
2763 		R[N] -= Subtract(R, R, B, N);
2764 		Q[1] += (++Q[0]==0);
2765 		CRYPTOPP_ASSERT(Q[0] || Q[1]); // no overflow
2766 	}
2767 }
2768 
2769 // R[NB] -------- remainder = A%B
2770 // Q[NA-NB+2] --- quotient	= A/B
2771 // T[NA+3*(NB+2)] - temp work space
2772 // A[NA] -------- dividend
2773 // B[NB] -------- divisor
2774 
Divide(word * R,word * Q,word * T,const word * A,size_t NA,const word * B,size_t NB)2775 void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
2776 {
2777 	CRYPTOPP_ASSERT(NA && NB && NA%2==0 && NB%2==0);
2778 	CRYPTOPP_ASSERT(B[NB-1] || B[NB-2]);
2779 	CRYPTOPP_ASSERT(NB <= NA);
2780 
2781 	// set up temporary work space
2782 	word *const TA=T;
2783 	word *const TB=T+NA+2;
2784 	word *const TP=T+NA+2+NB;
2785 
2786 	// copy B into TB and normalize it so that TB has highest bit set to 1
2787 	unsigned shiftWords = (B[NB-1]==0);
2788 	TB[0] = TB[NB-1] = 0;
2789 	CopyWords(TB+shiftWords, B, NB-shiftWords);
2790 	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2791 	CRYPTOPP_ASSERT(shiftBits < WORD_BITS);
2792 	ShiftWordsLeftByBits(TB, NB, shiftBits);
2793 
2794 	// copy A into TA and normalize it
2795 	TA[0] = TA[NA] = TA[NA+1] = 0;
2796 	CopyWords(TA+shiftWords, A, NA);
2797 	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2798 
2799 	if (TA[NA+1]==0 && TA[NA] <= 1)
2800 	{
2801 		Q[NA-NB+1] = Q[NA-NB] = 0;
2802 		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2803 		{
2804 			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2805 			++Q[NA-NB];
2806 		}
2807 	}
2808 	else
2809 	{
2810 		NA+=2;
2811 		CRYPTOPP_ASSERT(Compare(TA+NA-NB, TB, NB) < 0);
2812 	}
2813 
2814 	word BT[2];
2815 	BT[0] = TB[NB-2] + 1;
2816 	BT[1] = TB[NB-1] + (BT[0]==0);
2817 
2818 	// start reducing TA mod TB, 2 words at a time
2819 	for (size_t i=NA-2; i>=NB; i-=2)
2820 	{
2821 		AtomicDivide(Q+i-NB, TA+i-2, BT);
2822 		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2823 	}
2824 
2825 	// copy TA into R, and denormalize it
2826 	CopyWords(R, TA+shiftWords, NB);
2827 	ShiftWordsRightByBits(R, NB, shiftBits);
2828 }
2829 
EvenWordCount(const word * X,size_t N)2830 static inline size_t EvenWordCount(const word *X, size_t N)
2831 {
2832 	while (N && X[N-2]==0 && X[N-1]==0)
2833 		N-=2;
2834 	return N;
2835 }
2836 
2837 // return k
2838 // R[N] --- result = A^(-1) * 2^k mod M
2839 // T[4*N] - temporary work space
2840 // A[NA] -- number to take inverse of
2841 // M[N] --- modulus
2842 
AlmostInverse(word * R,word * T,const word * A,size_t NA,const word * M,size_t N)2843 unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
2844 {
2845 	CRYPTOPP_ASSERT(NA<=N && N && N%2==0);
2846 
2847 	word *b = T;
2848 	word *c = T+N;
2849 	word *f = T+2*N;
2850 	word *g = T+3*N;
2851 	size_t bcLen=2, fgLen=EvenWordCount(M, N);
2852 	unsigned int k=0;
2853 	bool s=false;
2854 
2855 	SetWords(T, 0, 3*N);
2856 	b[0]=1;
2857 	CopyWords(f, A, NA);
2858 	CopyWords(g, M, N);
2859 
2860 	while (1)
2861 	{
2862 		word t=f[0];
2863 		while (!t)
2864 		{
2865 			if (EvenWordCount(f, fgLen)==0)
2866 			{
2867 				SetWords(R, 0, N);
2868 				return 0;
2869 			}
2870 
2871 			ShiftWordsRightByWords(f, fgLen, 1);
2872 			bcLen += 2 * (c[bcLen-1] != 0);
2873 			CRYPTOPP_ASSERT(bcLen <= N);
2874 			ShiftWordsLeftByWords(c, bcLen, 1);
2875 			k+=WORD_BITS;
2876 			t=f[0];
2877 		}
2878 
2879 		// t must be non-0; otherwise undefined.
2880 		unsigned int i = TrailingZeros(t);
2881 		t >>= i;
2882 		k += i;
2883 
2884 		if (t==1 && f[1]==0 && EvenWordCount(f+2, fgLen-2)==0)
2885 		{
2886 			if (s)
2887 				Subtract(R, M, b, N);
2888 			else
2889 				CopyWords(R, b, N);
2890 			return k;
2891 		}
2892 
2893 		ShiftWordsRightByBits(f, fgLen, i);
2894 		t = ShiftWordsLeftByBits(c, bcLen, i);
2895 		c[bcLen] += t;
2896 		bcLen += 2 * (t!=0);
2897 		CRYPTOPP_ASSERT(bcLen <= N);
2898 
2899 		bool swap = Compare(f, g, fgLen)==-1;
2900 		ConditionalSwapPointers(swap, f, g);
2901 		ConditionalSwapPointers(swap, b, c);
2902 		s ^= swap;
2903 
2904 		fgLen -= 2 * !(f[fgLen-2] | f[fgLen-1]);
2905 
2906 		Subtract(f, f, g, fgLen);
2907 		t = Add(b, b, c, bcLen);
2908 		b[bcLen] += t;
2909 		bcLen += 2*t;
2910 		CRYPTOPP_ASSERT(bcLen <= N);
2911 	}
2912 }
2913 
2914 // R[N] - result = A/(2^k) mod M
2915 // A[N] - input
2916 // M[N] - modulus
2917 
DivideByPower2Mod(word * R,const word * A,size_t k,const word * M,size_t N)2918 void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2919 {
2920 	CopyWords(R, A, N);
2921 
2922 	while (k--)
2923 	{
2924 		if (R[0]%2==0)
2925 			ShiftWordsRightByBits(R, N, 1);
2926 		else
2927 		{
2928 			word carry = Add(R, R, M, N);
2929 			ShiftWordsRightByBits(R, N, 1);
2930 			R[N-1] += carry<<(WORD_BITS-1);
2931 		}
2932 	}
2933 }
2934 
2935 // R[N] - result = A*(2^k) mod M
2936 // A[N] - input
2937 // M[N] - modulus
2938 
MultiplyByPower2Mod(word * R,const word * A,size_t k,const word * M,size_t N)2939 void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2940 {
2941 	CopyWords(R, A, N);
2942 
2943 	while (k--)
2944 		if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2945 			Subtract(R, R, M, N);
2946 }
2947 
2948 // ******************************************************************
2949 
2950 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2951 
RoundupSize(size_t n)2952 static inline size_t RoundupSize(size_t n)
2953 {
2954 	if (n<=8)
2955 		return RoundupSizeTable[n];
2956 	else if (n<=16)
2957 		return 16;
2958 	else if (n<=32)
2959 		return 32;
2960 	else if (n<=64)
2961 		return 64;
2962 	else
2963 		return size_t(1) << BitPrecision(n-1);
2964 }
2965 
Integer()2966 Integer::Integer()
2967 	: reg(2), sign(POSITIVE)
2968 {
2969 	reg[0] = reg[1] = 0;
2970 }
2971 
Integer(const Integer & t)2972 Integer::Integer(const Integer& t)
2973 	: reg(RoundupSize(t.WordCount())), sign(t.sign)
2974 {
2975 	CopyWords(reg, t.reg, reg.size());
2976 }
2977 
Integer(Sign s,lword value)2978 Integer::Integer(Sign s, lword value)
2979 	: reg(2), sign(s)
2980 {
2981 	reg[0] = word(value);
2982 	reg[1] = word(SafeRightShift<WORD_BITS>(value));
2983 }
2984 
Integer(signed long value)2985 Integer::Integer(signed long value)
2986 	: reg(2)
2987 {
2988 	if (value >= 0)
2989 		sign = POSITIVE;
2990 	else
2991 	{
2992 		sign = NEGATIVE;
2993 		value = -value;
2994 	}
2995 	reg[0] = word(value);
2996 	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
2997 }
2998 
Integer(Sign s,word high,word low)2999 Integer::Integer(Sign s, word high, word low)
3000 	: reg(2), sign(s)
3001 {
3002 	reg[0] = low;
3003 	reg[1] = high;
3004 }
3005 
IsConvertableToLong() const3006 bool Integer::IsConvertableToLong() const
3007 {
3008 	if (ByteCount() > sizeof(long))
3009 		return false;
3010 
3011 	unsigned long value = (unsigned long)reg[0];
3012 	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
3013 
3014 	if (sign==POSITIVE)
3015 		return (signed long)value >= 0;
3016 	else
3017 		return -(signed long)value < 0;
3018 }
3019 
ConvertToLong() const3020 signed long Integer::ConvertToLong() const
3021 {
3022 	CRYPTOPP_ASSERT(IsConvertableToLong());
3023 
3024 	unsigned long value = (unsigned long)reg[0];
3025 	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
3026 	return sign==POSITIVE ? value : -(signed long)value;
3027 }
3028 
Integer(BufferedTransformation & encodedInteger,size_t byteCount,Signedness s,ByteOrder o)3029 Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s, ByteOrder o)
3030 {
3031 	CRYPTOPP_ASSERT(o == BIG_ENDIAN_ORDER || o == LITTLE_ENDIAN_ORDER);
3032 
3033 	if (o != LITTLE_ENDIAN_ORDER)
3034 	{
3035 		Decode(encodedInteger, byteCount, s);
3036 	}
3037 	else
3038 	{
3039 		SecByteBlock block(byteCount);
3040 		encodedInteger.Get(block, block.size());
3041 		std::reverse(block.begin(), block.begin()+block.size());
3042 
3043 		Decode(block.begin(), block.size(), s);
3044 	}
3045 }
3046 
Integer(const byte * encodedInteger,size_t byteCount,Signedness s,ByteOrder o)3047 Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s, ByteOrder o)
3048 {
3049 	CRYPTOPP_ASSERT(encodedInteger && byteCount); // NULL buffer
3050 	CRYPTOPP_ASSERT(o == BIG_ENDIAN_ORDER || o == LITTLE_ENDIAN_ORDER);
3051 
3052 	if (o != LITTLE_ENDIAN_ORDER)
3053 	{
3054 		Decode(encodedInteger, byteCount, s);
3055 	}
3056 	else
3057 	{
3058 		SecByteBlock block(byteCount);
3059 #if (_MSC_VER >= 1500)
3060 		std::reverse_copy(encodedInteger, encodedInteger+byteCount,
3061 			stdext::make_checked_array_iterator(block.begin(), block.size()));
3062 #else
3063 		std::reverse_copy(encodedInteger, encodedInteger+byteCount, block.begin());
3064 #endif
3065 		Decode(block.begin(), block.size(), s);
3066 		return;
3067 	}
3068 }
3069 
Integer(BufferedTransformation & bt)3070 Integer::Integer(BufferedTransformation &bt)
3071 {
3072 	// Make explicit call to avoid virtual-dispatch findings in ctor
3073 	Integer::BERDecode(bt);
3074 }
3075 
Integer(RandomNumberGenerator & rng,size_t bitcount)3076 Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
3077 {
3078 	Randomize(rng, bitcount);
3079 }
3080 
Integer(RandomNumberGenerator & rng,const Integer & min,const Integer & max,RandomNumberType rnType,const Integer & equiv,const Integer & mod)3081 Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3082 {
3083 	if (!Randomize(rng, min, max, rnType, equiv, mod))
3084 		throw Integer::RandomNumberNotFound();
3085 }
3086 
Power2(size_t e)3087 Integer Integer::Power2(size_t e)
3088 {
3089 	Integer r((word)0, BitsToWords(e+1));
3090 	r.SetBit(e);
3091 	return r;
3092 }
3093 
operator !() const3094 bool Integer::operator!() const
3095 {
3096 	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
3097 }
3098 
operator =(const Integer & t)3099 Integer& Integer::operator=(const Integer& t)
3100 {
3101 	if (this != &t)
3102 	{
3103 		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
3104 			reg.New(RoundupSize(t.WordCount()));
3105 		CopyWords(reg, t.reg, reg.size());
3106 		sign = t.sign;
3107 	}
3108 	return *this;
3109 }
3110 
GetBit(size_t n) const3111 bool Integer::GetBit(size_t n) const
3112 {
3113 	// Profiling guided the flow below.
3114 	if (n/WORD_BITS < reg.size())
3115 		return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
3116 	else
3117 		return 0;
3118 }
3119 
SetBit(size_t n,bool value)3120 void Integer::SetBit(size_t n, bool value)
3121 {
3122 	if (value)
3123 	{
3124 		reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
3125 		reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
3126 	}
3127 	else
3128 	{
3129 		if (n/WORD_BITS < reg.size())
3130 			reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
3131 	}
3132 }
3133 
GetByte(size_t n) const3134 byte Integer::GetByte(size_t n) const
3135 {
3136 	// Profiling guided the flow below.
3137 	if (n/WORD_SIZE < reg.size())
3138 		return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
3139 	else
3140 		return 0;
3141 }
3142 
SetByte(size_t n,byte value)3143 void Integer::SetByte(size_t n, byte value)
3144 {
3145 	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
3146 	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
3147 	reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
3148 }
3149 
GetBits(size_t i,size_t n) const3150 lword Integer::GetBits(size_t i, size_t n) const
3151 {
3152 	lword v = 0;
3153 	CRYPTOPP_ASSERT(n <= sizeof(v)*8);
3154 	for (unsigned int j=0; j<n; j++)
3155 		v |= lword(GetBit(i+j)) << j;
3156 	return v;
3157 }
3158 
operator -() const3159 Integer Integer::operator-() const
3160 {
3161 	Integer result(*this);
3162 	result.Negate();
3163 	return result;
3164 }
3165 
AbsoluteValue() const3166 Integer Integer::AbsoluteValue() const
3167 {
3168 	Integer result(*this);
3169 	result.sign = POSITIVE;
3170 	return result;
3171 }
3172 
swap(Integer & a)3173 void Integer::swap(Integer &a)
3174 {
3175 	reg.swap(a.reg);
3176 	std::swap(sign, a.sign);
3177 }
3178 
Integer(word value,size_t length)3179 Integer::Integer(word value, size_t length)
3180 	: reg(RoundupSize(length)), sign(POSITIVE)
3181 {
3182 	reg[0] = value;
3183 	SetWords(reg+1, 0, reg.size()-1);
3184 }
3185 
3186 template <class T>
StringToInteger(const T * str,ByteOrder order)3187 static Integer StringToInteger(const T *str, ByteOrder order)
3188 {
3189 	CRYPTOPP_ASSERT( order == BIG_ENDIAN_ORDER || order == LITTLE_ENDIAN_ORDER );
3190 
3191 	int radix, sign = 1;
3192 	// GCC workaround
3193 	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
3194 	unsigned int length;
3195 	for (length = 0; str[length] != 0; length++) {}
3196 
3197 	Integer v;
3198 
3199 	if (length == 0)
3200 		return Integer::Zero();
3201 
3202 	// 'str' is of length 1 or more
3203 	switch (str[length-1])
3204 	{
3205 	case 'h':
3206 	case 'H':
3207 		radix=16;
3208 		break;
3209 	case 'o':
3210 	case 'O':
3211 		radix=8;
3212 		break;
3213 	case 'b':
3214 	case 'B':
3215 		radix=2;
3216 		break;
3217 	default:
3218 		radix=10;
3219 	}
3220 
3221 	// 'str' is of length 1 or more
3222 	if (str[0] == '-')
3223 	{
3224 		sign = -1;
3225 		str += 1, length -= 1;
3226 	}
3227 
3228 	// Recognize common prefixes for hexadecimal, octal and decimal.
3229 	// Microsoft's MASM also recognizes 0t for octal, but not here.
3230 	if (length > 2 && str[0] == '0')
3231 	{
3232 		if (str[1] == 'x' || str[1] == 'X')
3233 		{
3234 			radix = 16;
3235 			str += 2, length -= 2;
3236 		}
3237 		else if (str[1] == 'n' || str[1] == 'N')
3238 		{
3239 			radix = 10;
3240 			str += 2, length -= 2;
3241 		}
3242 		else if (str[1] == 'o' || str[1] == 'O')
3243 		{
3244 			radix = 8;
3245 			str += 2, length -= 2;
3246 		}
3247 	}
3248 
3249 	if (order == BIG_ENDIAN_ORDER)
3250 	{
3251 		for (unsigned int i=0; i<length; i++)
3252 		{
3253 			int digit, ch = static_cast<int>(str[i]);
3254 
3255 			// Profiling guided the flow below.
3256 			if (ch >= '0' && ch <= '9')
3257 				digit = ch - '0';
3258 			else if (ch >= 'a' && ch <= 'f')
3259 				digit = ch - 'a' + 10;
3260 			else if (ch >= 'A' && ch <= 'F')
3261 				digit = ch - 'A' + 10;
3262 			else
3263 				digit = radix;
3264 
3265 			if (digit < radix)
3266 			{
3267 				v *= radix;
3268 				v += digit;
3269 			}
3270 		}
3271 	}
3272 	else if (radix == 16 && order == LITTLE_ENDIAN_ORDER)
3273 	{
3274 		// Nibble high, low and count
3275 		unsigned int nh = 0, nl = 0, nc = 0;
3276 		Integer position(Integer::One());
3277 
3278 		for (unsigned int i=0; i<length; i++)
3279 		{
3280 			int digit, ch = static_cast<int>(str[i]);
3281 
3282 			if (ch >= '0' && ch <= '9')
3283 				digit = ch - '0';
3284 			else if (ch >= 'a' && ch <= 'f')
3285 				digit = ch - 'a' + 10;
3286 			else if (ch >= 'A' && ch <= 'F')
3287 				digit = ch - 'A' + 10;
3288 			else
3289 				digit = radix;
3290 
3291 			if (digit < radix)
3292 			{
3293 				if (nc++ == 0)
3294 					nh = digit;
3295 				else
3296 					nl = digit;
3297 
3298 				if (nc == 2)
3299 				{
3300 					v += position * (nh << 4 | nl);
3301 					nc = 0, position <<= 8;
3302 				}
3303 			}
3304 		}
3305 
3306 		if (nc == 1)
3307 			v += nh * position;
3308 	}
3309 	else // LITTLE_ENDIAN_ORDER && radix != 16
3310 	{
3311 		for (int i=static_cast<int>(length)-1; i>=0; i--)
3312 		{
3313 			int digit, ch = static_cast<int>(str[i]);
3314 
3315 			if (ch >= '0' && ch <= '9')
3316 				digit = ch - '0';
3317 			else if (ch >= 'a' && ch <= 'f')
3318 				digit = ch - 'a' + 10;
3319 			else if (ch >= 'A' && ch <= 'F')
3320 				digit = ch - 'A' + 10;
3321 			else
3322 				digit = radix;
3323 
3324 			if (digit < radix)
3325 			{
3326 				v *= radix;
3327 				v += digit;
3328 			}
3329 		}
3330 	}
3331 
3332 	if (sign == -1)
3333 		v.Negate();
3334 
3335 	return v;
3336 }
3337 
Integer(const char * str,ByteOrder order)3338 Integer::Integer(const char *str, ByteOrder order)
3339 	: reg(2), sign(POSITIVE)
3340 {
3341 	*this = StringToInteger(str,order);
3342 }
3343 
Integer(const wchar_t * str,ByteOrder order)3344 Integer::Integer(const wchar_t *str, ByteOrder order)
3345 	: reg(2), sign(POSITIVE)
3346 {
3347 	*this = StringToInteger(str,order);
3348 }
3349 
WordCount() const3350 unsigned int Integer::WordCount() const
3351 {
3352 	return (unsigned int)CountWords(reg, reg.size());
3353 }
3354 
ByteCount() const3355 unsigned int Integer::ByteCount() const
3356 {
3357 	unsigned wordCount = WordCount();
3358 	if (wordCount)
3359 		return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
3360 	else
3361 		return 0;
3362 }
3363 
BitCount() const3364 unsigned int Integer::BitCount() const
3365 {
3366 	unsigned wordCount = WordCount();
3367 	if (wordCount)
3368 		return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
3369 	else
3370 		return 0;
3371 }
3372 
Decode(const byte * input,size_t inputLen,Signedness s)3373 void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
3374 {
3375 	CRYPTOPP_ASSERT(input && inputLen); // NULL buffer
3376 	StringStore store(input, inputLen);
3377 	Decode(store, inputLen, s);
3378 }
3379 
Decode(BufferedTransformation & bt,size_t inputLen,Signedness s)3380 void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
3381 {
3382 	CRYPTOPP_ASSERT(bt.MaxRetrievable() >= inputLen);
3383 	if (bt.MaxRetrievable() < inputLen)
3384 		throw InvalidArgument("Integer: input length is too small");
3385 
3386 	byte b;
3387 	bt.Peek(b);
3388 	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
3389 
3390 	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
3391 	{
3392 		bt.Skip(1);
3393 		inputLen--;
3394 		bt.Peek(b);
3395 	}
3396 
3397 	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
3398 	for (size_t i=inputLen; i > 0; i--)
3399 	{
3400 		(void)bt.Get(b);
3401 		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
3402 	}
3403 
3404 	if (sign == NEGATIVE)
3405 	{
3406 		for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
3407 			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
3408 		TwosComplement(reg, reg.size());
3409 	}
3410 }
3411 
MinEncodedSize(Signedness signedness) const3412 size_t Integer::MinEncodedSize(Signedness signedness) const
3413 {
3414 	// Profiling guided the flow below.
3415 	unsigned int outputLen = STDMAX(1U, ByteCount());
3416 	const bool pre = (signedness == UNSIGNED);
3417 	if (!pre && NotNegative() && (GetByte(outputLen-1) & 0x80))
3418 		outputLen++;
3419 	if (pre)
3420 		return outputLen;
3421 	if (IsNegative() && *this < -Power2(outputLen*8-1))
3422 		outputLen++;
3423 	return outputLen;
3424 }
3425 
3426 // PKCS12_PBKDF and other classes use undersized buffers
Encode(byte * output,size_t outputLen,Signedness signedness) const3427 void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
3428 {
3429 	CRYPTOPP_ASSERT(output && outputLen);            // NULL buffer
3430 	ArraySink sink(output, outputLen);
3431 	Encode(sink, outputLen, signedness);
3432 }
3433 
Encode(BufferedTransformation & bt,size_t outputLen,Signedness signedness) const3434 void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
3435 {
3436 	if (signedness == UNSIGNED || NotNegative())
3437 	{
3438 		for (size_t i=outputLen; i > 0; i--)
3439 			bt.Put(GetByte(i-1));
3440 	}
3441 	else
3442 	{
3443 		// take two's complement of *this
3444 		Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
3445 		temp.Encode(bt, outputLen, UNSIGNED);
3446 	}
3447 }
3448 
DEREncode(BufferedTransformation & bt) const3449 void Integer::DEREncode(BufferedTransformation &bt) const
3450 {
3451 	DERGeneralEncoder enc(bt, INTEGER);
3452 	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
3453 	enc.MessageEnd();
3454 }
3455 
BERDecode(const byte * input,size_t len)3456 void Integer::BERDecode(const byte *input, size_t len)
3457 {
3458 	CRYPTOPP_ASSERT(input && len); // NULL buffer
3459 	StringStore store(input, len);
3460 	BERDecode(store);
3461 }
3462 
BERDecode(BufferedTransformation & bt)3463 void Integer::BERDecode(BufferedTransformation &bt)
3464 {
3465 	BERGeneralDecoder dec(bt, INTEGER);
3466 	if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
3467 		BERDecodeError();
3468 	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
3469 	dec.MessageEnd();
3470 }
3471 
DEREncodeAsOctetString(BufferedTransformation & bt,size_t length) const3472 void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
3473 {
3474 	DERGeneralEncoder enc(bt, OCTET_STRING);
3475 	Encode(enc, length);
3476 	enc.MessageEnd();
3477 }
3478 
BERDecodeAsOctetString(BufferedTransformation & bt,size_t length)3479 void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
3480 {
3481 	BERGeneralDecoder dec(bt, OCTET_STRING);
3482 	if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
3483 		BERDecodeError();
3484 	Decode(dec, length);
3485 	dec.MessageEnd();
3486 }
3487 
OpenPGPEncode(byte * output,size_t bufferSize) const3488 size_t Integer::OpenPGPEncode(byte *output, size_t bufferSize) const
3489 {
3490 	CRYPTOPP_ASSERT(output && bufferSize);         // NULL buffer
3491 	CRYPTOPP_ASSERT(bufferSize >= 2+ByteCount());  // Undersized buffer
3492 	ArraySink sink(output, bufferSize);
3493 	return OpenPGPEncode(sink);
3494 }
3495 
OpenPGPEncode(BufferedTransformation & bt) const3496 size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
3497 {
3498 	word16 bitCount = word16(BitCount());
3499 	bt.PutWord16(bitCount);
3500 	size_t byteCount = BitsToBytes(bitCount);
3501 	Encode(bt, byteCount);
3502 	return 2 + byteCount;
3503 }
3504 
OpenPGPDecode(const byte * input,size_t len)3505 void Integer::OpenPGPDecode(const byte *input, size_t len)
3506 {
3507 	CRYPTOPP_ASSERT(input && len);  // NULL buffer
3508 	StringStore store(input, len);
3509 	OpenPGPDecode(store);
3510 }
3511 
OpenPGPDecode(BufferedTransformation & bt)3512 void Integer::OpenPGPDecode(BufferedTransformation &bt)
3513 {
3514 	word16 bitCount;
3515 	if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
3516 		throw OpenPGPDecodeErr();
3517 	Decode(bt, BitsToBytes(bitCount));
3518 }
3519 
Randomize(RandomNumberGenerator & rng,size_t nbits)3520 void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
3521 {
3522 	const size_t nbytes = nbits/8 + 1;
3523 	SecByteBlock buf(nbytes);
3524 	rng.GenerateBlock(buf, nbytes);
3525 	if (nbytes)
3526 		buf[0] = (byte)Crop(buf[0], nbits % 8);
3527 	Decode(buf, nbytes, UNSIGNED);
3528 }
3529 
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max)3530 void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
3531 {
3532 	if (min > max)
3533 		throw InvalidArgument("Integer: Min must be no greater than Max");
3534 
3535 	Integer range = max - min;
3536 	const unsigned int nbits = range.BitCount();
3537 
3538 	do
3539 	{
3540 		Randomize(rng, nbits);
3541 	}
3542 	while (*this > range);
3543 
3544 	*this += min;
3545 }
3546 
Randomize(RandomNumberGenerator & rng,const Integer & min,const Integer & max,RandomNumberType rnType,const Integer & equiv,const Integer & mod)3547 bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3548 {
3549 	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)
3550 		("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3551 }
3552 
3553 class KDF2_RNG : public RandomNumberGenerator
3554 {
3555 public:
KDF2_RNG(const byte * seed,size_t seedSize)3556 	KDF2_RNG(const byte *seed, size_t seedSize)
3557 		: m_counter(0), m_counterAndSeed(ClampSize(seedSize) + 4)
3558 	{
3559 		memcpy(m_counterAndSeed + 4, seed, ClampSize(seedSize));
3560 	}
3561 
GenerateBlock(byte * output,size_t size)3562 	void GenerateBlock(byte *output, size_t size)
3563 	{
3564 		CRYPTOPP_ASSERT(output && size); // NULL buffer
3565 		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
3566 		++m_counter;
3567 		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULLPTR, 0);
3568 	}
3569 
3570 	// UBsan finding, -Wstringop-overflow
ClampSize(size_t req) const3571 	inline size_t ClampSize(size_t req) const
3572 	{
3573 		// Clamp at 16 MB
3574 		if (req > 16U*1024*1024)
3575 			return 16U*1024*1024;
3576 		return req;
3577 	}
3578 
3579 private:
3580 	word32 m_counter;
3581 	SecByteBlock m_counterAndSeed;
3582 };
3583 
GenerateRandomNoThrow(RandomNumberGenerator & i_rng,const NameValuePairs & params)3584 bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3585 {
3586 	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3587 	Integer max;
3588 	if (!params.GetValue("Max", max))
3589 	{
3590 		int bitLength;
3591 		if (params.GetIntValue("BitLength", bitLength))
3592 			max = Integer::Power2(bitLength);
3593 		else
3594 			throw InvalidArgument("Integer: missing Max argument");
3595 	}
3596 	if (min > max)
3597 		throw InvalidArgument("Integer: Min must be no greater than Max");
3598 
3599 	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3600 	Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3601 
3602 	if (equiv.IsNegative() || equiv >= mod)
3603 		throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3604 
3605 	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3606 
3607 	member_ptr<KDF2_RNG> kdf2Rng;
3608 	ConstByteArrayParameter seed;
3609 	if (params.GetValue(Name::Seed(), seed))
3610 	{
3611 		ByteQueue bq;
3612 		DERSequenceEncoder seq(bq);
3613 		min.DEREncode(seq);
3614 		max.DEREncode(seq);
3615 		equiv.DEREncode(seq);
3616 		mod.DEREncode(seq);
3617 		DEREncodeUnsigned(seq, rnType);
3618 		DEREncodeOctetString(seq, seed.begin(), seed.size());
3619 		seq.MessageEnd();
3620 
3621 		SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
3622 		bq.Get(finalSeed, finalSeed.size());
3623 		kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3624 	}
3625 	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3626 
3627 	switch (rnType)
3628 	{
3629 		case ANY:
3630 			if (mod == One())
3631 				Randomize(rng, min, max);
3632 			else
3633 			{
3634 				Integer min1 = min + (equiv-min)%mod;
3635 				if (max < min1)
3636 					return false;
3637 				Randomize(rng, Zero(), (max - min1) / mod);
3638 				*this *= mod;
3639 				*this += min1;
3640 			}
3641 			return true;
3642 
3643 		case PRIME:
3644 		{
3645 			const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULLPTR);
3646 
3647 			int i;
3648 			i = 0;
3649 			while (1)
3650 			{
3651 				if (++i==16)
3652 				{
3653 					// check if there are any suitable primes in [min, max]
3654 					Integer first = min;
3655 					if (FirstPrime(first, max, equiv, mod, pSelector))
3656 					{
3657 						// if there is only one suitable prime, we're done
3658 						*this = first;
3659 						if (!FirstPrime(first, max, equiv, mod, pSelector))
3660 							return true;
3661 					}
3662 					else
3663 						return false;
3664 				}
3665 
3666 				Randomize(rng, min, max);
3667 				if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3668 					return true;
3669 			}
3670 		}
3671 
3672 		default:
3673 			throw InvalidArgument("Integer: invalid RandomNumberType argument");
3674 	}
3675 }
3676 
operator >>(std::istream & in,Integer & a)3677 std::istream& operator>>(std::istream& in, Integer &a)
3678 {
3679 	char c;
3680 	unsigned int length = 0;
3681 	SecBlock<char> str(length + 16);
3682 
3683 	std::ws(in);
3684 
3685 	do
3686 	{
3687 		in.read(&c, 1);
3688 		str[length++] = c;
3689 		if (length >= str.size())
3690 			str.Grow(length + 16);
3691 	}
3692 	while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3693 
3694 	if (in.gcount())
3695 		in.putback(c);
3696 	str[length-1] = '\0';
3697 	a = Integer(str);
3698 
3699 	return in;
3700 }
3701 
3702 // Ensure base 10 is default
FlagToBase(long f)3703 inline int FlagToBase(long f) {
3704 	return f == std::ios::hex ? 16 : (f == std::ios::oct ? 8 : 10);
3705 }
3706 
FlagToSuffix(long f)3707 inline char FlagToSuffix(long f) {
3708 	return f == std::ios::hex ? 'h' : (f == std::ios::oct ? 'o' : '.');
3709 }
3710 
3711 // Ensure base 10 is default
operator <<(std::ostream & out,const Integer & a)3712 std::ostream& operator<<(std::ostream& out, const Integer &a)
3713 {
3714 	// Get relevant conversion specifications from ostream.
3715 	const long f = out.flags() & std::ios::basefield;
3716 	const int base = FlagToBase(f);
3717 	const char suffix = FlagToSuffix(f);
3718 
3719 	Integer temp1=a, temp2;
3720 	if (a.IsNegative())
3721 	{
3722 		out << '-';
3723 		temp1.Negate();
3724 	}
3725 
3726 	if (!a)
3727 		out << '0';
3728 
3729 	static const char upper[]="0123456789ABCDEF";
3730 	static const char lower[]="0123456789abcdef";
3731 
3732 	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
3733 	unsigned int i=0;
3734 	SecBlock<char> s(a.BitCount() / (SaturatingSubtract1(BitPrecision(base),1U)) + 1);
3735 
3736 	while (!!temp1)
3737 	{
3738 		word digit;
3739 		Integer::Divide(digit, temp2, temp1, base);
3740 		s[i++]=vec[digit];
3741 		temp1.swap(temp2);
3742 	}
3743 
3744 	while (i--)
3745 	{
3746 		out << s[i];
3747 	}
3748 
3749 #ifdef CRYPTOPP_USE_STD_SHOWBASE
3750 	if (out.flags() & std::ios_base::showbase)
3751 		out << suffix;
3752 
3753 	return out;
3754 #else
3755  	return out << suffix;
3756 #endif
3757 }
3758 
operator ++()3759 Integer& Integer::operator++()
3760 {
3761 	if (NotNegative())
3762 	{
3763 		if (Increment(reg, reg.size()))
3764 		{
3765 			reg.CleanGrow(2*reg.size());
3766 			reg[reg.size()/2]=1;
3767 		}
3768 	}
3769 	else
3770 	{
3771 		word borrow = Decrement(reg, reg.size());
3772 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3773 
3774 		if (WordCount()==0)
3775 			*this = Zero();
3776 	}
3777 	return *this;
3778 }
3779 
operator --()3780 Integer& Integer::operator--()
3781 {
3782 	if (IsNegative())
3783 	{
3784 		if (Increment(reg, reg.size()))
3785 		{
3786 			reg.CleanGrow(2*reg.size());
3787 			reg[reg.size()/2]=1;
3788 		}
3789 	}
3790 	else
3791 	{
3792 		if (Decrement(reg, reg.size()))
3793 			*this = -One();
3794 	}
3795 	return *this;
3796 }
3797 
3798 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3799 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
And(const Integer & t) const3800 Integer Integer::And(const Integer& t) const
3801 {
3802 	if (this == &t)
3803 	{
3804 		return AbsoluteValue();
3805 	}
3806 	else if (reg.size() >= t.reg.size())
3807 	{
3808 		Integer result(t);
3809 		AndWords(result.reg, reg, t.reg.size());
3810 
3811 		result.sign = POSITIVE;
3812 		return result;
3813 	}
3814 	else // reg.size() < t.reg.size()
3815 	{
3816 		Integer result(*this);
3817 		AndWords(result.reg, t.reg, reg.size());
3818 
3819 		result.sign = POSITIVE;
3820 		return result;
3821 	}
3822 }
3823 
3824 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3825 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
Or(const Integer & t) const3826 Integer Integer::Or(const Integer& t) const
3827 {
3828 	if (this == &t)
3829 	{
3830 		return AbsoluteValue();
3831 	}
3832 	else if (reg.size() >= t.reg.size())
3833 	{
3834 		Integer result(*this);
3835 		OrWords(result.reg, t.reg, t.reg.size());
3836 
3837 		result.sign = POSITIVE;
3838 		return result;
3839 	}
3840 	else // reg.size() < t.reg.size()
3841 	{
3842 		Integer result(t);
3843 		OrWords(result.reg, reg, reg.size());
3844 
3845 		result.sign = POSITIVE;
3846 		return result;
3847 	}
3848 }
3849 
3850 // This is a bit operation. We set sign to POSITIVE, so there's no need to
3851 //  worry about negative zero. Also see http://stackoverflow.com/q/11644362.
Xor(const Integer & t) const3852 Integer Integer::Xor(const Integer& t) const
3853 {
3854 	if (this == &t)
3855 	{
3856 		return Integer::Zero();
3857 	}
3858 	else if (reg.size() >= t.reg.size())
3859 	{
3860 		Integer result(*this);
3861 		XorWords(result.reg, t.reg, t.reg.size());
3862 
3863 		result.sign = POSITIVE;
3864 		return result;
3865 	}
3866 	else // reg.size() < t.reg.size()
3867 	{
3868 		Integer result(t);
3869 		XorWords(result.reg, reg, reg.size());
3870 
3871 		result.sign = POSITIVE;
3872 		return result;
3873 	}
3874 }
3875 
PositiveAdd(Integer & sum,const Integer & a,const Integer & b)3876 void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3877 {
3878 	// Profiling guided the flow below.
3879 	int carry; const bool pre = (a.reg.size() == b.reg.size());
3880 	if (!pre && a.reg.size() > b.reg.size())
3881 	{
3882 		carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3883 		CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3884 		carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3885 	}
3886 	else if (pre)
3887 	{
3888 		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3889 	}
3890 	else
3891 	{
3892 		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3893 		CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3894 		carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3895 	}
3896 
3897 	if (carry)
3898 	{
3899 		sum.reg.CleanGrow(2*sum.reg.size());
3900 		sum.reg[sum.reg.size()/2] = 1;
3901 	}
3902 	sum.sign = Integer::POSITIVE;
3903 }
3904 
PositiveSubtract(Integer & diff,const Integer & a,const Integer & b)3905 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3906 {
3907 	unsigned aSize = a.WordCount();
3908 	aSize += aSize%2;
3909 	unsigned bSize = b.WordCount();
3910 	bSize += bSize%2;
3911 
3912 	// Profiling guided the flow below.
3913 	if (aSize > bSize)
3914 	{
3915 		word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3916 		CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3917 		borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3918 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3919 		diff.sign = Integer::POSITIVE;
3920 	}
3921 	else if (aSize == bSize)
3922 	{
3923 		if (Compare(a.reg, b.reg, aSize) >= 0)
3924 		{
3925 			Subtract(diff.reg, a.reg, b.reg, aSize);
3926 			diff.sign = Integer::POSITIVE;
3927 		}
3928 		else
3929 		{
3930 			Subtract(diff.reg, b.reg, a.reg, aSize);
3931 			diff.sign = Integer::NEGATIVE;
3932 		}
3933 	}
3934 	else
3935 	{
3936 		word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3937 		CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3938 		borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3939 		CRYPTOPP_ASSERT(!borrow); CRYPTOPP_UNUSED(borrow);
3940 		diff.sign = Integer::NEGATIVE;
3941 	}
3942 }
3943 
3944 // MSVC .NET 2003 workaround
STDMAX2(const T & a,const T & b)3945 template <class T> inline const T& STDMAX2(const T& a, const T& b)
3946 {
3947 	return a < b ? b : a;
3948 }
3949 
Plus(const Integer & b) const3950 Integer Integer::Plus(const Integer& b) const
3951 {
3952 	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
3953 	if (NotNegative())
3954 	{
3955 		if (b.NotNegative())
3956 			PositiveAdd(sum, *this, b);
3957 		else
3958 			PositiveSubtract(sum, *this, b);
3959 	}
3960 	else
3961 	{
3962 		if (b.NotNegative())
3963 			PositiveSubtract(sum, b, *this);
3964 		else
3965 		{
3966 			PositiveAdd(sum, *this, b);
3967 			sum.sign = Integer::NEGATIVE;
3968 		}
3969 	}
3970 	return sum;
3971 }
3972 
operator +=(const Integer & t)3973 Integer& Integer::operator+=(const Integer& t)
3974 {
3975 	reg.CleanGrow(t.reg.size());
3976 	if (NotNegative())
3977 	{
3978 		if (t.NotNegative())
3979 			PositiveAdd(*this, *this, t);
3980 		else
3981 			PositiveSubtract(*this, *this, t);
3982 	}
3983 	else
3984 	{
3985 		if (t.NotNegative())
3986 			PositiveSubtract(*this, t, *this);
3987 		else
3988 		{
3989 			PositiveAdd(*this, *this, t);
3990 			sign = Integer::NEGATIVE;
3991 		}
3992 	}
3993 	return *this;
3994 }
3995 
Minus(const Integer & b) const3996 Integer Integer::Minus(const Integer& b) const
3997 {
3998 	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
3999 	if (NotNegative())
4000 	{
4001 		if (b.NotNegative())
4002 			PositiveSubtract(diff, *this, b);
4003 		else
4004 			PositiveAdd(diff, *this, b);
4005 	}
4006 	else
4007 	{
4008 		if (b.NotNegative())
4009 		{
4010 			PositiveAdd(diff, *this, b);
4011 			diff.sign = Integer::NEGATIVE;
4012 		}
4013 		else
4014 			PositiveSubtract(diff, b, *this);
4015 	}
4016 	return diff;
4017 }
4018 
operator -=(const Integer & t)4019 Integer& Integer::operator-=(const Integer& t)
4020 {
4021 	reg.CleanGrow(t.reg.size());
4022 	if (NotNegative())
4023 	{
4024 		if (t.NotNegative())
4025 			PositiveSubtract(*this, *this, t);
4026 		else
4027 			PositiveAdd(*this, *this, t);
4028 	}
4029 	else
4030 	{
4031 		if (t.NotNegative())
4032 		{
4033 			PositiveAdd(*this, *this, t);
4034 			sign = Integer::NEGATIVE;
4035 		}
4036 		else
4037 			PositiveSubtract(*this, t, *this);
4038 	}
4039 	return *this;
4040 }
4041 
operator <<=(size_t n)4042 Integer& Integer::operator<<=(size_t n)
4043 {
4044 	const size_t wordCount = WordCount();
4045 	const size_t shiftWords = n / WORD_BITS;
4046 	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
4047 
4048 	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
4049 	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
4050 	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
4051 	return *this;
4052 }
4053 
operator >>=(size_t n)4054 Integer& Integer::operator>>=(size_t n)
4055 {
4056 	const size_t wordCount = WordCount();
4057 	const size_t shiftWords = n / WORD_BITS;
4058 	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
4059 
4060 	ShiftWordsRightByWords(reg, wordCount, shiftWords);
4061 	if (wordCount > shiftWords)
4062 		ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
4063 	if (IsNegative() && WordCount()==0)   // avoid -0
4064 		*this = Zero();
4065 	return *this;
4066 }
4067 
operator &=(const Integer & t)4068 Integer& Integer::operator&=(const Integer& t)
4069 {
4070 	if (this != &t)
4071 	{
4072 		const size_t size = STDMIN(reg.size(), t.reg.size());
4073 		reg.resize(size);
4074 		AndWords(reg, t.reg, size);
4075 	}
4076 	sign = POSITIVE;
4077 	return *this;
4078 }
4079 
operator |=(const Integer & t)4080 Integer& Integer::operator|=(const Integer& t)
4081 {
4082 	if (this != &t)
4083 	{
4084 		if (reg.size() >= t.reg.size())
4085 		{
4086 			OrWords(reg, t.reg, t.reg.size());
4087 		}
4088 		else  // reg.size() < t.reg.size()
4089 		{
4090 			const size_t head = reg.size();
4091 			const size_t tail = t.reg.size() - reg.size();
4092 			reg.resize(head+tail);
4093 			OrWords(reg, t.reg, head);
4094 			CopyWords(reg+head,t.reg+head,tail);
4095 		}
4096 	}
4097 	sign = POSITIVE;
4098 	return *this;
4099 }
4100 
operator ^=(const Integer & t)4101 Integer& Integer::operator^=(const Integer& t)
4102 {
4103 	if (this == &t)
4104 	{
4105 		*this = Zero();
4106 	}
4107 	else
4108 	{
4109 		if (reg.size() >= t.reg.size())
4110 		{
4111 			XorWords(reg, t.reg, t.reg.size());
4112 		}
4113 		else  // reg.size() < t.reg.size()
4114 		{
4115 			const size_t head = reg.size();
4116 			const size_t tail = t.reg.size() - reg.size();
4117 			reg.resize(head+tail);
4118 			XorWords(reg, t.reg, head);
4119 			CopyWords(reg+head,t.reg+head,tail);
4120 		}
4121 	}
4122 	sign = POSITIVE;
4123 	return *this;
4124 }
4125 
PositiveMultiply(Integer & product,const Integer & a,const Integer & b)4126 void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
4127 {
4128 	size_t aSize = RoundupSize(a.WordCount());
4129 	size_t bSize = RoundupSize(b.WordCount());
4130 
4131 	product.reg.CleanNew(RoundupSize(aSize+bSize));
4132 	product.sign = Integer::POSITIVE;
4133 
4134 	IntegerSecBlock workspace(aSize + bSize);
4135 	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
4136 }
4137 
Multiply(Integer & product,const Integer & a,const Integer & b)4138 void Multiply(Integer &product, const Integer &a, const Integer &b)
4139 {
4140 	PositiveMultiply(product, a, b);
4141 
4142 	if (a.NotNegative() != b.NotNegative())
4143 		product.Negate();
4144 }
4145 
Times(const Integer & b) const4146 Integer Integer::Times(const Integer &b) const
4147 {
4148 	Integer product;
4149 	Multiply(product, *this, b);
4150 	return product;
4151 }
4152 
4153 /*
4154 void PositiveDivide(Integer &remainder, Integer &quotient,
4155 				   const Integer &dividend, const Integer &divisor)
4156 {
4157 	remainder.reg.CleanNew(divisor.reg.size());
4158 	remainder.sign = Integer::POSITIVE;
4159 	quotient.reg.New(0);
4160 	quotient.sign = Integer::POSITIVE;
4161 	unsigned i=dividend.BitCount();
4162 	while (i--)
4163 	{
4164 		word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
4165 		remainder.reg[0] |= dividend[i];
4166 		if (overflow || remainder >= divisor)
4167 		{
4168 			Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
4169 			quotient.SetBit(i);
4170 		}
4171 	}
4172 }
4173 */
4174 
PositiveDivide(Integer & remainder,Integer & quotient,const Integer & a,const Integer & b)4175 void PositiveDivide(Integer &remainder, Integer &quotient,
4176 				   const Integer &a, const Integer &b)
4177 {
4178 	unsigned aSize = a.WordCount();
4179 	unsigned bSize = b.WordCount();
4180 
4181 	if (!bSize)
4182 		throw Integer::DivideByZero();
4183 
4184 	if (aSize < bSize)
4185 	{
4186 		remainder = a;
4187 		remainder.sign = Integer::POSITIVE;
4188 		quotient = Integer::Zero();
4189 		return;
4190 	}
4191 
4192 	aSize += aSize%2;	// round up to next even number
4193 	bSize += bSize%2;
4194 
4195 	remainder.reg.CleanNew(RoundupSize(bSize));
4196 	remainder.sign = Integer::POSITIVE;
4197 	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
4198 	quotient.sign = Integer::POSITIVE;
4199 
4200 	IntegerSecBlock T(aSize+3*(bSize+2));
4201 	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
4202 }
4203 
Divide(Integer & remainder,Integer & quotient,const Integer & dividend,const Integer & divisor)4204 void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
4205 {
4206 	PositiveDivide(remainder, quotient, dividend, divisor);
4207 
4208 	if (dividend.IsNegative())
4209 	{
4210 		quotient.Negate();
4211 		if (remainder.NotZero())
4212 		{
4213 			--quotient;
4214 			remainder = divisor.AbsoluteValue() - remainder;
4215 		}
4216 	}
4217 
4218 	if (divisor.IsNegative())
4219 		quotient.Negate();
4220 }
4221 
DivideByPowerOf2(Integer & r,Integer & q,const Integer & a,unsigned int n)4222 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
4223 {
4224 	q = a;
4225 	q >>= n;
4226 
4227 	const size_t wordCount = BitsToWords(n);
4228 	if (wordCount <= a.WordCount())
4229 	{
4230 		r.reg.resize(RoundupSize(wordCount));
4231 		CopyWords(r.reg, a.reg, wordCount);
4232 		SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
4233 		if (n % WORD_BITS != 0)
4234 			r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
4235 	}
4236 	else
4237 	{
4238 		r.reg.resize(RoundupSize(a.WordCount()));
4239 		CopyWords(r.reg, a.reg, r.reg.size());
4240 	}
4241 	r.sign = POSITIVE;
4242 
4243 	if (a.IsNegative() && r.NotZero())
4244 	{
4245 		--q;
4246 		r = Power2(n) - r;
4247 	}
4248 }
4249 
DividedBy(const Integer & b) const4250 Integer Integer::DividedBy(const Integer &b) const
4251 {
4252 	Integer remainder, quotient;
4253 	Integer::Divide(remainder, quotient, *this, b);
4254 	return quotient;
4255 }
4256 
Modulo(const Integer & b) const4257 Integer Integer::Modulo(const Integer &b) const
4258 {
4259 	Integer remainder, quotient;
4260 	Integer::Divide(remainder, quotient, *this, b);
4261 	return remainder;
4262 }
4263 
Divide(word & remainder,Integer & quotient,const Integer & dividend,word divisor)4264 void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
4265 {
4266 	if (!divisor)
4267 		throw Integer::DivideByZero();
4268 
4269 	// IsPowerOf2 uses BMI on x86 if available. There is a small
4270 	// but measurable improvement during decryption and signing.
4271 	if (IsPowerOf2(divisor))
4272 	{
4273 		quotient = dividend >> (BitPrecision(divisor)-1);
4274 		remainder = dividend.reg[0] & (divisor-1);
4275 		return;
4276 	}
4277 
4278 	unsigned int i = dividend.WordCount();
4279 	quotient.reg.CleanNew(RoundupSize(i));
4280 	remainder = 0;
4281 	while (i--)
4282 	{
4283 		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
4284 		remainder = DWord(dividend.reg[i], remainder) % divisor;
4285 	}
4286 
4287 	if (dividend.NotNegative())
4288 		quotient.sign = POSITIVE;
4289 	else
4290 	{
4291 		quotient.sign = NEGATIVE;
4292 		if (remainder)
4293 		{
4294 			--quotient;
4295 			remainder = divisor - remainder;
4296 		}
4297 	}
4298 }
4299 
DividedBy(word b) const4300 Integer Integer::DividedBy(word b) const
4301 {
4302 	word remainder;
4303 	Integer quotient;
4304 	Integer::Divide(remainder, quotient, *this, b);
4305 	return quotient;
4306 }
4307 
Modulo(word divisor) const4308 word Integer::Modulo(word divisor) const
4309 {
4310 	if (!divisor)
4311 		throw Integer::DivideByZero();
4312 
4313 	word remainder;
4314 
4315 	// Profiling guided the flow below.
4316 	if ((divisor & (divisor-1)) != 0)	// divisor is not a power of 2
4317 	{
4318 		// Profiling guided the flow below.
4319 		unsigned int i = WordCount();
4320 		if (divisor > 5)
4321 		{
4322 			remainder = 0;
4323 			while (i--)
4324 				remainder = DWord(reg[i], remainder) % divisor;
4325 		}
4326 		else
4327 		{
4328 			DWord sum(0, 0);
4329 			while (i--)
4330 				sum += reg[i];
4331 			remainder = sum % divisor;
4332 		}
4333 	}
4334 	else  // divisor is a power of 2
4335 	{
4336 		remainder = reg[0] & (divisor-1);
4337 	}
4338 
4339 	if (IsNegative() && remainder)
4340 		remainder = divisor - remainder;
4341 
4342 	return remainder;
4343 }
4344 
Negate()4345 void Integer::Negate()
4346 {
4347 	if (!!(*this))	// don't flip sign if *this==0
4348 		sign = Sign(1-sign);
4349 }
4350 
PositiveCompare(const Integer & t) const4351 int Integer::PositiveCompare(const Integer& t) const
4352 {
4353 	// Profiling guided the flow below.
4354 	const unsigned size = WordCount(), tSize = t.WordCount();
4355 	if (size != tSize)
4356 		return size > tSize ? 1 : -1;
4357 	else
4358 		return CryptoPP::Compare(reg, t.reg, size);
4359 }
4360 
Compare(const Integer & t) const4361 int Integer::Compare(const Integer& t) const
4362 {
4363 	if (NotNegative())
4364 	{
4365 		if (t.NotNegative())
4366 			return PositiveCompare(t);
4367 		else
4368 			return 1;
4369 	}
4370 	else
4371 	{
4372 		if (t.NotNegative())
4373 			return -1;
4374 		else
4375 			return -PositiveCompare(t);
4376 	}
4377 }
4378 
SquareRoot() const4379 Integer Integer::SquareRoot() const
4380 {
4381 	if (!IsPositive())
4382 		return Zero();
4383 
4384 	// overestimate square root
4385 	Integer x, y = Power2((BitCount()+1)/2);
4386 	CRYPTOPP_ASSERT(y*y >= *this);
4387 
4388 	do
4389 	{
4390 		x = y;
4391 		y = (x + *this/x) >> 1;
4392 	} while (y<x);
4393 
4394 	return x;
4395 }
4396 
IsSquare() const4397 bool Integer::IsSquare() const
4398 {
4399 	Integer r = SquareRoot();
4400 	return *this == r.Squared();
4401 }
4402 
IsUnit() const4403 bool Integer::IsUnit() const
4404 {
4405 	return (WordCount() == 1) && (reg[0] == 1);
4406 }
4407 
MultiplicativeInverse() const4408 Integer Integer::MultiplicativeInverse() const
4409 {
4410 	return IsUnit() ? *this : Zero();
4411 }
4412 
a_times_b_mod_c(const Integer & x,const Integer & y,const Integer & m)4413 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
4414 {
4415 	CRYPTOPP_ASSERT(m.NotZero());
4416 	if (m.IsZero())
4417 		throw Integer::DivideByZero();
4418 
4419 	return x*y%m;
4420 }
4421 
a_exp_b_mod_c(const Integer & x,const Integer & e,const Integer & m)4422 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
4423 {
4424 	CRYPTOPP_ASSERT(m.NotZero());
4425 	if (m.IsZero())
4426 		throw Integer::DivideByZero();
4427 
4428 	ModularArithmetic mr(m);
4429 	return mr.Exponentiate(x, e);
4430 }
4431 
Gcd(const Integer & a,const Integer & b)4432 Integer Integer::Gcd(const Integer &a, const Integer &b)
4433 {
4434 	return EuclideanDomainOf<Integer>().Gcd(a, b);
4435 }
4436 
InverseMod(const Integer & m) const4437 Integer Integer::InverseMod(const Integer &m) const
4438 {
4439 	CRYPTOPP_ASSERT(m.NotNegative());
4440 	CRYPTOPP_ASSERT(m.NotZero());
4441 
4442 	if (IsNegative())
4443 		return Modulo(m).InverseModNext(m);
4444 
4445 	// http://github.com/weidai11/cryptopp/issues/602
4446 	if (*this >= m)
4447 		return Modulo(m).InverseModNext(m);
4448 
4449 	return InverseModNext(m);
4450 }
4451 
InverseModNext(const Integer & m) const4452 Integer Integer::InverseModNext(const Integer &m) const
4453 {
4454 	CRYPTOPP_ASSERT(m.NotNegative());
4455 	CRYPTOPP_ASSERT(m.NotZero());
4456 
4457 	if (m.IsEven())
4458 	{
4459 		if (!m || IsEven())
4460 			return Zero();	// no inverse
4461 		if (*this == One())
4462 			return One();
4463 
4464 		Integer u = m.Modulo(*this).InverseModNext(*this);
4465 		return !u ? Zero() : (m*(*this-u)+1)/(*this);
4466 	}
4467 
4468 	// AlmostInverse requires a 4x workspace
4469 	IntegerSecBlock T(m.reg.size() * 4);
4470 	Integer r((word)0, m.reg.size());
4471 	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
4472 	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
4473 	return r;
4474 }
4475 
InverseMod(word mod) const4476 word Integer::InverseMod(word mod) const
4477 {
4478 	CRYPTOPP_ASSERT(mod != 0);
4479 
4480 	word g0 = mod, g1 = *this % mod;
4481 	word v0 = 0, v1 = 1;
4482 	word y;
4483 
4484 	while (g1)
4485 	{
4486 		if (g1 == 1)
4487 			return v1;
4488 		y = g0 / g1;
4489 		g0 = g0 % g1;
4490 		v0 += y * v1;
4491 
4492 		if (!g0)
4493 			break;
4494 		if (g0 == 1)
4495 			return mod-v0;
4496 		y = g1 / g0;
4497 		g1 = g1 % g0;
4498 		v1 += y * v0;
4499 	}
4500 	return 0;
4501 }
4502 
4503 // ********************************************************
4504 
ModularArithmetic(BufferedTransformation & bt)4505 ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
4506 {
4507 	BERSequenceDecoder seq(bt);
4508 	OID oid(seq);
4509 	if (oid != ASN1::prime_field())
4510 		BERDecodeError();
4511 	m_modulus.BERDecode(seq);
4512 	seq.MessageEnd();
4513 	m_result.reg.resize(m_modulus.reg.size());
4514 }
4515 
DEREncode(BufferedTransformation & bt) const4516 void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
4517 {
4518 	DERSequenceEncoder seq(bt);
4519 	ASN1::prime_field().DEREncode(seq);
4520 	m_modulus.DEREncode(seq);
4521 	seq.MessageEnd();
4522 }
4523 
DEREncodeElement(BufferedTransformation & out,const Element & a) const4524 void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
4525 {
4526 	a.DEREncodeAsOctetString(out, MaxElementByteLength());
4527 }
4528 
BERDecodeElement(BufferedTransformation & in,Element & a) const4529 void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
4530 {
4531 	a.BERDecodeAsOctetString(in, MaxElementByteLength());
4532 }
4533 
Half(const Integer & a) const4534 const Integer& ModularArithmetic::Half(const Integer &a) const
4535 {
4536 	if (a.reg.size()==m_modulus.reg.size())
4537 	{
4538 		CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
4539 		return m_result;
4540 	}
4541 	else
4542 		return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
4543 }
4544 
Add(const Integer & a,const Integer & b) const4545 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
4546 {
4547 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4548 	{
4549 		if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
4550 			|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
4551 		{
4552 			CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4553 		}
4554 		return m_result;
4555 	}
4556 	else
4557 	{
4558 		m_result1 = a+b;
4559 		if (m_result1 >= m_modulus)
4560 			m_result1 -= m_modulus;
4561 		return m_result1;
4562 	}
4563 }
4564 
Accumulate(Integer & a,const Integer & b) const4565 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
4566 {
4567 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4568 	{
4569 		if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
4570 			|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
4571 		{
4572 			CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
4573 		}
4574 	}
4575 	else
4576 	{
4577 		a+=b;
4578 		if (a>=m_modulus)
4579 			a-=m_modulus;
4580 	}
4581 
4582 	return a;
4583 }
4584 
Subtract(const Integer & a,const Integer & b) const4585 const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
4586 {
4587 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4588 	{
4589 		if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
4590 			CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4591 		return m_result;
4592 	}
4593 	else
4594 	{
4595 		m_result1 = a-b;
4596 		if (m_result1.IsNegative())
4597 			m_result1 += m_modulus;
4598 		return m_result1;
4599 	}
4600 }
4601 
Reduce(Integer & a,const Integer & b) const4602 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
4603 {
4604 	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4605 	{
4606 		if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
4607 			CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
4608 	}
4609 	else
4610 	{
4611 		a-=b;
4612 		if (a.IsNegative())
4613 			a+=m_modulus;
4614 	}
4615 
4616 	return a;
4617 }
4618 
Inverse(const Integer & a) const4619 const Integer& ModularArithmetic::Inverse(const Integer &a) const
4620 {
4621 	if (!a)
4622 		return a;
4623 
4624 	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
4625 	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
4626 		Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
4627 
4628 	return m_result;
4629 }
4630 
CascadeExponentiate(const Integer & x,const Integer & e1,const Integer & y,const Integer & e2) const4631 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
4632 {
4633 	if (m_modulus.IsOdd())
4634 	{
4635 		MontgomeryRepresentation dr(m_modulus);
4636 		return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
4637 	}
4638 	else
4639 		return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
4640 }
4641 
SimultaneousExponentiate(Integer * results,const Integer & base,const Integer * exponents,unsigned int exponentsCount) const4642 void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
4643 {
4644 	if (m_modulus.IsOdd())
4645 	{
4646 		MontgomeryRepresentation dr(m_modulus);
4647 		dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
4648 		for (unsigned int i=0; i<exponentsCount; i++)
4649 			results[i] = dr.ConvertOut(results[i]);
4650 	}
4651 	else
4652 		AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
4653 }
4654 
MontgomeryRepresentation(const Integer & m)4655 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)	// modulus must be odd
4656 	: ModularArithmetic(m),
4657 	  m_u((word)0, m_modulus.reg.size()),
4658 	  m_workspace(5*m_modulus.reg.size())
4659 {
4660 	if (!m_modulus.IsOdd())
4661 		throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4662 
4663 	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
4664 }
4665 
Multiply(const Integer & a,const Integer & b) const4666 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4667 {
4668 	word *const T = m_workspace.begin();
4669 	word *const R = m_result.reg.begin();
4670 	const size_t N = m_modulus.reg.size();
4671 	CRYPTOPP_ASSERT(a.reg.size()<=N && b.reg.size()<=N);
4672 
4673 	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4674 	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
4675 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4676 	return m_result;
4677 }
4678 
Square(const Integer & a) const4679 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4680 {
4681 	word *const T = m_workspace.begin();
4682 	word *const R = m_result.reg.begin();
4683 	const size_t N = m_modulus.reg.size();
4684 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4685 
4686 	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4687 	SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
4688 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4689 	return m_result;
4690 }
4691 
ConvertOut(const Integer & a) const4692 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4693 {
4694 	word *const T = m_workspace.begin();
4695 	word *const R = m_result.reg.begin();
4696 	const size_t N = m_modulus.reg.size();
4697 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4698 
4699 	CopyWords(T, a.reg, a.reg.size());
4700 	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4701 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4702 	return m_result;
4703 }
4704 
MultiplicativeInverse(const Integer & a) const4705 const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4706 {
4707 //	  return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
4708 	word *const T = m_workspace.begin();
4709 	word *const R = m_result.reg.begin();
4710 	const size_t N = m_modulus.reg.size();
4711 	CRYPTOPP_ASSERT(a.reg.size()<=N);
4712 
4713 	CopyWords(T, a.reg, a.reg.size());
4714 	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4715 	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4716 	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
4717 
4718 //	cout << "k=" << k << " N*32=" << 32*N << endl;
4719 
4720 	if (k>N*WORD_BITS)
4721 		DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
4722 	else
4723 		MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
4724 
4725 	return m_result;
4726 }
4727 
4728 // Specialization declared in misc.h to allow us to print integers
4729 //  with additional control options, like arbirary bases and uppercase.
4730 template <> CRYPTOPP_DLL
IntToString(Integer value,unsigned int base)4731 std::string IntToString<Integer>(Integer value, unsigned int base)
4732 {
4733 	// Hack... set the high bit for uppercase. Set the next bit fo a suffix.
4734 	static const unsigned int BIT_32 = (1U << 31);
4735 	const bool UPPER = !!(base & BIT_32);
4736 	static const unsigned int BIT_31 = (1U << 30);
4737 	const bool BASE = !!(base & BIT_31);
4738 
4739 	const char CH = UPPER ? 'A' : 'a';
4740 	base &= ~(BIT_32|BIT_31);
4741 	CRYPTOPP_ASSERT(base >= 2 && base <= 32);
4742 
4743 	if (value == 0)
4744 		return "0";
4745 
4746 	bool negative = false, zero = false;
4747 	if (value.IsNegative())
4748 	{
4749 		negative = true;
4750 		value.Negate();
4751 	}
4752 
4753 	if (!value)
4754 		zero = true;
4755 
4756 	SecBlock<char> s(value.BitCount() / (SaturatingSubtract1(BitPrecision(base),1U)) + 1);
4757 	Integer temp;
4758 
4759 	unsigned int i=0;
4760 	while (!!value)
4761 	{
4762 		word digit;
4763 		Integer::Divide(digit, temp, value, word(base));
4764 		s[i++]=char((digit < 10 ? '0' : (CH - 10)) + digit);
4765 		value.swap(temp);
4766 	}
4767 
4768 	std::string result;
4769 	result.reserve(i+2);
4770 
4771 	if (negative)
4772 		result += '-';
4773 
4774 	if (zero)
4775 		result += '0';
4776 
4777 	while (i--)
4778 		result += s[i];
4779 
4780 	if (BASE)
4781 	{
4782 		if (base == 10)
4783 			result += '.';
4784 		else if (base == 16)
4785 			result += 'h';
4786 		else if (base == 8)
4787 			result += 'o';
4788 		else if (base == 2)
4789 			result += 'b';
4790 	}
4791 
4792 	return result;
4793 }
4794 
4795 // Specialization declared in misc.h to avoid Coverity findings.
4796 template <> CRYPTOPP_DLL
IntToString(word64 value,unsigned int base)4797 std::string IntToString<word64>(word64 value, unsigned int base)
4798 {
4799 	// Hack... set the high bit for uppercase.
4800 	static const unsigned int HIGH_BIT = (1U << 31);
4801 	const char CH = !!(base & HIGH_BIT) ? 'A' : 'a';
4802 	base &= ~HIGH_BIT;
4803 
4804 	CRYPTOPP_ASSERT(base >= 2);
4805 	if (value == 0)
4806 		return "0";
4807 
4808 	std::string result;
4809 	while (value > 0)
4810 	{
4811 		word64 digit = value % base;
4812 		result = char((digit < 10 ? '0' : (CH - 10)) + digit) + result;
4813 		value /= base;
4814 	}
4815 	return result;
4816 }
4817 
4818 #ifndef CRYPTOPP_NO_ASSIGN_TO_INTEGER
4819 // Allow the linker to discard Integer code if not needed.
4820 // Also see http://github.com/weidai11/cryptopp/issues/389.
AssignIntToInteger(const std::type_info & valueType,void * pInteger,const void * pInt)4821 bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
4822 {
4823 	if (valueType != typeid(Integer))
4824 		return false;
4825 	*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
4826 	return true;
4827 }
4828 #endif  // CRYPTOPP_NO_ASSIGN_TO_INTEGER
4829 
4830 // *************************** C++ Static Initialization ***************************
4831 
4832 class InitInteger
4833 {
4834 public:
InitInteger()4835 	InitInteger()
4836 	{
4837 		SetFunctionPointers();
4838 	}
4839 };
4840 
4841 // This is not really needed because each Integer can dynamically initialize
4842 // itself, but we take a peephole optimization and initialize the class once
4843 // if init priorities are available. Dynamic initialization will be used if
4844 // init priorities are not available.
4845 
4846 #if defined(HAVE_GCC_INIT_PRIORITY)
4847 	const InitInteger s_init __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 10))) = InitInteger();
4848 	const Integer g_zero __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 11))) = Integer(0L);
4849 	const Integer g_one __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 12))) = Integer(1L);
4850 	const Integer g_two __attribute__ ((init_priority (CRYPTOPP_INIT_PRIORITY + 13))) = Integer(2L);
4851 #elif defined(HAVE_MSC_INIT_PRIORITY)
4852 	#pragma warning(disable: 4075)
4853 	#pragma init_seg(".CRT$XCU")
4854 	const InitInteger s_init;
4855 	const Integer g_zero(0L);
4856 	const Integer g_one(1L);
4857 	const Integer g_two(2L);
4858 	#pragma warning(default: 4075)
4859 #elif HAVE_XLC_INIT_PRIORITY
4860 	// XLC needs constant, not a define
4861 	#pragma priority(280)
4862 	const InitInteger s_init;
4863 	const Integer g_zero(0L);
4864 	const Integer g_one(1L);
4865 	const Integer g_two(2L);
4866 #else
4867 	const InitInteger s_init;
4868 #endif
4869 
4870 // ***************** Library code ********************
4871 
Zero()4872 const Integer &Integer::Zero()
4873 {
4874 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4875 	return g_zero;
4876 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4877 	static const Integer s_zero(0L);
4878 	return s_zero;
4879 #else  // Potential memory leak. Avoid if possible.
4880 	return Singleton<Integer, NewInteger<0L> >().Ref();
4881 #endif
4882 }
4883 
One()4884 const Integer &Integer::One()
4885 {
4886 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4887 	return g_one;
4888 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4889 	static const Integer s_one(1L);
4890 	return s_one;
4891 #else  // Potential memory leak. Avoid if possible.
4892 	return Singleton<Integer, NewInteger<1L> >().Ref();
4893 #endif
4894 }
4895 
Two()4896 const Integer &Integer::Two()
4897 {
4898 #if defined(HAVE_GCC_INIT_PRIORITY) || defined(HAVE_MSC_INIT_PRIORITY) || defined(HAVE_XLC_INIT_PRIORITY)
4899 	return g_two;
4900 #elif defined(CRYPTOPP_CXX11_STATIC_INIT)
4901 	static const Integer s_two(2L);
4902 	return s_two;
4903 #else  // Potential memory leak. Avoid if possible.
4904 	return Singleton<Integer, NewInteger<2L> >().Ref();
4905 #endif
4906 }
4907 
4908 NAMESPACE_END
4909 
4910 #endif  // CRYPTOPP_IMPORTS
4911