1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9            See the head notes in aes_armv4.S for copyright and license.
10 */
11 
12 /*
13 September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14 */
15 
16 /*
17 July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18 */
19 
20 /*
21 July 2010: Added support for AES-NI instructions via compiler intrinsics.
22 */
23 
24 /*
25 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27 and Peter Schwabe in their paper "New AES software speed records". The round
28 function was also modified to include a trick similar to one in Brian Gladman's
29 x86 assembly code, doing an 8-bit register move to minimize the number of
30 register spills. Also switched to compressed tables and copying round keys to
31 the stack.
32 
33 The C++ implementation uses compressed tables if
34 CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35 It is defined on x86 platforms by default but not on others.
36 */
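
/*
A minimal sketch of the counter mode caching idea described above, for
exposition only (table names and byte positions are placeholders, not the
exact layout used by the assembly). With a big-endian counter, consecutive
input blocks differ only in their last byte until a carry occurs, so the
first-round lookups driven by the unchanged bytes can be computed once and
reused; only the lookup keyed by the changing counter byte is redone per
block:

    word32 cached = T0[b_a] ^ T1[b_b] ^ T2[b_c] ^ rk[k];  // fixed per counter run
    for (size_t blk = 0; blk < blocks; blk++, ctrByte++)
        col = cached ^ T3[ctrByte];                        // only this varies

The assembly below extends the same idea into part of the second round.
*/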
37 
38 /*
39 July 2006: Defense against timing attacks was added by Wei Dai.
40 
41 The code now uses smaller tables in the first and last rounds,
42 and preloads them into L1 cache before usage (by loading at least
43 one element in each cache line).
44 
45 We try to delay subsequent accesses to each table (used in the first
46 and last rounds) until all of the table has been preloaded. Hopefully
47 the compiler isn't smart enough to optimize that code away.
48 
49 After preloading the table, we also try not to access any memory location
50 other than the table and the stack, in order to prevent table entries from
51 being unloaded from L1 cache, until that round is finished.
52 (Some popular CPUs have 2-way associative caches.)
53 */
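
/*
A minimal sketch of the preload, mirroring the loop used later in
ProcessAndXorBlock (the byte count differs with the table layout; see the code):

    volatile word32 vzero = 0;
    word32 u = vzero;                      // compiler cannot assume u == 0
    for (unsigned int i = 0; i < 1024; i += cacheLineSize)
        u &= *(const word32 *)(const void *)(((const byte *)Te) + i);
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;    // data dependency keeps the loads alive
*/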
54 
55 // This is the original introductory comment:
56 
57 /**
58  * version 3.0 (December 2000)
59  *
60  * Optimised ANSI C code for the Rijndael cipher (now AES)
61  *
62  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64  * author Paulo Barreto <paulo.barreto@terra.com.br>
65  *
66  * This code is hereby placed in the public domain.
67  *
68  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79  */
80 
81 #include "pch.h"
82 #include "config.h"
83 
84 #ifndef CRYPTOPP_IMPORTS
85 #ifndef CRYPTOPP_GENERATE_X64_MASM
86 
87 #include "rijndael.h"
88 #include "misc.h"
89 #include "cpu.h"
90 
91 // VS2017 and global optimization bug. TODO: figure out when
92 // we can re-enable full optimizations for VS2017. Also see
93 // https://github.com/weidai11/cryptopp/issues/649
94 #if (_MSC_VER >= 1910)
95 # ifndef CRYPTOPP_DEBUG
96 #  pragma optimize("", off)
97 #  pragma optimize("ts", on)
98 # endif
99 #endif
100 
101 NAMESPACE_BEGIN(CryptoPP)
102 
103 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
104 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
105 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
106 #endif
107 
108 // Clang intrinsic casts
109 #define M128I_CAST(x) ((__m128i *)(void *)(x))
110 #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
111 
112 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
113 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
114 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
115 using namespace rdtable;
116 # else
117 static word64 Te[256];
118 # endif
119 static word64 Td[256];
120 #else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
121 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
122 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
123 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
124 # endif
125 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
126 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
127 #endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
128 
129 static volatile bool s_TeFilled = false, s_TdFilled = false;
130 
131 ANONYMOUS_NAMESPACE_BEGIN
132 
133 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
134 
135 // Determine whether the range between begin and end overlaps
136 //   with the same 4k block offsets as the Te table. Logically,
137 //   the code is trying to create the condition:
138 //
139 // Two separate memory pages:
140 //
141 //  +-----+   +-----+
142 //  |XXXXX|   |YYYYY|
143 //  |XXXXX|   |YYYYY|
144 //  |     |   |     |
145 //  |     |   |     |
146 //  +-----+   +-----+
147 //  Te Table   Locals
148 //
149 // Have a logical cache view of (X and Y may be inverted):
150 //
151 // +-----+
152 // |XXXXX|
153 // |XXXXX|
154 // |YYYYY|
155 // |YYYYY|
156 // +-----+
157 //
158 static inline bool AliasedWithTable(const byte *begin, const byte *end)
159 {
160 	ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
161 	ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
162 	if (t1 > t0)
163 		return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
164 	else
165 		return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
166 }
167 
168 struct Locals
169 {
170 	word32 subkeys[4*12], workspace[8];
171 	const byte *inBlocks, *inXorBlocks, *outXorBlocks;
172 	byte *outBlocks;
173 	size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
174 	size_t regSpill, lengthAndCounterFlag, keysBegin;
175 };
176 
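// The allocation is one alias page plus one alias block larger than Locals
// so that AdvancedProcessBlocks can pick a 256-byte aligned start whose 4K
// page offsets do not collide with the Te table (see AliasedWithTable above).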
177 const size_t s_aliasPageSize = 4096;
178 const size_t s_aliasBlockSize = 256;
179 const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
180 
181 #endif  // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
182 
183 ANONYMOUS_NAMESPACE_END
184 
185 // ************************* Portable Code ************************************
186 
187 #define QUARTER_ROUND(L, T, t, a, b, c, d)	\
188 	a ^= L(T, 3, byte(t)); t >>= 8;\
189 	b ^= L(T, 2, byte(t)); t >>= 8;\
190 	c ^= L(T, 1, byte(t)); t >>= 8;\
191 	d ^= L(T, 0, t);
192 
193 #define QUARTER_ROUND_LE(t, a, b, c, d)	\
194 	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195 	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196 	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
197 	tempBlock[d] = ((byte *)(Te+t))[1];
198 
199 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
200 	#define QUARTER_ROUND_LD(t, a, b, c, d)	\
201 		tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202 		tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203 		tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
204 		tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
205 #else
206 	#define QUARTER_ROUND_LD(t, a, b, c, d)	\
207 		tempBlock[a] = Sd[byte(t)]; t >>= 8;\
208 		tempBlock[b] = Sd[byte(t)]; t >>= 8;\
209 		tempBlock[c] = Sd[byte(t)]; t >>= 8;\
210 		tempBlock[d] = Sd[t];
211 #endif
212 
213 #define QUARTER_ROUND_E(t, a, b, c, d)		QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
214 #define QUARTER_ROUND_D(t, a, b, c, d)		QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
215 
216 #if (CRYPTOPP_LITTLE_ENDIAN)
217 	#define QUARTER_ROUND_FE(t, a, b, c, d)		QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
218 	#define QUARTER_ROUND_FD(t, a, b, c, d)		QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
219 	#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
220 		#define TL_F(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
221 		#define TL_M(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
222 	#else
223 		#define TL_F(T, i, x)	rotrFixed(T[x], (3-i)*8)
224 		#define TL_M(T, i, x)	T[i*256 + x]
225 	#endif
226 #else
227 	#define QUARTER_ROUND_FE(t, a, b, c, d)		QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
228 	#define QUARTER_ROUND_FD(t, a, b, c, d)		QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
229 	#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
230 		#define TL_F(T, i, x)	(*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
231 		#define TL_M			TL_F
232 	#else
233 		#define TL_F(T, i, x)	rotrFixed(T[x], i*8)
234 		#define TL_M(T, i, x)	T[i*256 + x]
235 	#endif
236 #endif
237 
238 
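// Multiplication by small constants in GF(2^8) with the AES reduction
// polynomial x^8 + x^4 + x^3 + x + 1 (0x11b): f2 doubles with a conditional
// reduction, and the remaining multipliers are built from doublings and sums.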
239 #define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
240 #define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
241 #define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
242 
243 #define f3(x)   (f2(x) ^ x)
244 #define f9(x)   (f8(x) ^ x)
245 #define fb(x)   (f8(x) ^ f2(x) ^ x)
246 #define fd(x)   (f8(x) ^ f4(x) ^ x)
247 #define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
248 
249 unsigned int Rijndael::Base::OptimalDataAlignment() const
250 {
251 #if (CRYPTOPP_AESNI_AVAILABLE)
252 	if (HasAESNI())
253 		return 16;  // load __m128i
254 #endif
255 #if (CRYPTOPP_ARM_AES_AVAILABLE)
256 	if (HasAES())
257 		return 4;  // load uint32x4_t
258 #endif
259 #if (CRYPTOGAMS_ARM_AES)
260 	// Must use 1 here for Cryptogams AES. Also see
261 	// https://github.com/weidai11/cryptopp/issues/683
262 	if (HasARMv7())
263 		return 1;
264 #endif
265 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
266 	if (HasAES())
267 		return 16;  // load uint32x4_p
268 #endif
269 	return BlockTransformation::OptimalDataAlignment();
270 }
271 
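// Each Te entry combines the forward S-box output with a MixColumns column
// (2*S[i], S[i], S[i], 3*S[i]); Td is built analogously from the inverse
// S-box and the InvMixColumns multipliers (14, 9, 13, 11). When unaligned
// access is allowed, a single word64 entry exposes the needed rotations at
// different byte offsets; otherwise four rotated word32 copies are stored.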
272 void Rijndael::Base::FillEncTable()
273 {
274 	for (int i=0; i<256; i++)
275 	{
276 		byte x = Se[i];
277 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
278 		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
279 		Te[i] = word64(y | f3(x))<<32 | y;
280 #else
281 		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
282 		for (int j=0; j<4; j++)
283 		{
284 			Te[i+j*256] = y;
285 			y = rotrConstant<8>(y);
286 		}
287 #endif
288 	}
289 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
290 	Te[256] = Te[257] = 0;
291 #endif
292 	s_TeFilled = true;
293 }
294 
295 void Rijndael::Base::FillDecTable()
296 {
297 	for (int i=0; i<256; i++)
298 	{
299 		byte x = Sd[i];
300 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
301 		word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
302 		Td[i] = word64(y | fb(x))<<32 | y | x;
303 #else
304 		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
305 		for (int j=0; j<4; j++)
306 		{
307 			Td[i+j*256] = y;
308 			y = rotrConstant<8>(y);
309 		}
310 #endif
311 	}
312 	s_TdFilled = true;
313 }
314 
315 #if (CRYPTOPP_AESNI_AVAILABLE)
316 extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
317 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
318 
319 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
320         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
321 extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
322         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
323 #endif
324 
325 #if (CRYPTOPP_ARM_AES_AVAILABLE)
326 extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
327         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
328 extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
329         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
330 #endif
331 
332 #if (CRYPTOGAMS_ARM_AES)
333 extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
334 extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
335 extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
336 extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
337 #endif
338 
339 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
340 extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
341         word32* rk, const byte* Se);
342 
343 extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
344         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
345 extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
346         const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
347 #endif
348 
349 #if (CRYPTOGAMS_ARM_AES)
350 int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
351 {
352 	return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
353 }
354 int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
355 {
356 	return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
357 }
358 void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
359 {
360 	cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
361 	if (xorBlock)
362 		xorbuf (outBlock, xorBlock, 16);
363 }
364 void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
365 {
366 	cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
367 	if (xorBlock)
368 		xorbuf (outBlock, xorBlock, 16);
369 }
370 #endif
371 
372 std::string Rijndael::Base::AlgorithmProvider() const
373 {
374 #if (CRYPTOPP_AESNI_AVAILABLE)
375 	if (HasAESNI())
376 		return "AESNI";
377 #endif
378 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
379 	if (HasSSE2())
380 		return "SSE2";
381 #endif
382 #if (CRYPTOPP_ARM_AES_AVAILABLE)
383 	if (HasAES())
384 		return "ARMv8";
385 #endif
386 #if (CRYPTOGAMS_ARM_AES)
387 	if (HasARMv7())
388 		return "ARMv7";
389 #endif
390 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
391 	if (HasAES())
392 		return "Power8";
393 #endif
394 	return "C++";
395 }
396 
397 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
398 {
399 	AssertValidKeyLength(keyLen);
400 
401 #if (CRYPTOGAMS_ARM_AES)
402 	if (HasARMv7())
403 	{
404 		m_rounds = keyLen/4 + 6;
405 		m_key.New(4*(14+1)+4);
406 
407 		if (IsForwardTransformation())
408 			CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
409 		else
410 			CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
411 		return;
412 	}
413 #endif
414 
415 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
416 	m_aliasBlock.New(s_sizeToAllocate);
417 	// The alias block is only used on IA-32 when unaligned data access is in effect.
418 	// Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
419 	m_aliasBlock.SetMark(0);
420 #endif
421 
422 	m_rounds = keyLen/4 + 6;
423 	m_key.New(4*(m_rounds+1));
424 	word32 *rk = m_key;
425 
426 #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
427 	// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
428 	if (HasAESNI() && HasSSE41())
429 	{
430 		// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
431 		//  Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
432 		Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
433 		if (!IsForwardTransformation())
434 			Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
435 
436 		return;
437 	}
438 #endif
439 
440 #if CRYPTOPP_POWER8_AES_AVAILABLE
441 	if (HasAES())
442 	{
443 		// We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
444 		// The IBM docs on AES suck. Intel's docs on AESNI put IBM's to shame.
445 		Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
446 		return;
447 	}
448 #endif
449 
450 	GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
451 	const word32 *rc = rcon;
452 	word32 temp;
453 
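	// Standard AES key expansion: each block of keyLen/4 round key words is
	// derived from the previous one. The first word of a block mixes in
	// SubWord(RotWord(previous word)) and the round constant; AES-256 also
	// applies SubWord in the middle of each block (the keyLen == 32 case).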
454 	while (true)
455 	{
456 		temp  = rk[keyLen/4-1];
457 		word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
458 					(word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
459 		rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
460 		rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
461 		rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
462 		rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
463 
464 		if (rk + keyLen/4 + 4 == m_key.end())
465 			break;
466 
467 		if (keyLen == 24)
468 		{
469 			rk[10] = rk[ 4] ^ rk[ 9];
470 			rk[11] = rk[ 5] ^ rk[10];
471 		}
472 		else if (keyLen == 32)
473 		{
474     		temp = rk[11];
475     		rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
476     		rk[13] = rk[ 5] ^ rk[12];
477     		rk[14] = rk[ 6] ^ rk[13];
478     		rk[15] = rk[ 7] ^ rk[14];
479 		}
480 		rk += keyLen/4;
481 	}
482 
483 	rk = m_key;
484 
485 	if (IsForwardTransformation())
486 	{
487 		if (!s_TeFilled)
488 			FillEncTable();
489 
490 		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
491 		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
492 	}
493 	else
494 	{
495 		if (!s_TdFilled)
496 			FillDecTable();
497 
498 		#define InverseMixColumn(x) \
499 			TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
500 			TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
501 
502 		unsigned int i, j;
503 		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
504 		{
505 			temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
506 			temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
507 			temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
508 			temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
509 		}
510 
511 		rk[i+0] = InverseMixColumn(rk[i+0]);
512 		rk[i+1] = InverseMixColumn(rk[i+1]);
513 		rk[i+2] = InverseMixColumn(rk[i+2]);
514 		rk[i+3] = InverseMixColumn(rk[i+3]);
515 
516 		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
517 		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
518 		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
519 		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
520 	}
521 
522 #if CRYPTOPP_AESNI_AVAILABLE
523 	if (HasAESNI())
524 		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
525 #endif
526 #if CRYPTOPP_ARM_AES_AVAILABLE
527 	if (HasAES())
528 		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
529 #endif
530 }
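
/*
A minimal usage sketch of the public interface that reaches UncheckedSetKey
above and ProcessAndXorBlock below (assumes the standard Crypto++ header
aes.h; key generation and error handling omitted):

    #include "aes.h"

    CryptoPP::SecByteBlock key(CryptoPP::AES::DEFAULT_KEYLENGTH);
    // ... fill key with keying material ...
    CryptoPP::AES::Encryption enc(key, key.size());  // calls UncheckedSetKey
    enc.ProcessBlock(inBlock, outBlock);             // calls ProcessAndXorBlock
*/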
531 
532 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
533 {
534 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
535 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
536 	if (HasSSE2())
537 # else
538 	if (HasAESNI())
539 # endif
540 	{
541 		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
542 		return;
543 	}
544 #endif
545 
546 #if (CRYPTOPP_ARM_AES_AVAILABLE)
547 	if (HasAES())
548 	{
549 		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
550 		return;
551 	}
552 #endif
553 
554 #if (CRYPTOGAMS_ARM_AES)
555 	if (HasARMv7())
556 	{
557 		CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
558 		return;
559 	}
560 #endif
561 
562 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
563 	if (HasAES())
564 	{
565 		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
566 		return;
567 	}
568 #endif
569 
570 	typedef BlockGetAndPut<word32, NativeByteOrder> Block;
571 
572 	word32 s0, s1, s2, s3, t0, t1, t2, t3;
573 	Block::Get(inBlock)(s0)(s1)(s2)(s3);
574 
575 	const word32 *rk = m_key;
576 	s0 ^= rk[0];
577 	s1 ^= rk[1];
578 	s2 ^= rk[2];
579 	s3 ^= rk[3];
580 	t0 = rk[4];
581 	t1 = rk[5];
582 	t2 = rk[6];
583 	t3 = rk[7];
584 	rk += 8;
585 
586 	// timing attack countermeasure. see comments at top for more details.
587 	// also see http://github.com/weidai11/cryptopp/issues/146
588 	const int cacheLineSize = GetCacheLineSize();
589 	unsigned int i;
590 	volatile word32 _u = 0;
591 	word32 u = _u;
592 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
593 	for (i=0; i<2048; i+=cacheLineSize)
594 #else
595 	for (i=0; i<1024; i+=cacheLineSize)
596 #endif
597 		u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
598 	u &= Te[255];
599 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
600 
601 	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
602 	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
603 	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
604 	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
605 
606 	// Nr - 2 full rounds:
607 	unsigned int r = m_rounds/2 - 1;
608 	do
609 	{
610 		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
611 
612 		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
613 		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
614 		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
615 		QUARTER_ROUND_E(t0, s1, s2, s3, s0)
616 
617 		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
618 
619 		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
620 		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
621 		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
622 		QUARTER_ROUND_E(s0, t1, t2, t3, t0)
623 
624 		rk += 8;
625 	} while (--r);
626 
627 	word32 tbw[4];
628 	byte *const tempBlock = (byte *)tbw;
629 
630 	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
631 	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
632 	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
633 	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
634 
635 	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
636 }
637 
638 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
639 {
640 #if CRYPTOPP_AESNI_AVAILABLE
641 	if (HasAESNI())
642 	{
643 		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
644 		return;
645 	}
646 #endif
647 
648 #if (CRYPTOPP_ARM_AES_AVAILABLE)
649 	if (HasAES())
650 	{
651 		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
652 		return;
653 	}
654 #endif
655 
656 #if (CRYPTOGAMS_ARM_AES)
657 	if (HasARMv7())
658 	{
659 		CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
660 		return;
661 	}
662 #endif
663 
664 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
665 	if (HasAES())
666 	{
667 		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
668 		return;
669 	}
670 #endif
671 
672 	typedef BlockGetAndPut<word32, NativeByteOrder> Block;
673 
674 	word32 s0, s1, s2, s3, t0, t1, t2, t3;
675 	Block::Get(inBlock)(s0)(s1)(s2)(s3);
676 
677 	const word32 *rk = m_key;
678 	s0 ^= rk[0];
679 	s1 ^= rk[1];
680 	s2 ^= rk[2];
681 	s3 ^= rk[3];
682 	t0 = rk[4];
683 	t1 = rk[5];
684 	t2 = rk[6];
685 	t3 = rk[7];
686 	rk += 8;
687 
688 	// timing attack countermeasure. see comments at top for more details.
689 	// also see http://github.com/weidai11/cryptopp/issues/146
690 	const int cacheLineSize = GetCacheLineSize();
691 	unsigned int i;
692 	volatile word32 _u = 0;
693 	word32 u = _u;
694 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
695 	for (i=0; i<2048; i+=cacheLineSize)
696 #else
697 	for (i=0; i<1024; i+=cacheLineSize)
698 #endif
699 		u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
700 	u &= Td[255];
701 	s0 |= u; s1 |= u; s2 |= u; s3 |= u;
702 
703 	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
704 	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
705 	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
706 	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
707 
708 	// Nr - 2 full rounds:
709 	unsigned int r = m_rounds/2 - 1;
710 	do
711 	{
712 		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
713 
714 		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
715 		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
716 		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
717 		QUARTER_ROUND_D(t0, s3, s2, s1, s0)
718 
719 		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
720 
721 		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
722 		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
723 		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
724 		QUARTER_ROUND_D(s0, t3, t2, t1, t0)
725 
726 		rk += 8;
727 	} while (--r);
728 
729 #if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
730 	// timing attack countermeasure. see comments at top for more details
731 	// If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
732 	// QUARTER_ROUND_LD will use Td, which is already preloaded.
733 	u = _u;
734 	for (i=0; i<256; i+=cacheLineSize)
735 		u &= *(const word32 *)(const void *)(Sd+i);
736 	u &= *(const word32 *)(const void *)(Sd+252);
737 	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
738 #endif
739 
740 	word32 tbw[4];
741 	byte *const tempBlock = (byte *)tbw;
742 
743 	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
744 	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
745 	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
746 	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
747 
748 	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
749 }
750 
751 // ************************* Assembly Code ************************************
752 
753 #if CRYPTOPP_MSC_VERSION
754 # pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
755 #endif
756 
757 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
758 
759 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
760 
761 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
762 {
763 	CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
764 
765 #if CRYPTOPP_BOOL_X86
766 
767 #define L_REG			esp
768 #define L_INDEX(i)		(L_REG+768+i)
769 #define L_INXORBLOCKS	L_INBLOCKS+4
770 #define L_OUTXORBLOCKS	L_INBLOCKS+8
771 #define L_OUTBLOCKS		L_INBLOCKS+12
772 #define L_INCREMENTS	L_INDEX(16*15)
773 #define L_SP			L_INDEX(16*16)
774 #define L_LENGTH		L_INDEX(16*16+4)
775 #define L_KEYS_BEGIN	L_INDEX(16*16+8)
776 
777 #define MOVD			movd
778 #define MM(i)			mm##i
779 
780 #define MXOR(a,b,c)	\
781 	AS2(	movzx	esi, b)\
782 	AS2(	movd	mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
783 	AS2(	pxor	MM(a), mm7)\
784 
785 #define MMOV(a,b,c)	\
786 	AS2(	movzx	esi, b)\
787 	AS2(	movd	MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
788 
789 #else
790 
791 #define L_REG			r8
792 #define L_INDEX(i)		(L_REG+i)
793 #define L_INXORBLOCKS	L_INBLOCKS+8
794 #define L_OUTXORBLOCKS	L_INBLOCKS+16
795 #define L_OUTBLOCKS		L_INBLOCKS+24
796 #define L_INCREMENTS	L_INDEX(16*16)
797 #define L_LENGTH		L_INDEX(16*18+8)
798 #define L_KEYS_BEGIN	L_INDEX(16*19)
799 
800 #define MOVD			mov
801 #define MM_0			r9d
802 #define MM_1			r12d
803 #ifdef __GNUC__
804 #define MM_2			r11d
805 #else
806 #define MM_2			r10d
807 #endif
808 #define MM(i)			MM_##i
809 
810 #define MXOR(a,b,c)	\
811 	AS2(	movzx	esi, b)\
812 	AS2(	xor		MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
813 
814 #define MMOV(a,b,c)	\
815 	AS2(	movzx	esi, b)\
816 	AS2(	mov		MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
817 
818 #endif
819 
820 #define L_SUBKEYS		L_INDEX(0)
821 #define L_SAVED_X		L_SUBKEYS
822 #define L_KEY12			L_INDEX(16*12)
823 #define L_LASTROUND		L_INDEX(16*13)
824 #define L_INBLOCKS		L_INDEX(16*14)
825 #define MAP0TO4(i)		(ASM_MOD(i+3,4)+1)
826 
827 #define XOR(a,b,c)	\
828 	AS2(	movzx	esi, b)\
829 	AS2(	xor		a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
830 
831 #define MOV(a,b,c)	\
832 	AS2(	movzx	esi, b)\
833 	AS2(	mov		a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
834 
835 #ifdef CRYPTOPP_GENERATE_X64_MASM
836 		ALIGN   8
837 	Rijndael_Enc_AdvancedProcessBlocks	PROC FRAME
838 		rex_push_reg rsi
839 		push_reg rdi
840 		push_reg rbx
841 		push_reg r12
842 		.endprolog
843 		mov L_REG, rcx
844 		mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
845 		mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
846 #elif defined(__GNUC__)
847 	__asm__ __volatile__
848 	(
849 	INTEL_NOPREFIX
850 	#if CRYPTOPP_BOOL_X64
851 	AS2(	mov		L_REG, rcx)
852 	#endif
853 	AS_PUSH_IF86(bx)
854 	AS_PUSH_IF86(bp)
855 	AS2(	mov		AS_REG_7, WORD_REG(si))
856 #else
857 	AS_PUSH_IF86(si)
858 	AS_PUSH_IF86(di)
859 	AS_PUSH_IF86(bx)
860 	AS_PUSH_IF86(bp)
861 	AS2(	lea		AS_REG_7, [Te])
862 	AS2(	mov		edi, [g_cacheLineSize])
863 #endif
864 
865 #if CRYPTOPP_BOOL_X86
866 	AS2(	mov		[ecx+16*12+16*4], esp)	// save esp to L_SP
867 	AS2(	lea		esp, [ecx-768])
868 #endif
869 
870 	// copy subkeys to stack
871 	AS2(	mov		WORD_REG(si), [L_KEYS_BEGIN])
872 	AS2(	mov		WORD_REG(ax), 16)
873 	AS2(	and		WORD_REG(ax), WORD_REG(si))
874 	AS2(	movdqa	xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])	// subkey 1 (non-counter) or 2 (counter)
875 	AS2(	movdqa	[L_KEY12], xmm3)
876 	AS2(	lea		WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
877 	AS2(	sub		WORD_REG(ax), WORD_REG(si))
878 	ASL(0)
879 	AS2(	movdqa	xmm0, [WORD_REG(ax)+WORD_REG(si)])
880 	AS2(	movdqa	XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
881 	AS2(	add		WORD_REG(si), 16)
882 	AS2(	cmp		WORD_REG(si), 16*12)
883 	ATT_NOPREFIX
884 	ASJ(	jl,		0, b)
885 	INTEL_NOPREFIX
886 
887 	// read subkeys 0, 1 and last
888 	AS2(	movdqa	xmm4, [WORD_REG(ax)+WORD_REG(si)])	// last subkey
889 	AS2(	movdqa	xmm1, [WORD_REG(dx)])			// subkey 0
890 	AS2(	MOVD	MM(1), [WORD_REG(dx)+4*4])		// 0,1,2,3
891 	AS2(	mov		ebx, [WORD_REG(dx)+5*4])		// 4,5,6,7
892 	AS2(	mov		ecx, [WORD_REG(dx)+6*4])		// 8,9,10,11
893 	AS2(	mov		edx, [WORD_REG(dx)+7*4])		// 12,13,14,15
894 
895 	// load table into cache
896 	AS2(	xor		WORD_REG(ax), WORD_REG(ax))
897 	ASL(9)
898 	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
899 	AS2(	add		WORD_REG(ax), WORD_REG(di))
900 	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
901 	AS2(	add		WORD_REG(ax), WORD_REG(di))
902 	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
903 	AS2(	add		WORD_REG(ax), WORD_REG(di))
904 	AS2(	mov		esi, [AS_REG_7+WORD_REG(ax)])
905 	AS2(	add		WORD_REG(ax), WORD_REG(di))
906 	AS2(	cmp		WORD_REG(ax), 2048)
907 	ATT_NOPREFIX
908 	ASJ(	jl,		9, b)
909 	INTEL_NOPREFIX
910 	AS1(	lfence)
911 
912 	AS2(	test	DWORD PTR [L_LENGTH], 1)
913 	ATT_NOPREFIX
914 	ASJ(	jz,		8, f)
915 	INTEL_NOPREFIX
916 
917 	// counter mode one-time setup
918 	AS2(	mov		WORD_REG(si), [L_INBLOCKS])
919 	AS2(	movdqu	xmm2, [WORD_REG(si)])	// counter
920 	AS2(	pxor	xmm2, xmm1)
921 	AS2(	psrldq	xmm1, 14)
922 	AS2(	movd	eax, xmm1)
923 	AS2(	mov		al, BYTE PTR [WORD_REG(si)+15])
924 	AS2(	MOVD	MM(2), eax)
925 #if CRYPTOPP_BOOL_X86
926 	AS2(	mov		eax, 1)
927 	AS2(	movd	mm3, eax)
928 #endif
929 
930 	// partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
931 	AS2(	movd	eax, xmm2)
932 	AS2(	psrldq	xmm2, 4)
933 	AS2(	movd	edi, xmm2)
934 	AS2(	psrldq	xmm2, 4)
935 		MXOR(		1, al, 0)		// 0
936 		XOR(		edx, ah, 1)		// 1
937 	AS2(	shr		eax, 16)
938 		XOR(		ecx, al, 2)		// 2
939 		XOR(		ebx, ah, 3)		// 3
940 	AS2(	mov		eax, edi)
941 	AS2(	movd	edi, xmm2)
942 	AS2(	psrldq	xmm2, 4)
943 		XOR(		ebx, al, 0)		// 4
944 		MXOR(		1, ah, 1)		// 5
945 	AS2(	shr		eax, 16)
946 		XOR(		edx, al, 2)		// 6
947 		XOR(		ecx, ah, 3)		// 7
948 	AS2(	mov		eax, edi)
949 	AS2(	movd	edi, xmm2)
950 		XOR(		ecx, al, 0)		// 8
951 		XOR(		ebx, ah, 1)		// 9
952 	AS2(	shr		eax, 16)
953 		MXOR(		1, al, 2)		// 10
954 		XOR(		edx, ah, 3)		// 11
955 	AS2(	mov		eax, edi)
956 		XOR(		edx, al, 0)		// 12
957 		XOR(		ecx, ah, 1)		// 13
958 	AS2(	shr		eax, 16)
959 		XOR(		ebx, al, 2)		// 14
960 	AS2(	psrldq	xmm2, 3)
961 
962 	// partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
963 	AS2(	mov		eax, [L_KEY12+0*4])
964 	AS2(	mov		edi, [L_KEY12+2*4])
965 	AS2(	MOVD	MM(0), [L_KEY12+3*4])
966 		MXOR(	0, cl, 3)	/* 11 */
967 		XOR(	edi, bl, 3)	/* 7 */
968 		MXOR(	0, bh, 2)	/* 6 */
969 	AS2(	shr ebx, 16)	/* 4,5 */
970 		XOR(	eax, bl, 1)	/* 5 */
971 		MOV(	ebx, bh, 0)	/* 4 */
972 	AS2(	xor		ebx, [L_KEY12+1*4])
973 		XOR(	eax, ch, 2)	/* 10 */
974 	AS2(	shr ecx, 16)	/* 8,9 */
975 		XOR(	eax, dl, 3)	/* 15 */
976 		XOR(	ebx, dh, 2)	/* 14 */
977 	AS2(	shr edx, 16)	/* 12,13 */
978 		XOR(	edi, ch, 0)	/* 8 */
979 		XOR(	ebx, cl, 1)	/* 9 */
980 		XOR(	edi, dl, 1)	/* 13 */
981 		MXOR(	0, dh, 0)	/* 12 */
982 
983 	AS2(	movd	ecx, xmm2)
984 	AS2(	MOVD	edx, MM(1))
985 	AS2(	MOVD	[L_SAVED_X+3*4], MM(0))
986 	AS2(	mov		[L_SAVED_X+0*4], eax)
987 	AS2(	mov		[L_SAVED_X+1*4], ebx)
988 	AS2(	mov		[L_SAVED_X+2*4], edi)
989 	ATT_NOPREFIX
990 	ASJ(	jmp,	5, f)
991 	INTEL_NOPREFIX
992 	ASL(3)
993 	// non-counter mode per-block setup
994 	AS2(	MOVD	MM(1), [L_KEY12+0*4])	// 0,1,2,3
995 	AS2(	mov		ebx, [L_KEY12+1*4])		// 4,5,6,7
996 	AS2(	mov		ecx, [L_KEY12+2*4])		// 8,9,10,11
997 	AS2(	mov		edx, [L_KEY12+3*4])		// 12,13,14,15
998 	ASL(8)
999 	AS2(	mov		WORD_REG(ax), [L_INBLOCKS])
1000 	AS2(	movdqu	xmm2, [WORD_REG(ax)])
1001 	AS2(	mov		WORD_REG(si), [L_INXORBLOCKS])
1002 	AS2(	movdqu	xmm5, [WORD_REG(si)])
1003 	AS2(	pxor	xmm2, xmm1)
1004 	AS2(	pxor	xmm2, xmm5)
1005 
1006 	// first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1007 	AS2(	movd	eax, xmm2)
1008 	AS2(	psrldq	xmm2, 4)
1009 	AS2(	movd	edi, xmm2)
1010 	AS2(	psrldq	xmm2, 4)
1011 		MXOR(		1, al, 0)		// 0
1012 		XOR(		edx, ah, 1)		// 1
1013 	AS2(	shr		eax, 16)
1014 		XOR(		ecx, al, 2)		// 2
1015 		XOR(		ebx, ah, 3)		// 3
1016 	AS2(	mov		eax, edi)
1017 	AS2(	movd	edi, xmm2)
1018 	AS2(	psrldq	xmm2, 4)
1019 		XOR(		ebx, al, 0)		// 4
1020 		MXOR(		1, ah, 1)		// 5
1021 	AS2(	shr		eax, 16)
1022 		XOR(		edx, al, 2)		// 6
1023 		XOR(		ecx, ah, 3)		// 7
1024 	AS2(	mov		eax, edi)
1025 	AS2(	movd	edi, xmm2)
1026 		XOR(		ecx, al, 0)		// 8
1027 		XOR(		ebx, ah, 1)		// 9
1028 	AS2(	shr		eax, 16)
1029 		MXOR(		1, al, 2)		// 10
1030 		XOR(		edx, ah, 3)		// 11
1031 	AS2(	mov		eax, edi)
1032 		XOR(		edx, al, 0)		// 12
1033 		XOR(		ecx, ah, 1)		// 13
1034 	AS2(	shr		eax, 16)
1035 		XOR(		ebx, al, 2)		// 14
1036 		MXOR(		1, ah, 3)		// 15
1037 	AS2(	MOVD	eax, MM(1))
1038 
1039 	AS2(	add		L_REG, [L_KEYS_BEGIN])
1040 	AS2(	add		L_REG, 4*16)
1041 	ATT_NOPREFIX
1042 	ASJ(	jmp,	2, f)
1043 	INTEL_NOPREFIX
1044 	ASL(1)
1045 	// counter-mode per-block setup
1046 	AS2(	MOVD	ecx, MM(2))
1047 	AS2(	MOVD	edx, MM(1))
1048 	AS2(	mov		eax, [L_SAVED_X+0*4])
1049 	AS2(	mov		ebx, [L_SAVED_X+1*4])
1050 	AS2(	xor		cl, ch)
1051 	AS2(	and		WORD_REG(cx), 255)
1052 	ASL(5)
1053 #if CRYPTOPP_BOOL_X86
1054 	AS2(	paddb	MM(2), mm3)
1055 #else
1056 	AS2(	add		MM(2), 1)
1057 #endif
1058 	// remaining part of second round, in: edx(previous round), esi(keyed counter byte), eax, ebx, [L_SAVED_X+2*4], [L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1059 	AS2(	xor		edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1060 		XOR(		ebx, dl, 3)
1061 		MOV(		ecx, dh, 2)
1062 	AS2(	shr		edx, 16)
1063 	AS2(	xor		ecx, [L_SAVED_X+2*4])
1064 		XOR(		eax, dh, 0)
1065 		MOV(		edx, dl, 1)
1066 	AS2(	xor		edx, [L_SAVED_X+3*4])
1067 
1068 	AS2(	add		L_REG, [L_KEYS_BEGIN])
1069 	AS2(	add		L_REG, 3*16)
1070 	ATT_NOPREFIX
1071 	ASJ(	jmp,	4, f)
1072 	INTEL_NOPREFIX
1073 
1074 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1075 // out: eax, ebx, edi, mm0
1076 #define ROUND()		\
1077 		MXOR(	0, cl, 3)	/* 11 */\
1078 	AS2(	mov	cl, al)		/* 8,9,10,3 */\
1079 		XOR(	edi, ah, 2)	/* 2 */\
1080 	AS2(	shr eax, 16)	/* 0,1 */\
1081 		XOR(	edi, bl, 3)	/* 7 */\
1082 		MXOR(	0, bh, 2)	/* 6 */\
1083 	AS2(	shr ebx, 16)	/* 4,5 */\
1084 		MXOR(	0, al, 1)	/* 1 */\
1085 		MOV(	eax, ah, 0)	/* 0 */\
1086 		XOR(	eax, bl, 1)	/* 5 */\
1087 		MOV(	ebx, bh, 0)	/* 4 */\
1088 		XOR(	eax, ch, 2)	/* 10 */\
1089 		XOR(	ebx, cl, 3)	/* 3 */\
1090 	AS2(	shr ecx, 16)	/* 8,9 */\
1091 		XOR(	eax, dl, 3)	/* 15 */\
1092 		XOR(	ebx, dh, 2)	/* 14 */\
1093 	AS2(	shr edx, 16)	/* 12,13 */\
1094 		XOR(	edi, ch, 0)	/* 8 */\
1095 		XOR(	ebx, cl, 1)	/* 9 */\
1096 		XOR(	edi, dl, 1)	/* 13 */\
1097 		MXOR(	0, dh, 0)	/* 12 */\
1098 
1099 	ASL(2)	// 2-round loop
1100 	AS2(	MOVD	MM(0), [L_SUBKEYS-4*16+3*4])
1101 	AS2(	mov		edi, [L_SUBKEYS-4*16+2*4])
1102 	ROUND()
1103 	AS2(	mov		ecx, edi)
1104 	AS2(	xor		eax, [L_SUBKEYS-4*16+0*4])
1105 	AS2(	xor		ebx, [L_SUBKEYS-4*16+1*4])
1106 	AS2(	MOVD	edx, MM(0))
1107 
1108 	ASL(4)
1109 	AS2(	MOVD	MM(0), [L_SUBKEYS-4*16+7*4])
1110 	AS2(	mov		edi, [L_SUBKEYS-4*16+6*4])
1111 	ROUND()
1112 	AS2(	mov		ecx, edi)
1113 	AS2(	xor		eax, [L_SUBKEYS-4*16+4*4])
1114 	AS2(	xor		ebx, [L_SUBKEYS-4*16+5*4])
1115 	AS2(	MOVD	edx, MM(0))
1116 
1117 	AS2(	add		L_REG, 32)
1118 	AS2(	test	L_REG, 255)
1119 	ATT_NOPREFIX
1120 	ASJ(	jnz,	2, b)
1121 	INTEL_NOPREFIX
1122 	AS2(	sub		L_REG, 16*16)
1123 
1124 #define LAST(a, b, c)												\
1125 	AS2(	movzx	esi, a											)\
1126 	AS2(	movzx	edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1]	)\
1127 	AS2(	movzx	esi, b											)\
1128 	AS2(	xor		edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0]	)\
1129 	AS2(	mov		WORD PTR [L_LASTROUND+c], di					)\
1130 
1131 	// last round
1132 	LAST(ch, dl, 2)
1133 	LAST(dh, al, 6)
1134 	AS2(	shr		edx, 16)
1135 	LAST(ah, bl, 10)
1136 	AS2(	shr		eax, 16)
1137 	LAST(bh, cl, 14)
1138 	AS2(	shr		ebx, 16)
1139 	LAST(dh, al, 12)
1140 	AS2(	shr		ecx, 16)
1141 	LAST(ah, bl, 0)
1142 	LAST(bh, cl, 4)
1143 	LAST(ch, dl, 8)
1144 
1145 	AS2(	mov		WORD_REG(ax), [L_OUTXORBLOCKS])
1146 	AS2(	mov		WORD_REG(bx), [L_OUTBLOCKS])
1147 
1148 	AS2(	mov		WORD_REG(cx), [L_LENGTH])
1149 	AS2(	sub		WORD_REG(cx), 16)
1150 
1151 	AS2(	movdqu	xmm2, [WORD_REG(ax)])
1152 	AS2(	pxor	xmm2, xmm4)
1153 
1154 #if CRYPTOPP_BOOL_X86
1155 	AS2(	movdqa	xmm0, [L_INCREMENTS])
1156 	AS2(	paddd	xmm0, [L_INBLOCKS])
1157 	AS2(	movdqa	[L_INBLOCKS], xmm0)
1158 #else
1159 	AS2(	movdqa	xmm0, [L_INCREMENTS+16])
1160 	AS2(	paddq	xmm0, [L_INBLOCKS+16])
1161 	AS2(	movdqa	[L_INBLOCKS+16], xmm0)
1162 #endif
1163 
1164 	AS2(	pxor	xmm2, [L_LASTROUND])
1165 	AS2(	movdqu	[WORD_REG(bx)], xmm2)
1166 
1167 	ATT_NOPREFIX
1168 	ASJ(	jle,	7, f)
1169 	INTEL_NOPREFIX
1170 	AS2(	mov		[L_LENGTH], WORD_REG(cx))
1171 	AS2(	test	WORD_REG(cx), 1)
1172 	ATT_NOPREFIX
1173 	ASJ(	jnz,	1, b)
1174 	INTEL_NOPREFIX
1175 #if CRYPTOPP_BOOL_X64
1176 	AS2(	movdqa	xmm0, [L_INCREMENTS])
1177 	AS2(	paddq	xmm0, [L_INBLOCKS])
1178 	AS2(	movdqa	[L_INBLOCKS], xmm0)
1179 #endif
1180 	ATT_NOPREFIX
1181 	ASJ(	jmp,	3, b)
1182 	INTEL_NOPREFIX
1183 
1184 	ASL(7)
1185 	// erase keys on stack
1186 	AS2(	xorps	xmm0, xmm0)
1187 	AS2(	lea		WORD_REG(ax), [L_SUBKEYS+7*16])
1188 	AS2(	movaps	[WORD_REG(ax)-7*16], xmm0)
1189 	AS2(	movaps	[WORD_REG(ax)-6*16], xmm0)
1190 	AS2(	movaps	[WORD_REG(ax)-5*16], xmm0)
1191 	AS2(	movaps	[WORD_REG(ax)-4*16], xmm0)
1192 	AS2(	movaps	[WORD_REG(ax)-3*16], xmm0)
1193 	AS2(	movaps	[WORD_REG(ax)-2*16], xmm0)
1194 	AS2(	movaps	[WORD_REG(ax)-1*16], xmm0)
1195 	AS2(	movaps	[WORD_REG(ax)+0*16], xmm0)
1196 	AS2(	movaps	[WORD_REG(ax)+1*16], xmm0)
1197 	AS2(	movaps	[WORD_REG(ax)+2*16], xmm0)
1198 	AS2(	movaps	[WORD_REG(ax)+3*16], xmm0)
1199 	AS2(	movaps	[WORD_REG(ax)+4*16], xmm0)
1200 	AS2(	movaps	[WORD_REG(ax)+5*16], xmm0)
1201 	AS2(	movaps	[WORD_REG(ax)+6*16], xmm0)
1202 #if CRYPTOPP_BOOL_X86
1203 	AS2(	mov		esp, [L_SP])
1204 	AS1(	emms)
1205 #endif
1206 	AS_POP_IF86(bp)
1207 	AS_POP_IF86(bx)
1208 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1209 	AS_POP_IF86(di)
1210 	AS_POP_IF86(si)
1211 	AS1(ret)
1212 #endif
1213 #ifdef CRYPTOPP_GENERATE_X64_MASM
1214 	pop r12
1215 	pop rbx
1216 	pop rdi
1217 	pop rsi
1218 	ret
1219 	Rijndael_Enc_AdvancedProcessBlocks ENDP
1220 #endif
1221 #ifdef __GNUC__
1222 	ATT_PREFIX
1223 	:
1224 	: "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1225 	: "memory", "cc", "%eax"
1226 	#if CRYPTOPP_BOOL_X64
1227 		, "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1228 	#endif
1229 	);
1230 #endif
1231 }
1232 
1233 #endif
1234 
1235 #ifndef CRYPTOPP_GENERATE_X64_MASM
1236 
1237 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1238 extern "C" {
1239 void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1240 }
1241 #endif
1242 
1243 #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1244 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1245 {
1246 #if CRYPTOPP_AESNI_AVAILABLE
1247 	if (HasAESNI())
1248 		return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1249 #endif
1250 #if CRYPTOPP_ARM_AES_AVAILABLE
1251 	if (HasAES())
1252 		return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1253 #endif
1254 #if CRYPTOPP_POWER8_AES_AVAILABLE
1255 	if (HasAES())
1256 		return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1257 #endif
1258 
1259 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1260 	if (HasSSE2())
1261 	{
1262 		if (length < BLOCKSIZE)
1263 			return length;
1264 
1265 		static const byte *zeros = (const byte*)(Te+256);
1266 		m_aliasBlock.SetMark(m_aliasBlock.size());
1267 		byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1268 
1269 		// round up to nearest 256 byte boundary
1270 		space = originalSpace +	(s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1271 		while (AliasedWithTable(space, space + sizeof(Locals)))
1272 		{
1273 			space += 256;
1274 			CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1275 		}
1276 
1277 		size_t increment = BLOCKSIZE;
1278 		if (flags & BT_ReverseDirection)
1279 		{
1280 			CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1281 			inBlocks += length - BLOCKSIZE;
1282 			xorBlocks += length - BLOCKSIZE;
1283 			outBlocks += length - BLOCKSIZE;
1284 			increment = 0-increment;
1285 		}
1286 
1287 		Locals &locals = *(Locals *)(void *)space;
1288 
1289 		locals.inBlocks = inBlocks;
1290 		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1291 		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1292 		locals.outBlocks = outBlocks;
1293 
1294 		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1295 		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1296 		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1297 		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1298 
1299 		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1300 		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1301 		locals.keysBegin = (12-keysToCopy)*16;
1302 
1303 		Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1304 
1305 		return length % BLOCKSIZE;
1306 	}
1307 #endif
1308 
1309 	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1310 }
1311 
1312 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1313 {
1314 #if CRYPTOPP_AESNI_AVAILABLE
1315 	if (HasAESNI())
1316 		return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1317 #endif
1318 #if CRYPTOPP_ARM_AES_AVAILABLE
1319 	if (HasAES())
1320 		return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1321 #endif
1322 #if CRYPTOPP_POWER8_AES_AVAILABLE
1323 	if (HasAES())
1324 		return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1325 #endif
1326 
1327 	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1328 }
1329 #endif	// CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1330 
1331 NAMESPACE_END
1332 
1333 #endif
1334 #endif
1335