1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Baretto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6
7 /*
8 July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9 See the head notes in aes_armv4.S for copyright and license.
10 */
11
12 /*
13 September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14 */
15
16 /*
17 July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18 */
19
20 /*
21 July 2010: Added support for AES-NI instructions via compiler intrinsics.
22 */
23
24 /*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27 and Peter Schwabe in their paper "New AES software speed records". The round
28 function was also modified to include a trick similar to one in Brian Gladman's
29 x86 assembly code, doing an 8-bit register move to minimize the number of
30 register spills. Also switched to compressed tables and copying round keys to
31 the stack.
32
33 The C++ implementation uses compressed tables if
34 CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35 It is defined on x86 platforms by default but no others.
36 */
37
38 /*
39 July 2006: Defense against timing attacks was added in by Wei Dai.
40
41 The code now uses smaller tables in the first and last rounds,
42 and preloads them into L1 cache before usage (by loading at least
43 one element in each cache line).
44
45 We try to delay subsequent accesses to each table (used in the first
46 and last rounds) until all of the table has been preloaded. Hopefully
47 the compiler isn't smart enough to optimize that code away.
48
49 After preloading the table, we also try not to access any memory location
50 other than the table and the stack, in order to prevent table entries from
51 being unloaded from L1 cache, until that round is finished.
52 (Some popular CPUs have 2-way associative caches.)
53 */
54
55 // This is the original introductory comment:
56
57 /**
58 * version 3.0 (December 2000)
59 *
60 * Optimised ANSI C code for the Rijndael cipher (now AES)
61 *
62 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64 * author Paulo Barreto <paulo.barreto@terra.com.br>
65 *
66 * This code is hereby placed in the public domain.
67 *
68 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79 */
80
81 #include "pch.h"
82 #include "config.h"
83
84 #ifndef CRYPTOPP_IMPORTS
85 #ifndef CRYPTOPP_GENERATE_X64_MASM
86
87 #include "rijndael.h"
88 #include "misc.h"
89 #include "cpu.h"
90
91 // VS2017 and global optimization bug. TODO, figure out when
92 // we can re-enable full optimizations for VS2017. Also see
93 // https://github.com/weidai11/cryptopp/issues/649
94 #if (_MSC_VER >= 1910)
95 # ifndef CRYPTOPP_DEBUG
96 # pragma optimize("", off)
97 # pragma optimize("ts", on)
98 # endif
99 #endif
100
101 NAMESPACE_BEGIN(CryptoPP)
102
103 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
104 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
105 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
106 #endif
107
108 // Clang intrinsic casts
109 #define M128I_CAST(x) ((__m128i *)(void *)(x))
110 #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
111
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
// Compressed encryption table, shared with the x86/x64 assembly code.
// The two extra entries are zeroed in FillEncTable for the assembly's use.
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
# else
// Compressed encryption table (one 64-bit entry per S-box input).
static word64 Te[256];
# endif
// Compressed decryption table.
static word64 Td[256];
#else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
# if defined(CRYPTOPP_X64_MASM_AVAILABLE)
// Unused; avoids linker error on Microsoft X64 non-AESNI platforms
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
# endif
// Full-size tables: four rotated 256-entry sub-tables each.
CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
#endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS

// Set once FillEncTable/FillDecTable have populated the tables.
static volatile bool s_TeFilled = false, s_TdFilled = false;
130
131 ANONYMOUS_NAMESPACE_BEGIN
132
133 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
134
135 // Determine whether the range between begin and end overlaps
136 // with the same 4k block offsets as the Te table. Logically,
137 // the code is trying to create the condition:
138 //
// Two separate memory pages:
140 //
141 // +-----+ +-----+
142 // |XXXXX| |YYYYY|
143 // |XXXXX| |YYYYY|
144 // | | | |
145 // | | | |
146 // +-----+ +-----+
147 // Te Table Locals
148 //
149 // Have a logical cache view of (X and Y may be inverted):
150 //
151 // +-----+
152 // |XXXXX|
153 // |XXXXX|
154 // |YYYYY|
155 // |YYYYY|
156 // +-----+
157 //
AliasedWithTable(const byte * begin,const byte * end)158 static inline bool AliasedWithTable(const byte *begin, const byte *end)
159 {
160 ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
161 ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
162 if (t1 > t0)
163 return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
164 else
165 return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
166 }
167
// Scratch area used by the x86/x64 SSE2 assembly implementation.
// NOTE(review): the field layout appears to correspond to the L_* offsets
// used by the assembly code below — confirm before reordering members.
struct Locals
{
	word32 subkeys[4*12], workspace[8];
	const byte *inBlocks, *inXorBlocks, *outXorBlocks;
	byte *outBlocks;
	size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
	size_t regSpill, lengthAndCounterFlag, keysBegin;
};

// Allocation sizing for m_aliasBlock: one full page plus slack so the
// Locals structure can be placed at a chosen page offset (see
// AliasedWithTable above).
const size_t s_aliasPageSize = 4096;
const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
180
181 #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
182
183 ANONYMOUS_NAMESPACE_END
184
185 // ************************* Portable Code ************************************
186
// One quarter round: for each byte of t, XOR a table lookup into one of
// the four state words a..d. L is the lookup macro (TL_F or TL_M below)
// and T is the table (Te or Td).
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);

// Last encryption round: extract an S-box byte from the Te entry and
// store it into tempBlock (no MixColumns in the final round).
#define QUARTER_ROUND_LE(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];

#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
// Last decryption round: extract an inverse S-box byte from the Td
// entry; the byte index depends on native byte order.
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
// Last decryption round using the inverse S-box Sd directly.
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
	tempBlock[d] = Sd[t];
#endif

// Middle-round quarter rounds for encryption (Te) and decryption (Td).
#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

// TL_F is the first-round lookup and TL_M the middle-round lookup; the
// definitions differ by endianness and table layout.
#if (CRYPTOPP_LITTLE_ENDIAN)
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
// Unaligned 32-bit loads at byte offsets into the compressed table.
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
#else
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#else
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
#define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
#define TL_M TL_F
#else
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
#endif
#endif


// GF(2^8) multiplication by 2, 4 and 8, reduced modulo the AES
// polynomial 0x11b (x^8 + x^4 + x^3 + x + 1).
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

// GF(2^8) multiplication by 3, 9, 0xb, 0xd, 0xe built from the above;
// these are the MixColumns/InvMixColumns coefficients.
#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
248
// Returns the input alignment preferred by the implementation selected
// at runtime: 16 for AES-NI/Power8 vector loads, 4 for ARMv8, and 1 for
// the Cryptogams ARMv7 code, which accepts any alignment.
unsigned int Rijndael::Base::OptimalDataAlignment() const
{
#if (CRYPTOPP_AESNI_AVAILABLE)
	if (HasAESNI())
		return 16;  // load __m128i
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
	if (HasAES())
		return 4;  // load uint32x4_t
#endif
#if (CRYPTOGAMS_ARM_AES)
	// Must use 1 here for Cryptogams AES. Also see
	// https://github.com/weidai11/cryptopp/issues/683
	if (HasARMv7())
		return 1;
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
	if (HasAES())
		return 16;  // load uint32x4_p
#endif
	return BlockTransformation::OptimalDataAlignment();
}
271
// Builds the Te encryption lookup table from the forward S-box Se.
// Each entry combines SubBytes with the MixColumns coefficients
// (2,1,1,3) so a round reduces to table lookups and XORs.
void Rijndael::Base::FillEncTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Se[i];
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		// Compressed layout: both 32-bit rotations packed in one 64-bit entry.
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
#else
		// Full layout: four 256-entry sub-tables, each a byte rotation
		// of the previous one.
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
		{
			Te[i+j*256] = y;
			y = rotrConstant<8>(y);
		}
#endif
	}
#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	// Extra entries allotted for the assembly implementation.
	Te[256] = Te[257] = 0;
#endif
	s_TeFilled = true;
}
294
// Builds the Td decryption lookup table from the inverse S-box Sd.
// Each entry combines InvSubBytes with the InvMixColumns coefficients
// (0xe, 0x9, 0xd, 0xb).
void Rijndael::Base::FillDecTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Sd[i];
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
		// Compressed layout: both 32-bit rotations packed in one 64-bit entry.
		word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		Td[i] = word64(y | fb(x))<<32 | y | x;
#else
		// Full layout: four 256-entry sub-tables, each a byte rotation
		// of the previous one.
		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		for (int j=0; j<4; j++)
		{
			Td[i+j*256] = y;
			y = rotrConstant<8>(y);
		}
#endif
	}
	s_TdFilled = true;
}
314
315 #if (CRYPTOPP_AESNI_AVAILABLE)
316 extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
317 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
318
319 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
320 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
321 extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
322 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
323 #endif
324
325 #if (CRYPTOPP_ARM_AES_AVAILABLE)
326 extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
327 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
328 extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
329 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
330 #endif
331
332 #if (CRYPTOGAMS_ARM_AES)
333 extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
334 extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
335 extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
336 extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
337 #endif
338
339 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
340 extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
341 word32* rk, const byte* Se);
342
343 extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
344 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
345 extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
346 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
347 #endif
348
349 #if (CRYPTOGAMS_ARM_AES)
CRYPTOGAMS_set_encrypt_key(const byte * userKey,const int bitLen,word32 * rkey)350 int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
351 {
352 return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
353 }
CRYPTOGAMS_set_decrypt_key(const byte * userKey,const int bitLen,word32 * rkey)354 int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
355 {
356 return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
357 }
CRYPTOGAMS_encrypt(const byte * inBlock,const byte * xorBlock,byte * outBlock,const word32 * rkey)358 void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
359 {
360 cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
361 if (xorBlock)
362 xorbuf (outBlock, xorBlock, 16);
363 }
CRYPTOGAMS_decrypt(const byte * inBlock,const byte * xorBlock,byte * outBlock,const word32 * rkey)364 void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
365 {
366 cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
367 if (xorBlock)
368 xorbuf (outBlock, xorBlock, 16);
369 }
370 #endif
371
// Reports which implementation the runtime dispatch will select,
// checked in the same priority order used by the block-processing code.
std::string Rijndael::Base::AlgorithmProvider() const
{
#if (CRYPTOPP_AESNI_AVAILABLE)
	if (HasAESNI())
		return "AESNI";
#endif
#if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	if (HasSSE2())
		return "SSE2";
#endif
#if (CRYPTOPP_ARM_AES_AVAILABLE)
	if (HasAES())
		return "ARMv8";
#endif
#if (CRYPTOGAMS_ARM_AES)
	if (HasARMv7())
		return "ARMv7";
#endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
	if (HasAES())
		return "Power8";
#endif
	return "C++";
}
396
// Expands userKey into the round-key schedule (m_key) and sets m_rounds.
// Hardware/assembly key schedules are used when available; otherwise the
// portable FIPS 197 key expansion below runs, followed by the byte-order
// and inverse-cipher adjustments the table-based code expects.
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
{
	AssertValidKeyLength(keyLen);

#if (CRYPTOGAMS_ARM_AES)
	if (HasARMv7())
	{
		m_rounds = keyLen/4 + 6;
		// Sized for the largest (AES-256) schedule regardless of keyLen.
		m_key.New(4*(14+1)+4);

		if (IsForwardTransformation())
			CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
		else
			CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
		return;
	}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
	m_aliasBlock.New(s_sizeToAllocate);
	// The alias block is only used on IA-32 when unaligned data access is in effect.
	// Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
	m_aliasBlock.SetMark(0);
#endif

	m_rounds = keyLen/4 + 6;
	m_key.New(4*(m_rounds+1));
	word32 *rk = m_key;

#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
	// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
	if (HasAESNI() && HasSSE41())
	{
		// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
		// Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
		Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
		if (!IsForwardTransformation())
			Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);

		return;
	}
#endif

#if CRYPTOPP_POWER8_AES_AVAILABLE
	if (HasAES())
	{
		// We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
		// IBM's AES documentation is thin compared with Intel's AESNI docs.
		Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
		return;
	}
#endif

	// Portable key expansion (FIPS 197, Sec. 5.2). Each loop iteration
	// derives the next keyLen/4 words of the schedule.
	GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
	const word32 *rc = rcon;
	word32 temp;

	while (true)
	{
		// RotWord + SubWord + Rcon applied to the last word of the block.
		temp = rk[keyLen/4-1];
		word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
					(word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
		rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
		rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
		rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
		rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];

		if (rk + keyLen/4 + 4 == m_key.end())
			break;

		if (keyLen == 24)
		{
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		}
		else if (keyLen == 32)
		{
			// AES-256 applies an extra SubWord every eighth word.
			temp = rk[11];
			rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
		}
		rk += keyLen/4;
	}

	rk = m_key;

	if (IsForwardTransformation())
	{
		if (!s_TeFilled)
			FillEncTable();

		// The first and last round keys are stored byte-reversed for the
		// native-byte-order table lookups in ProcessAndXorBlock.
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
	}
	else
	{
		if (!s_TdFilled)
			FillDecTable();

		// Equivalent inverse cipher: apply InvMixColumns to the interior
		// round keys while reversing the order of the schedule.
#define InverseMixColumn(x) \
	TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
	TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

		unsigned int i, j;
		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
		{
			temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
			temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
			temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
			temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
		}

		// Middle block (when i == j) still needs InvMixColumns.
		rk[i+0] = InverseMixColumn(rk[i+0]);
		rk[i+1] = InverseMixColumn(rk[i+1]);
		rk[i+2] = InverseMixColumn(rk[i+2]);
		rk[i+3] = InverseMixColumn(rk[i+3]);

		// Swap (and byte-reverse) the first and last round keys.
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
	}

#if CRYPTOPP_AESNI_AVAILABLE
	// AES-NI expects the interior round keys in little-endian order.
	if (HasAESNI())
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
	if (HasAES())
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}
531
// Encrypts one 16-byte block, XORing xorBlock into the result when it is
// non-NULL. Hardware/assembly paths are used when available; otherwise
// the portable table-based code below runs, with timing-attack
// countermeasures (table preloading) described at the top of the file.
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
# if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	if (HasSSE2())
# else
	if (HasAESNI())
# endif
	{
		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

#if (CRYPTOPP_ARM_AES_AVAILABLE)
	if (HasAES())
	{
		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

#if (CRYPTOGAMS_ARM_AES)
	if (HasARMv7())
	{
		CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
		return;
	}
#endif

#if (CRYPTOPP_POWER8_AES_AVAILABLE)
	if (HasAES())
	{
		(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	// Initial AddRoundKey; t0..t3 hold the second round key.
	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details.
	// also see http://github.com/weidai11/cryptopp/issues/146
	// Touch one word per cache line of Te so the whole table is in L1
	// before any key-dependent lookup. u is always 0 (volatile read), so
	// the ORs below do not change the state words, but the compiler
	// cannot prove that and so cannot remove the preload.
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	volatile word32 _u = 0;
	word32 u = _u;
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
	u &= Te[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	// First round (uses the TL_F lookups).
	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	// Nr - 2 full rounds, two per loop iteration:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

		rk += 8;
	} while (--r);

	// Final round: S-box only (no MixColumns), written byte-by-byte.
	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	// Final AddRoundKey combined with the optional xorBlock.
	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
637
// Decrypts one 16-byte block, XORing xorBlock into the result when it is
// non-NULL. Hardware/assembly paths are used when available; otherwise
// the portable table-based code below runs, with timing-attack
// countermeasures (table preloading) described at the top of the file.
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_AESNI_AVAILABLE
	if (HasAESNI())
	{
		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

#if (CRYPTOPP_ARM_AES_AVAILABLE)
	if (HasAES())
	{
		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

#if (CRYPTOGAMS_ARM_AES)
	if (HasARMv7())
	{
		CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
		return;
	}
#endif

#if (CRYPTOPP_POWER8_AES_AVAILABLE)
	if (HasAES())
	{
		(void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	// Initial AddRoundKey; t0..t3 hold the second round key.
	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details.
	// also see http://github.com/weidai11/cryptopp/issues/146
	// Touch one word per cache line of Td so the whole table is in L1
	// before any key-dependent lookup. u is always 0 (volatile read), so
	// the ORs below do not change the state words, but the compiler
	// cannot prove that and so cannot remove the preload.
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	volatile word32 _u = 0;
	word32 u = _u;
#if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
	u &= Td[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	// First round (uses the TL_F lookups).
	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	// Nr - 2 full rounds, two per loop iteration:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

		rk += 8;
	} while (--r);

#if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
	// timing attack countermeasure. see comments at top for more details
	// If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
	// QUARTER_ROUND_LD will use Td, which is already preloaded.
	// Otherwise the last round reads Sd, so preload it the same way.
	u = _u;
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(const void *)(Sd+i);
	u &= *(const word32 *)(const void *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

	// Final round: inverse S-box only, written byte-by-byte.
	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	// Final AddRoundKey combined with the optional xorBlock.
	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}
750
751 // ************************* Assembly Code ************************************
752
753 #if CRYPTOPP_MSC_VERSION
754 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
755 #endif
756
757 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
758
759 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
760
Rijndael_Enc_AdvancedProcessBlocks_SSE2(void * locals,const word32 * k)761 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
762 {
763 CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
764
765 #if CRYPTOPP_BOOL_X86
766
767 #define L_REG esp
768 #define L_INDEX(i) (L_REG+768+i)
769 #define L_INXORBLOCKS L_INBLOCKS+4
770 #define L_OUTXORBLOCKS L_INBLOCKS+8
771 #define L_OUTBLOCKS L_INBLOCKS+12
772 #define L_INCREMENTS L_INDEX(16*15)
773 #define L_SP L_INDEX(16*16)
774 #define L_LENGTH L_INDEX(16*16+4)
775 #define L_KEYS_BEGIN L_INDEX(16*16+8)
776
777 #define MOVD movd
778 #define MM(i) mm##i
779
780 #define MXOR(a,b,c) \
781 AS2( movzx esi, b)\
782 AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
783 AS2( pxor MM(a), mm7)\
784
785 #define MMOV(a,b,c) \
786 AS2( movzx esi, b)\
787 AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
788
789 #else
790
791 #define L_REG r8
792 #define L_INDEX(i) (L_REG+i)
793 #define L_INXORBLOCKS L_INBLOCKS+8
794 #define L_OUTXORBLOCKS L_INBLOCKS+16
795 #define L_OUTBLOCKS L_INBLOCKS+24
796 #define L_INCREMENTS L_INDEX(16*16)
797 #define L_LENGTH L_INDEX(16*18+8)
798 #define L_KEYS_BEGIN L_INDEX(16*19)
799
800 #define MOVD mov
801 #define MM_0 r9d
802 #define MM_1 r12d
803 #ifdef __GNUC__
804 #define MM_2 r11d
805 #else
806 #define MM_2 r10d
807 #endif
808 #define MM(i) MM_##i
809
810 #define MXOR(a,b,c) \
811 AS2( movzx esi, b)\
812 AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
813
814 #define MMOV(a,b,c) \
815 AS2( movzx esi, b)\
816 AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
817
818 #endif
819
820 #define L_SUBKEYS L_INDEX(0)
821 #define L_SAVED_X L_SUBKEYS
822 #define L_KEY12 L_INDEX(16*12)
823 #define L_LASTROUND L_INDEX(16*13)
824 #define L_INBLOCKS L_INDEX(16*14)
825 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
826
827 #define XOR(a,b,c) \
828 AS2( movzx esi, b)\
829 AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
830
831 #define MOV(a,b,c) \
832 AS2( movzx esi, b)\
833 AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
834
835 #ifdef CRYPTOPP_GENERATE_X64_MASM
836 ALIGN 8
837 Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
838 rex_push_reg rsi
839 push_reg rdi
840 push_reg rbx
841 push_reg r12
842 .endprolog
843 mov L_REG, rcx
844 mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
845 mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
846 #elif defined(__GNUC__)
847 __asm__ __volatile__
848 (
849 INTEL_NOPREFIX
850 #if CRYPTOPP_BOOL_X64
851 AS2( mov L_REG, rcx)
852 #endif
853 AS_PUSH_IF86(bx)
854 AS_PUSH_IF86(bp)
855 AS2( mov AS_REG_7, WORD_REG(si))
856 #else
857 AS_PUSH_IF86(si)
858 AS_PUSH_IF86(di)
859 AS_PUSH_IF86(bx)
860 AS_PUSH_IF86(bp)
861 AS2( lea AS_REG_7, [Te])
862 AS2( mov edi, [g_cacheLineSize])
863 #endif
864
865 #if CRYPTOPP_BOOL_X86
866 AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
867 AS2( lea esp, [ecx-768])
868 #endif
869
870 // copy subkeys to stack
871 AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
872 AS2( mov WORD_REG(ax), 16)
873 AS2( and WORD_REG(ax), WORD_REG(si))
874 AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
875 AS2( movdqa [L_KEY12], xmm3)
876 AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
877 AS2( sub WORD_REG(ax), WORD_REG(si))
878 ASL(0)
879 AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
880 AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
881 AS2( add WORD_REG(si), 16)
882 AS2( cmp WORD_REG(si), 16*12)
883 ATT_NOPREFIX
884 ASJ( jl, 0, b)
885 INTEL_NOPREFIX
886
887 // read subkeys 0, 1 and last
888 AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
889 AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
890 AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
891 AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
892 AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
893 AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
894
895 // load table into cache
896 AS2( xor WORD_REG(ax), WORD_REG(ax))
897 ASL(9)
898 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
899 AS2( add WORD_REG(ax), WORD_REG(di))
900 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
901 AS2( add WORD_REG(ax), WORD_REG(di))
902 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
903 AS2( add WORD_REG(ax), WORD_REG(di))
904 AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
905 AS2( add WORD_REG(ax), WORD_REG(di))
906 AS2( cmp WORD_REG(ax), 2048)
907 ATT_NOPREFIX
908 ASJ( jl, 9, b)
909 INTEL_NOPREFIX
910 AS1( lfence)
911
912 AS2( test DWORD PTR [L_LENGTH], 1)
913 ATT_NOPREFIX
914 ASJ( jz, 8, f)
915 INTEL_NOPREFIX
916
917 // counter mode one-time setup
918 AS2( mov WORD_REG(si), [L_INBLOCKS])
919 AS2( movdqu xmm2, [WORD_REG(si)]) // counter
920 AS2( pxor xmm2, xmm1)
921 AS2( psrldq xmm1, 14)
922 AS2( movd eax, xmm1)
923 AS2( mov al, BYTE PTR [WORD_REG(si)+15])
924 AS2( MOVD MM(2), eax)
925 #if CRYPTOPP_BOOL_X86
926 AS2( mov eax, 1)
927 AS2( movd mm3, eax)
928 #endif
929
930 // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
931 AS2( movd eax, xmm2)
932 AS2( psrldq xmm2, 4)
933 AS2( movd edi, xmm2)
934 AS2( psrldq xmm2, 4)
935 MXOR( 1, al, 0) // 0
936 XOR( edx, ah, 1) // 1
937 AS2( shr eax, 16)
938 XOR( ecx, al, 2) // 2
939 XOR( ebx, ah, 3) // 3
940 AS2( mov eax, edi)
941 AS2( movd edi, xmm2)
942 AS2( psrldq xmm2, 4)
943 XOR( ebx, al, 0) // 4
944 MXOR( 1, ah, 1) // 5
945 AS2( shr eax, 16)
946 XOR( edx, al, 2) // 6
947 XOR( ecx, ah, 3) // 7
948 AS2( mov eax, edi)
949 AS2( movd edi, xmm2)
950 XOR( ecx, al, 0) // 8
951 XOR( ebx, ah, 1) // 9
952 AS2( shr eax, 16)
953 MXOR( 1, al, 2) // 10
954 XOR( edx, ah, 3) // 11
955 AS2( mov eax, edi)
956 XOR( edx, al, 0) // 12
957 XOR( ecx, ah, 1) // 13
958 AS2( shr eax, 16)
959 XOR( ebx, al, 2) // 14
960 AS2( psrldq xmm2, 3)
961
962 // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
963 AS2( mov eax, [L_KEY12+0*4])
964 AS2( mov edi, [L_KEY12+2*4])
965 AS2( MOVD MM(0), [L_KEY12+3*4])
966 MXOR( 0, cl, 3) /* 11 */
967 XOR( edi, bl, 3) /* 7 */
968 MXOR( 0, bh, 2) /* 6 */
969 AS2( shr ebx, 16) /* 4,5 */
970 XOR( eax, bl, 1) /* 5 */
971 MOV( ebx, bh, 0) /* 4 */
972 AS2( xor ebx, [L_KEY12+1*4])
973 XOR( eax, ch, 2) /* 10 */
974 AS2( shr ecx, 16) /* 8,9 */
975 XOR( eax, dl, 3) /* 15 */
976 XOR( ebx, dh, 2) /* 14 */
977 AS2( shr edx, 16) /* 12,13 */
978 XOR( edi, ch, 0) /* 8 */
979 XOR( ebx, cl, 1) /* 9 */
980 XOR( edi, dl, 1) /* 13 */
981 MXOR( 0, dh, 0) /* 12 */
982
983 AS2( movd ecx, xmm2)
984 AS2( MOVD edx, MM(1))
985 AS2( MOVD [L_SAVED_X+3*4], MM(0))
986 AS2( mov [L_SAVED_X+0*4], eax)
987 AS2( mov [L_SAVED_X+1*4], ebx)
988 AS2( mov [L_SAVED_X+2*4], edi)
989 ATT_NOPREFIX
990 ASJ( jmp, 5, f)
991 INTEL_NOPREFIX
992 ASL(3)
993 // non-counter mode per-block setup
994 AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
995 AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
996 AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
997 AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
998 ASL(8)
999 AS2( mov WORD_REG(ax), [L_INBLOCKS])
1000 AS2( movdqu xmm2, [WORD_REG(ax)])
1001 AS2( mov WORD_REG(si), [L_INXORBLOCKS])
1002 AS2( movdqu xmm5, [WORD_REG(si)])
1003 AS2( pxor xmm2, xmm1)
1004 AS2( pxor xmm2, xmm5)
1005
1006 // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1007 AS2( movd eax, xmm2)
1008 AS2( psrldq xmm2, 4)
1009 AS2( movd edi, xmm2)
1010 AS2( psrldq xmm2, 4)
1011 MXOR( 1, al, 0) // 0
1012 XOR( edx, ah, 1) // 1
1013 AS2( shr eax, 16)
1014 XOR( ecx, al, 2) // 2
1015 XOR( ebx, ah, 3) // 3
1016 AS2( mov eax, edi)
1017 AS2( movd edi, xmm2)
1018 AS2( psrldq xmm2, 4)
1019 XOR( ebx, al, 0) // 4
1020 MXOR( 1, ah, 1) // 5
1021 AS2( shr eax, 16)
1022 XOR( edx, al, 2) // 6
1023 XOR( ecx, ah, 3) // 7
1024 AS2( mov eax, edi)
1025 AS2( movd edi, xmm2)
1026 XOR( ecx, al, 0) // 8
1027 XOR( ebx, ah, 1) // 9
1028 AS2( shr eax, 16)
1029 MXOR( 1, al, 2) // 10
1030 XOR( edx, ah, 3) // 11
1031 AS2( mov eax, edi)
1032 XOR( edx, al, 0) // 12
1033 XOR( ecx, ah, 1) // 13
1034 AS2( shr eax, 16)
1035 XOR( ebx, al, 2) // 14
1036 MXOR( 1, ah, 3) // 15
1037 AS2( MOVD eax, MM(1))
1038
1039 AS2( add L_REG, [L_KEYS_BEGIN])
1040 AS2( add L_REG, 4*16)
1041 ATT_NOPREFIX
1042 ASJ( jmp, 2, f)
1043 INTEL_NOPREFIX
1044 ASL(1)
1045 // counter-mode per-block setup
1046 AS2( MOVD ecx, MM(2))
1047 AS2( MOVD edx, MM(1))
1048 AS2( mov eax, [L_SAVED_X+0*4])
1049 AS2( mov ebx, [L_SAVED_X+1*4])
1050 AS2( xor cl, ch)
1051 AS2( and WORD_REG(cx), 255)
1052 ASL(5)
1053 #if CRYPTOPP_BOOL_X86
1054 AS2( paddb MM(2), mm3)
1055 #else
1056 AS2( add MM(2), 1)
1057 #endif
1058 // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1059 AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1060 XOR( ebx, dl, 3)
1061 MOV( ecx, dh, 2)
1062 AS2( shr edx, 16)
1063 AS2( xor ecx, [L_SAVED_X+2*4])
1064 XOR( eax, dh, 0)
1065 MOV( edx, dl, 1)
1066 AS2( xor edx, [L_SAVED_X+3*4])
1067
1068 AS2( add L_REG, [L_KEYS_BEGIN])
1069 AS2( add L_REG, 3*16)
1070 ATT_NOPREFIX
1071 ASJ( jmp, 4, f)
1072 INTEL_NOPREFIX
1073
1074 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1075 // out: eax, ebx, edi, mm0
1076 #define ROUND() \
1077 MXOR( 0, cl, 3) /* 11 */\
1078 AS2( mov cl, al) /* 8,9,10,3 */\
1079 XOR( edi, ah, 2) /* 2 */\
1080 AS2( shr eax, 16) /* 0,1 */\
1081 XOR( edi, bl, 3) /* 7 */\
1082 MXOR( 0, bh, 2) /* 6 */\
1083 AS2( shr ebx, 16) /* 4,5 */\
1084 MXOR( 0, al, 1) /* 1 */\
1085 MOV( eax, ah, 0) /* 0 */\
1086 XOR( eax, bl, 1) /* 5 */\
1087 MOV( ebx, bh, 0) /* 4 */\
1088 XOR( eax, ch, 2) /* 10 */\
1089 XOR( ebx, cl, 3) /* 3 */\
1090 AS2( shr ecx, 16) /* 8,9 */\
1091 XOR( eax, dl, 3) /* 15 */\
1092 XOR( ebx, dh, 2) /* 14 */\
1093 AS2( shr edx, 16) /* 12,13 */\
1094 XOR( edi, ch, 0) /* 8 */\
1095 XOR( ebx, cl, 1) /* 9 */\
1096 XOR( edi, dl, 1) /* 13 */\
1097 MXOR( 0, dh, 0) /* 12 */\
1098
1099 ASL(2) // 2-round loop
1100 AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
1101 AS2( mov edi, [L_SUBKEYS-4*16+2*4])
1102 ROUND()
1103 AS2( mov ecx, edi)
1104 AS2( xor eax, [L_SUBKEYS-4*16+0*4])
1105 AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
1106 AS2( MOVD edx, MM(0))
1107
1108 ASL(4)
1109 AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
1110 AS2( mov edi, [L_SUBKEYS-4*16+6*4])
1111 ROUND()
1112 AS2( mov ecx, edi)
1113 AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1114 AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1115 AS2( MOVD edx, MM(0))
1116
1117 AS2( add L_REG, 32)
1118 AS2( test L_REG, 255)
1119 ATT_NOPREFIX
1120 ASJ( jnz, 2, b)
1121 INTEL_NOPREFIX
1122 AS2( sub L_REG, 16*16)
1123
1124 #define LAST(a, b, c) \
1125 AS2( movzx esi, a )\
1126 AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1127 AS2( movzx esi, b )\
1128 AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1129 AS2( mov WORD PTR [L_LASTROUND+c], di )\
1130
1131 // last round
1132 LAST(ch, dl, 2)
1133 LAST(dh, al, 6)
1134 AS2( shr edx, 16)
1135 LAST(ah, bl, 10)
1136 AS2( shr eax, 16)
1137 LAST(bh, cl, 14)
1138 AS2( shr ebx, 16)
1139 LAST(dh, al, 12)
1140 AS2( shr ecx, 16)
1141 LAST(ah, bl, 0)
1142 LAST(bh, cl, 4)
1143 LAST(ch, dl, 8)
1144
1145 AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1146 AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1147
1148 AS2( mov WORD_REG(cx), [L_LENGTH])
1149 AS2( sub WORD_REG(cx), 16)
1150
1151 AS2( movdqu xmm2, [WORD_REG(ax)])
1152 AS2( pxor xmm2, xmm4)
1153
1154 #if CRYPTOPP_BOOL_X86
1155 AS2( movdqa xmm0, [L_INCREMENTS])
1156 AS2( paddd xmm0, [L_INBLOCKS])
1157 AS2( movdqa [L_INBLOCKS], xmm0)
1158 #else
1159 AS2( movdqa xmm0, [L_INCREMENTS+16])
1160 AS2( paddq xmm0, [L_INBLOCKS+16])
1161 AS2( movdqa [L_INBLOCKS+16], xmm0)
1162 #endif
1163
1164 AS2( pxor xmm2, [L_LASTROUND])
1165 AS2( movdqu [WORD_REG(bx)], xmm2)
1166
1167 ATT_NOPREFIX
1168 ASJ( jle, 7, f)
1169 INTEL_NOPREFIX
1170 AS2( mov [L_LENGTH], WORD_REG(cx))
1171 AS2( test WORD_REG(cx), 1)
1172 ATT_NOPREFIX
1173 ASJ( jnz, 1, b)
1174 INTEL_NOPREFIX
1175 #if CRYPTOPP_BOOL_X64
1176 AS2( movdqa xmm0, [L_INCREMENTS])
1177 AS2( paddq xmm0, [L_INBLOCKS])
1178 AS2( movdqa [L_INBLOCKS], xmm0)
1179 #endif
1180 ATT_NOPREFIX
1181 ASJ( jmp, 3, b)
1182 INTEL_NOPREFIX
1183
1184 ASL(7)
1185 // erase keys on stack
1186 AS2( xorps xmm0, xmm0)
1187 AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1188 AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1189 AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1190 AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1191 AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1192 AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1193 AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1194 AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1195 AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1196 AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1197 AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1198 AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1199 AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1200 AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1201 AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1202 #if CRYPTOPP_BOOL_X86
1203 AS2( mov esp, [L_SP])
1204 AS1( emms)
1205 #endif
1206 AS_POP_IF86(bp)
1207 AS_POP_IF86(bx)
1208 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1209 AS_POP_IF86(di)
1210 AS_POP_IF86(si)
1211 AS1(ret)
1212 #endif
1213 #ifdef CRYPTOPP_GENERATE_X64_MASM
1214 pop r12
1215 pop rbx
1216 pop rdi
1217 pop rsi
1218 ret
1219 Rijndael_Enc_AdvancedProcessBlocks ENDP
1220 #endif
1221 #ifdef __GNUC__
1222 ATT_PREFIX
1223 :
1224 : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1225 : "memory", "cc", "%eax"
1226 #if CRYPTOPP_BOOL_X64
1227 , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1228 #endif
1229 );
1230 #endif
1231 }
1232
1233 #endif
1234
1235 #ifndef CRYPTOPP_GENERATE_X64_MASM
1236
1237 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1238 extern "C" {
1239 void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1240 }
1241 #endif
1242
1243 #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
// Bulk AES encryption entry point. Dispatches, in priority order, to the
// fastest implementation compiled in and available at runtime: AES-NI,
// ARMv8 AES, or Power8 AES intrinsics; then the hand-written SSE2 assembly
// routine; finally the generic implementation in the base class.
// Returns the number of trailing bytes left unprocessed (the caller handles
// any partial block itself).
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
	if (HasAESNI())
		return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
	if (HasAES())
		return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
	if (HasAES())
		return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	if (HasSSE2())
	{
		// Less than one full block: nothing the asm routine can do; report
		// the whole length as unprocessed.
		if (length < BLOCKSIZE)
			return length;

		// A run of zero bytes located right after the 256-entry Te table,
		// used as the xor source when no xorBlocks buffer applies.
		// NOTE(review): assumes the table layout guarantees >= 16 zero bytes
		// at Te+256 — confirm against the table definition earlier in the file.
		static const byte *zeros = (const byte*)(Te+256);
		m_aliasBlock.SetMark(m_aliasBlock.size());
		byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());

		// Round the scratch pointer up to the nearest s_aliasBlockSize
		// (256 byte) boundary, then keep stepping by 256 until the Locals
		// struct no longer aliases the lookup table. Presumably this keeps
		// the stack locals from evicting preloaded table cache lines (part
		// of the timing-attack defense described in the file head) —
		// confirm against AliasedWithTable's definition.
		space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
		while (AliasedWithTable(space, space + sizeof(Locals)))
		{
			space += 256;
			CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
		}

		size_t increment = BLOCKSIZE;
		if (flags & BT_ReverseDirection)
		{
			// Process blocks last-to-first: start all three cursors at the
			// final block and make the per-block increment negative
			// (0-increment avoids a unary-minus-on-unsigned warning).
			CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;
		}

		// Parameter block handed to the assembly routine, placed at the
		// alias-checked address computed above.
		Locals &locals = *(Locals *)(void *)space;

		locals.inBlocks = inBlocks;
		// With BT_XorInput the xor operand is applied on input, otherwise on
		// output; the unused side points at the zero block.
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		// Pack the whole-block byte count and the counter-mode flag into one
		// word: length is already a multiple of 16 after the rounding, so
		// bit 0 is free to carry BT_InBlockIsCounter (the asm does
		// "test [L_LENGTH], 1" to detect counter mode).
		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		// Counter mode pre-computes one extra round outside the loop, so one
		// fewer subkey needs copying. keysBegin is the byte offset of the
		// first subkey to copy, measured back from a 12-subkey window —
		// NOTE(review): confirm against the subkey-copy loop in the asm.
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);

		// All whole blocks were consumed; only a sub-block tail can remain.
		return length % BLOCKSIZE;
	}
#endif

	// No accelerated path available: fall back to the generic block-by-block
	// implementation.
	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
1311
// Bulk AES decryption entry point. Same dispatch scheme as the encryption
// side (AES-NI, then ARMv8, then Power8 intrinsics), except there is no
// SSE2 assembly path for decryption — anything not covered by hardware
// intrinsics falls through to the generic base-class implementation.
// Returns the number of trailing bytes left unprocessed.
size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_AESNI_AVAILABLE
	if (HasAESNI())
		return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_ARM_AES_AVAILABLE
	if (HasAES())
		return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_POWER8_AES_AVAILABLE
	if (HasAES())
		return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
1329 #endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1330
1331 NAMESPACE_END
1332
1333 #endif
1334 #endif
1335