/*
* AES using SSSE3
* (C) 2010 Jack Lloyd
*
* This is more or less a direct translation of public domain x86-64
* assembly written by Mike Hamburg, described in "Accelerating AES
* with Vector Permute Instructions" (CHES 2009). His original code is
* available at http://crypto.stanford.edu/vpaes/
*
* Distributed under the terms of the Botan license
*/

#include <botan/aes_ssse3.h>
#include <tmmintrin.h>

namespace Botan {

namespace {

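// Mask selecting the low nibble of each byte. Combined with
// _mm_andnot_si128 and a 4-bit shift, it splits each byte into two
// 4-bit indices, so pshufb can do 16-entry table lookups per byte.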
const __m128i low_nibs = _mm_set1_epi8(0x0F);

const __m128i k_ipt1 = _mm_set_epi32(
   0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
const __m128i k_ipt2 = _mm_set_epi32(
   0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);

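// Inversion tables for the S-box core; in Hamburg's construction the
// GF(2^8) inverse is computed via GF(2^4) arithmetic using 16-entry
// lookups instead of a single 256-entry table.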
const __m128i k_inv1 = _mm_set_epi32(
   0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
const __m128i k_inv2 = _mm_set_epi32(
   0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);

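// S-box output tables, split by nibble. In Hamburg's original vpaes
// code sb1 is the S-box output itself, and sb2 (in the encrypt
// routine below) is its GF(2^8) double, used for MixColumns.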
const __m128i sb1u = _mm_set_epi32(
   0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
const __m128i sb1t = _mm_set_epi32(
   0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);

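// Byte rotations used to realize MixColumns as shuffles; the four
// variants differ by a word rotation and are indexed by round number.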
const __m128i mc_forward[4] = {
   _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
   _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
   _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
   _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
};

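// ShiftRows permutations for the four possible round alignments;
// sr[0] is the identity permutation.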
const __m128i sr[4] = {
   _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
   _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
   _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
   _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
};

#define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))

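// Look up each byte in two 16-entry tables, one indexed by its low
// nibble and one by its high nibble, and XOR the results. This is
// the basic vector permute building block.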
__m128i aes_schedule_transform(__m128i input,
                               __m128i table_1,
                               __m128i table_2)
   {
   __m128i i_1 = _mm_and_si128(low_nibs, input);
   __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);

   return _mm_xor_si128(
      _mm_shuffle_epi8(table_1, i_1),
      _mm_shuffle_epi8(table_2, i_2));
   }

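// Mangle a round key for storage: XOR with 0x5B, apply three
// MixColumns-style shuffles, then the ShiftRows alignment for this
// round position.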
__m128i aes_schedule_mangle(__m128i k, byte round_no)
   {
   __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
                                mc_forward[0]);

   __m128i t2 = t;

   t = _mm_shuffle_epi8(t, mc_forward[0]);

   t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));

   return _mm_shuffle_epi8(t2, sr[round_no % 4]);
   }

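// Combine words from two partial round keys; the 192-bit schedule
// produces round keys at a rate of 1.5 per expansion step and uses
// this to stitch the halves together.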
__m128i aes_schedule_192_smear(__m128i x, __m128i y)
   {
   return mm_xor3(y,
                  _mm_shuffle_epi32(x, 0xFE),
                  _mm_shuffle_epi32(y, 0x80));
   }

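// Decryption-key counterpart of aes_schedule_mangle: push the key
// through the dsk table pipeline (related to inverse MixColumns)
// before the ShiftRows alignment.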
__m128i aes_schedule_mangle_dec(__m128i k, byte round_no)
   {
   const __m128i dsk[8] = {
      _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
      _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
      _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
      _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
      _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
      _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
      _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
      _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
   };

   __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
   __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);

   t = aes_schedule_transform(t, dsk[2], dsk[3]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[4], dsk[5]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[6], dsk[7]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   return _mm_shuffle_epi8(output, sr[round_no % 4]);
   }

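// Final encryption round key: ShiftRows alignment plus the output
// transform only, since the last AES round has no MixColumns.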
__m128i aes_schedule_mangle_last(__m128i k, byte round_no)
   {
   const __m128i out_tr1 = _mm_set_epi32(
      0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
   const __m128i out_tr2 = _mm_set_epi32(
      0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);

   k = _mm_shuffle_epi8(k, sr[round_no % 4]);
   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, out_tr1, out_tr2);
   }

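// Final decryption round key: undo the output transform via the
// deskew tables; no ShiftRows alignment is needed here.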
__m128i aes_schedule_mangle_last_dec(__m128i k)
   {
   const __m128i deskew1 = _mm_set_epi32(
      0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
   const __m128i deskew2 = _mm_set_epi32(
      0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, deskew1, deskew2);
   }

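// One expansion round: when rcon is non-null, inject a round
// constant byte and rotate the key word, then apply the nibble-based
// S-box to input1 and XOR-smear the result over input2.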
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }

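// Encrypt one block. Each round evaluates the S-box through the
// GF(2^4) inversion tables and realizes ShiftRows/MixColumns as byte
// shuffles; the final round skips MixColumns and applies sr directly.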
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i sb2u = _mm_set_epi32(
      0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
   const __m128i sb2t = _mm_set_epi32(
      0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);

   const __m128i sbou = _mm_set_epi32(
      0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
   const __m128i sbot = _mm_set_epi32(
      0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);

   const __m128i mc_backward[4] = {
      _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
      _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
      _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
      _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
   };

   B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
               _mm_shuffle_epi8(k_ipt2,
                                _mm_srli_epi32(
                                   _mm_andnot_si128(low_nibs, B),
                                   4)),
               _mm_loadu_si128(keys));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));

      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         B = _mm_shuffle_epi8(
            mm_xor3(_mm_shuffle_epi8(sbou, t5),
                    _mm_shuffle_epi8(sbot, t6),
                    K),
            sr[r % 4]);

         return B;
         }

      __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
                           _mm_shuffle_epi8(sb1u, t5),
                           K);

      __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
                           _mm_shuffle_epi8(sb2u, t5),
                           _mm_shuffle_epi8(t7, mc_forward[r % 4]));

      B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
                  t8);
      }
   }

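// Decrypt one block. Same nibble-split structure as encryption, with
// the inverse S-box spread across the sb9/sbd/sbb/sbe tables and the
// MixColumns shuffle (mc) rotated after each round.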
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i k_dipt1 = _mm_set_epi32(
      0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
   const __m128i k_dipt2 = _mm_set_epi32(
      0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);

   const __m128i sb9u = _mm_set_epi32(
      0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
   const __m128i sb9t = _mm_set_epi32(
      0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);

   const __m128i sbeu = _mm_set_epi32(
      0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
   const __m128i sbet = _mm_set_epi32(
      0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);

   const __m128i sbdu = _mm_set_epi32(
      0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
   const __m128i sbdt = _mm_set_epi32(
      0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);

   const __m128i sbbu = _mm_set_epi32(
      0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
   const __m128i sbbt = _mm_set_epi32(
      0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);

   __m128i mc = mc_forward[3];

   __m128i t =
      _mm_shuffle_epi8(k_dipt2,
                       _mm_srli_epi32(
                          _mm_andnot_si128(low_nibs, B),
                          4));

   B = mm_xor3(t, _mm_loadu_si128(keys),
               _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         const __m128i sbou = _mm_set_epi32(
            0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
         const __m128i sbot = _mm_set_epi32(
            0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);

         __m128i x = _mm_shuffle_epi8(sbou, t5);
         __m128i y = _mm_shuffle_epi8(sbot, t6);
         x = _mm_xor_si128(x, K);
         x = _mm_xor_si128(x, y);

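         // Select the final ShiftRows variant from the round count:
         // sr[2] for 10 or 14 rounds, sr[0] for 12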
         const u32bit which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
         return _mm_shuffle_epi8(x, sr[which_sr]);
         }

      __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
                                 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));

      __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
                           _mm_shuffle_epi8(sbdu, t5),
                           _mm_shuffle_epi8(sbdt, t6));

      __m128i t12 = _mm_xor_si128(
         _mm_xor_si128(
            _mm_shuffle_epi8(t9, mc),
            _mm_shuffle_epi8(sbbu, t5)),
         _mm_shuffle_epi8(sbbt, t6));

      B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
                                      _mm_shuffle_epi8(sbeu, t5)),
                        _mm_shuffle_epi8(sbet, t6));

      mc = _mm_alignr_epi8(mc, mc, 12);
      }
   }

}

/*
* AES-128 Encryption
*/
void AES_128_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
      }
   }

/*
* AES-128 Decryption
*/
void AES_128_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
      }
   }

/*
* AES-128 Key Schedule
*/
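// The rcon vector holds round-constant material; aes_schedule_round
// shifts one byte out per invocation. Decryption round keys are
// stored in reverse order, pre-mangled for the decryption data path.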
void AES_128_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

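   // The last-used decryption round key is the raw input key (stored
   // before the k_ipt input transform), with only a ShiftRows alignment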
   _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));

   key = aes_schedule_transform(key, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm, key);

   for(size_t i = 1; i != 10; ++i)
      {
      key = aes_schedule_round(&rcon, key, key);

      _mm_storeu_si128(EK_mm + i,
                       aes_schedule_mangle(key, (12-i) % 4));

      _mm_storeu_si128(DK_mm + (10-i),
                       aes_schedule_mangle_dec(key, (10-i) % 4));
      }

   key = aes_schedule_round(&rcon, key, key);
   _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
   _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
   }

/*
* AES-192 Encryption
*/
void AES_192_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
      }
   }

/*
* AES-192 Decryption
*/
void AES_192_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
      }
   }

/*
* AES-192 Key Schedule
*/
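// The 192-bit schedule is the most involved: each loop iteration
// consumes 1.5 blocks of key material and emits three round keys,
// using aes_schedule_192_smear to combine the halves.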
void AES_192_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8)));

   _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);

   // key2 with 8 high bytes masked off
   __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);

   for(size_t i = 0; i != 4; ++i)
      {
      key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + 3*i+1,
                       aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
      _mm_storeu_si128(DK_mm + 11-3*i,
                       aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));

      t = aes_schedule_192_smear(key2, t);

      _mm_storeu_si128(EK_mm + 3*i+2,
                       aes_schedule_mangle(t, (i+2)%4));
      _mm_storeu_si128(DK_mm + 10-3*i,
                       aes_schedule_mangle_dec(t, (i+2)%4));

      key2 = aes_schedule_round(&rcon, t, key2);

      if(i == 3)
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle_last(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_last_dec(key2));
         }
      else
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_dec(key2, (i+1)%4));
         }

      key1 = key2;
      key2 = aes_schedule_192_smear(key2,
                                    _mm_slli_si128(_mm_srli_si128(t, 8), 8));
      t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
      }
   }

/*
* AES-256 Encryption
*/
void AES_256_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
      }
   }

/*
* AES-256 Decryption
*/
void AES_256_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
      }
   }

/*
* AES-256 Key Schedule
*/
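// The 256-bit schedule alternates two round types: even-numbered
// keys use the round constant, odd-numbered keys come from a plain
// word shuffle (passing NULL for rcon skips the constant and rotation).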
void AES_256_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16)));

   _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);
   _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));

   _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));

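   // Each iteration emits two round keys: one via the rcon path, one
   // via the shuffle-only path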
   for(size_t i = 2; i != 14; i += 2)
      {
      __m128i k_t = key2;
      key1 = key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
      _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));

      key2 = aes_schedule_round(NULL, _mm_shuffle_epi32(key2, 0xFF), k_t);
      _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
      _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
      }

   key2 = aes_schedule_round(&rcon, key2, key1);

   _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
   _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
   }

}