/*
Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.

The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:

  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;

  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.

This software is provided 'as is' with no explicit or implied warranties
in respect of its operation, including, but not limited to, correctness
and fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 13/11/2013
*/

#include <intrin.h>
#include "aes_ni.h"

#if defined( USE_INTEL_AES_IF_PRESENT )

#pragma intrinsic(__cpuid)

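/*  Report whether the CPU supports the AES-NI instructions.  The result of
    CPUID leaf 1 is cached in a static; bit 25 of ECX (0x02000000) is the
    AES-NI feature flag.  A zero return makes every entry point below fall
    back to the table-driven aes_xi() implementations.
*/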
__inline int has_aes_ni()
{
    static int test = -1;
    int cpu_info[4];
    if(test < 0)
    {
        __cpuid(cpu_info, 1);
        test = cpu_info[2] & 0x02000000;
    }
    return test;
}

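/*  One step of the AES-128 key schedule: t2 is the AESKEYGENASSIST output
    for the previous round key, carrying SubWord(RotWord(w3)) XOR rcon in
    its top 32-bit word.  That word is broadcast with the 0xff shuffle and
    XORed into the running prefix-XOR of the previous round key (built with
    the three 4-byte shifts), giving the next round key.
*/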
__inline __m128i aes_128_assist(__m128i t1, __m128i t2)
{
    __m128i t3;
    t2 = _mm_shuffle_epi32(t2, 0xff);
    t3 = _mm_slli_si128(t1, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t1 = _mm_xor_si128(t1, t2);
    return t1;
}

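/*  AES-128 key expansion: the 16 byte user key becomes the 11 round keys
    ks[0]..ks[10], generated with AESKEYGENASSIST and the round constants
    0x01..0x36.  The schedule length (10 * 16 bytes) is recorded in
    cx->inf.b[0] for use by the encrypt/decrypt routines below.
*/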
AES_RETURN aes_ni(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        aes_xi(encrypt_key128)(key, cx);
        return EXIT_SUCCESS;
    }

    t1 = _mm_loadu_si128((__m128i*)key);

    ks[0] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x1);
    t1 = aes_128_assist(t1, t2);
    ks[1] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x2);
    t1 = aes_128_assist(t1, t2);
    ks[2] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x4);
    t1 = aes_128_assist(t1, t2);
    ks[3] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x8);
    t1 = aes_128_assist(t1, t2);
    ks[4] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x10);
    t1 = aes_128_assist(t1, t2);
    ks[5] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x20);
    t1 = aes_128_assist(t1, t2);
    ks[6] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x40);
    t1 = aes_128_assist(t1, t2);
    ks[7] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x80);
    t1 = aes_128_assist(t1, t2);
    ks[8] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x1b);
    t1 = aes_128_assist(t1, t2);
    ks[9] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x36);
    t1 = aes_128_assist(t1, t2);
    ks[10] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 10 * 16;
    return EXIT_SUCCESS;
}

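/*  One step of the AES-192 key schedule, generating six 32-bit words per
    call: on return *t1 holds the next four words and the low half of *t3
    holds the following two.  The 0x55 shuffle broadcasts the
    RotWord(SubWord(w)) XOR rcon value that AESKEYGENASSIST computed from
    the last schedule word held in *t3.
*/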
__inline void aes_192_assist(__m128i* t1, __m128i * t2, __m128i * t3)
{
    __m128i t4;
    *t2 = _mm_shuffle_epi32(*t2, 0x55);
    t4 = _mm_slli_si128(*t1, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    *t1 = _mm_xor_si128(*t1, *t2);
    *t2 = _mm_shuffle_epi32(*t1, 0xff);
    t4 = _mm_slli_si128(*t3, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    *t3 = _mm_xor_si128(*t3, *t2);
}

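/*  AES-192 key expansion: the 24 byte user key becomes the 13 round keys
    ks[0]..ks[12] (52 words), using round constants 0x01..0x80.  Because each
    aes_192_assist() call yields six words, the _mm_shuffle_pd() casts repack
    its output into contiguous 128-bit round keys.  The schedule length
    (12 * 16 bytes) is recorded in cx->inf.b[0].
*/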
AES_RETURN aes_ni(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2, t3;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        aes_xi(encrypt_key192)(key, cx);
        return EXIT_SUCCESS;
    }

    t1 = _mm_loadu_si128((__m128i*)key);
    t3 = _mm_loadu_si128((__m128i*)(key + 16));

    ks[0] = t1;
    ks[1] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x1);
    aes_192_assist(&t1, &t2, &t3);

    ks[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[1]), _mm_castsi128_pd(t1), 0));
    ks[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x2);
    aes_192_assist(&t1, &t2, &t3);
    ks[3] = t1;
    ks[4] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x4);
    aes_192_assist(&t1, &t2, &t3);
    ks[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[4]), _mm_castsi128_pd(t1), 0));
    ks[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x8);
    aes_192_assist(&t1, &t2, &t3);
    ks[6] = t1;
    ks[7] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x10);
    aes_192_assist(&t1, &t2, &t3);
    ks[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[7]), _mm_castsi128_pd(t1), 0));
    ks[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x20);
    aes_192_assist(&t1, &t2, &t3);
    ks[9] = t1;
    ks[10] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x40);
    aes_192_assist(&t1, &t2, &t3);
    ks[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[10]), _mm_castsi128_pd(t1), 0));
    ks[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x80);
    aes_192_assist(&t1, &t2, &t3);
    ks[12] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 12 * 16;
    return EXIT_SUCCESS;
}

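/*  The two AES-256 key schedule steps.  aes_256_assist1() derives the next
    even-numbered round key: the 0xff shuffle broadcasts the
    RotWord(SubWord(w)) XOR rcon word from the AESKEYGENASSIST result passed
    in *t2.  aes_256_assist2() derives the following odd-numbered round key:
    AESKEYGENASSIST is called with rcon 0 and the 0xaa shuffle picks the
    plain SubWord(w) value, as the AES-256 schedule requires for words at
    positions 4 mod 8.
*/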
__inline void aes_256_assist1(__m128i* t1, __m128i * t2)
{
    __m128i t4;
    *t2 = _mm_shuffle_epi32(*t2, 0xff);
    t4 = _mm_slli_si128(*t1, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    *t1 = _mm_xor_si128(*t1, *t2);
}

__inline void aes_256_assist2(__m128i* t1, __m128i * t3)
{
    __m128i t2, t4;
    t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
    t2 = _mm_shuffle_epi32(t4, 0xaa);
    t4 = _mm_slli_si128(*t3, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    *t3 = _mm_xor_si128(*t3, t2);
}

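/*  AES-256 key expansion: the 32 byte user key becomes the 15 round keys
    ks[0]..ks[14] (60 words), alternating aes_256_assist1() (round constants
    0x01..0x40) and aes_256_assist2().  The schedule length (14 * 16 bytes)
    is recorded in cx->inf.b[0].
*/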
AES_RETURN aes_ni(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2, t3;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        aes_xi(encrypt_key256)(key, cx);
        return EXIT_SUCCESS;
    }

    t1 = _mm_loadu_si128((__m128i*)key);
    t3 = _mm_loadu_si128((__m128i*)(key + 16));

    ks[0] = t1;
    ks[1] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x01);
    aes_256_assist1(&t1, &t2);
    ks[2] = t1;
    aes_256_assist2(&t1, &t3);
    ks[3] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x02);
    aes_256_assist1(&t1, &t2);
    ks[4] = t1;
    aes_256_assist2(&t1, &t3);
    ks[5] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x04);
    aes_256_assist1(&t1, &t2);
    ks[6] = t1;
    aes_256_assist2(&t1, &t3);
    ks[7] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x08);
    aes_256_assist1(&t1, &t2);
    ks[8] = t1;
    aes_256_assist2(&t1, &t3);
    ks[9] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x10);
    aes_256_assist1(&t1, &t2);
    ks[10] = t1;
    aes_256_assist2(&t1, &t3);
    ks[11] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x20);
    aes_256_assist1(&t1, &t2);
    ks[12] = t1;
    aes_256_assist2(&t1, &t3);
    ks[13] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x40);
    aes_256_assist1(&t1, &t2);
    ks[14] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 14 * 16;
    return EXIT_SUCCESS;
}

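/*  Convert an encryption key schedule in place into the form needed by
    AESDEC (the 'equivalent inverse cipher'): AESIMC applies InvMixColumns to
    every round key except the first and the last.  The decrypt routine below
    then walks the schedule backwards from ks[rounds] down to ks[0].
*/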
__inline void enc_to_dec(aes_decrypt_ctx cx[1])
{
    __m128i *ks = (__m128i*)cx->ks;
    int j;

    for( j = 1 ; j < (cx->inf.b[0] >> 4) ; ++j )
        ks[j] = _mm_aesimc_si128(ks[j]);
}

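/*  The decryption key schedules are built by running the corresponding
    encryption key expansion and then transforming the result with
    enc_to_dec().  The cast from aes_decrypt_ctx to aes_encrypt_ctx relies on
    the two context structures sharing the same layout, as they do in this
    library.
*/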
AES_RETURN aes_ni(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        aes_xi(decrypt_key128)(key, cx);
        return EXIT_SUCCESS;
    }

    if(aes_ni(encrypt_key128)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}

AES_RETURN aes_ni(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        aes_xi(decrypt_key192)(key, cx);
        return EXIT_SUCCESS;
    }

    if(aes_ni(encrypt_key192)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}

AES_RETURN aes_ni(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        aes_xi(decrypt_key256)(key, cx);
        return EXIT_SUCCESS;
    }

    if(aes_ni(encrypt_key256)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}

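/*  Single block encryption.  The state is XORed with ks[0] and then passed
    through 9, 11 or 13 AESENC rounds plus a final AESENCLAST; the switch on
    the stored schedule length deliberately falls through so that longer keys
    simply add the extra initial rounds.

    A minimal usage sketch (hypothetical buffer names, assuming the aes_ni()
    name mapping from aes_ni.h):

        aes_encrypt_ctx ctx[1];
        unsigned char pt[16], ct[16];
        aes_ni(encrypt_key128)(key, ctx);
        aes_ni(encrypt)(pt, ct, ctx);
*/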
AES_RETURN aes_ni(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
{
    __m128i *key = (__m128i*)cx->ks, t;

    if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        aes_xi(encrypt)(in, out, cx);
        return EXIT_SUCCESS;
    }

    t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);

    switch(cx->inf.b[0])
    {
    case 14 * 16:
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
    case 12 * 16:
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
    case 10 * 16:
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenc_si128(t, *(__m128i*)++key);
        t = _mm_aesenclast_si128(t, *(__m128i*)++key);
    }

    _mm_storeu_si128(&((__m128i*)out)[0], t);
    return EXIT_SUCCESS;
}

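/*  Single block decryption using the equivalent inverse cipher: the state is
    XORed with the last round key and then walked backwards through the
    AESIMC-transformed schedule with AESDEC, finishing with AESDECLAST on
    ks[0].  The switch falls through exactly as in the encrypt routine.
*/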
AES_RETURN aes_ni(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
{
    __m128i *key = (__m128i*)cx->ks + (cx->inf.b[0] >> 4), t;

    if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        aes_xi(decrypt)(in, out, cx);
        return EXIT_SUCCESS;
    }

    t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);

    switch(cx->inf.b[0])
    {
    case 14 * 16:
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
    case 12 * 16:
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
    case 10 * 16:
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdec_si128(t, *(__m128i*)--key);
        t = _mm_aesdeclast_si128(t, *(__m128i*)--key);
    }

    _mm_storeu_si128((__m128i*)out, t);
    return EXIT_SUCCESS;
}

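/*  CBC encryption helper in the style of Intel's AES-NI sample code.  The
    key argument must point to an expanded encryption key schedule holding
    number_of_rounds + 1 round keys in the order they are applied; length is
    in bytes and is rounded up to whole 16 byte blocks, so a trailing partial
    block is read and written as a full block.
*/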
void aes_CBC_encrypt(const unsigned char *in,
                     unsigned char *out,
                     unsigned char ivec[16],
                     unsigned long length,
                     unsigned char *key,
                     int number_of_rounds)
{
    __m128i feedback, data;
    int i, j;
    if(length % 16)
        length = length / 16 + 1;
    else length /= 16;
    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        data = _mm_loadu_si128(&((__m128i*)in)[i]);
        feedback = _mm_xor_si128(data, feedback);
        feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
        feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
}

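/*  CBC decryption helper.  The key argument must point to a decryption key
    schedule stored in the order the keys are applied (the last encryption
    round key first, with InvMixColumns applied to the inner keys).  The raw
    ciphertext block is kept as the next chaining value.
*/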
void aes_CBC_decrypt(const unsigned char *in,
                     unsigned char *out,
                     unsigned char ivec[16],
                     unsigned long length,
                     unsigned char *key,
                     int number_of_rounds)
{
    __m128i data, feedback, last_in;
    int i, j;
    if(length % 16)
        length = length / 16 + 1;
    else length /= 16;
    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
        data = _mm_xor_si128(last_in, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
        }
        data = _mm_aesdeclast_si128(data, ((__m128i*)key)[j]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i*)out)[i], data);
        feedback = last_in;
    }
}

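/*  CTR mode encryption helper.  The counter block is assembled as
    nonce || IV || 32-bit big-endian block counter, with the counter starting
    at 1; BSWAP_EPI64 converts between this wire layout and the layout in
    which the counter is incremented with _mm_add_epi64.  Each block of
    keystream is the encryption of the counter, XORed with the input.
*/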
void AES_CTR_encrypt(const unsigned char *in,
                     unsigned char *out,
                     const unsigned char ivec[8],
                     const unsigned char nonce[4],
                     unsigned long length,
                     const unsigned char *key,
                     int number_of_rounds)
{
    __m128i ctr_block = { 0 }, tmp, ONE, BSWAP_EPI64;
    int i, j;
    if(length % 16)
        length = length / 16 + 1;
    else length /= 16;
    ONE = _mm_set_epi32(0, 1, 0, 0);
    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    ctr_block = _mm_insert_epi64(ctr_block, *(long long*)ivec, 1);
    ctr_block = _mm_insert_epi32(ctr_block, *(long*)nonce, 1);
    ctr_block = _mm_srli_si128(ctr_block, 4);
    ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
    ctr_block = _mm_add_epi64(ctr_block, ONE);
    for(i = 0; i < length; i++)
    {
        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
        ctr_block = _mm_add_epi64(ctr_block, ONE);
        tmp = _mm_xor_si128(tmp, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            tmp = _mm_aesenc_si128(tmp, ((__m128i*)key)[j]);
        }
        tmp = _mm_aesenclast_si128(tmp, ((__m128i*)key)[j]);
        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
    }
}

#endif