1 /*
2 Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
3 
4 The redistribution and use of this software (with or without changes)
5 is allowed without the payment of fees or royalties provided that:
6 
7   source code distributions include the above copyright notice, this
8   list of conditions and the following disclaimer;
9 
10   binary distributions include the above copyright notice, this list
11   of conditions and the following disclaimer in their documentation.
12 
13 This software is provided 'as is' with no explicit or implied warranties
14 in respect of its operation, including, but not limited to, correctness
15 and fitness for purpose.
16 ---------------------------------------------------------------------------
17 Issue Date: 13/11/2013
18 */
19 
#include <intrin.h>
#include <string.h>

#include "aes_ni.h"
22 
23 #if defined( USE_INTEL_AES_IF_PRESENT )
24 
25 #pragma intrinsic(__cpuid)
26 
/* Return non-zero when the CPU advertises AES-NI support.
   CPUID leaf 1, ECX bit 25 is the AES flag; the result is cached in a
   function-local static so CPUID runs at most once. */
__inline int has_aes_ni()
{
	static int aes_ni_flag = -1;
	if(aes_ni_flag < 0)
	{
		int regs[4];
		__cpuid(regs, 1);
		aes_ni_flag = regs[2] & 0x02000000;
	}
	return aes_ni_flag;
}
38 
aes_128_assist(__m128i t1,__m128i t2)39 __inline __m128i aes_128_assist(__m128i t1, __m128i t2)
40 {
41 	__m128i t3;
42 	t2 = _mm_shuffle_epi32(t2, 0xff);
43 	t3 = _mm_slli_si128(t1, 0x4);
44 	t1 = _mm_xor_si128(t1, t3);
45 	t3 = _mm_slli_si128(t3, 0x4);
46 	t1 = _mm_xor_si128(t1, t3);
47 	t3 = _mm_slli_si128(t3, 0x4);
48 	t1 = _mm_xor_si128(t1, t3);
49 	t1 = _mm_xor_si128(t1, t2);
50 	return t1;
51 }
52 
/* AES-128 key schedule using AES-NI.  Expands the 16 byte key into the
   11 round keys in cx->ks, falling back to the table driven aes_xi
   implementation when AES-NI is absent.  The expansion is fully
   unrolled because the rcon argument of _mm_aeskeygenassist_si128 must
   be a literal immediate. */
AES_RETURN aes_ni(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
	__m128i t1, t2;
	__m128i *ks = (__m128i*)cx->ks;

	if(!has_aes_ni())
	{
		aes_xi(encrypt_key128)(key, cx);
		return EXIT_SUCCESS;
	}

	t1 = _mm_loadu_si128((__m128i*)key);

	/* round 0 key is the cipher key itself */
	ks[0] = t1;

	/* rounds 1..10: keygen-assist with rcon 0x01,0x02,...,0x36 */
	t2 = _mm_aeskeygenassist_si128(t1, 0x1);
	t1 = aes_128_assist(t1, t2);
	ks[1] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x2);
	t1 = aes_128_assist(t1, t2);
	ks[2] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x4);
	t1 = aes_128_assist(t1, t2);
	ks[3] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x8);
	t1 = aes_128_assist(t1, t2);
	ks[4] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x10);
	t1 = aes_128_assist(t1, t2);
	ks[5] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x20);
	t1 = aes_128_assist(t1, t2);
	ks[6] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x40);
	t1 = aes_128_assist(t1, t2);
	ks[7] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x80);
	t1 = aes_128_assist(t1, t2);
	ks[8] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x1b);
	t1 = aes_128_assist(t1, t2);
	ks[9] = t1;

	t2 = _mm_aeskeygenassist_si128(t1, 0x36);
	t1 = aes_128_assist(t1, t2);
	ks[10] = t1;

	cx->inf.l = 0;
	cx->inf.b[0] = 10 * 16;	/* round count recorded as rounds * 16 */
	return EXIT_SUCCESS;
}
112 
aes_192_assist(__m128i * t1,__m128i * t2,__m128i * t3)113 __inline void aes_192_assist(__m128i* t1, __m128i * t2, __m128i * t3)
114 {
115 	__m128i t4;
116 	*t2 = _mm_shuffle_epi32(*t2, 0x55);
117 	t4 = _mm_slli_si128(*t1, 0x4);
118 	*t1 = _mm_xor_si128(*t1, t4);
119 	t4 = _mm_slli_si128(t4, 0x4);
120 	*t1 = _mm_xor_si128(*t1, t4);
121 	t4 = _mm_slli_si128(t4, 0x4);
122 	*t1 = _mm_xor_si128(*t1, t4);
123 	*t1 = _mm_xor_si128(*t1, *t2);
124 	*t2 = _mm_shuffle_epi32(*t1, 0xff);
125 	t4 = _mm_slli_si128(*t3, 0x4);
126 	*t3 = _mm_xor_si128(*t3, t4);
127 	*t3 = _mm_xor_si128(*t3, *t2);
128 }
129 
/* AES-192 key schedule using AES-NI.  Expands the 24 byte key into the
   13 round keys in cx->ks, falling back to aes_xi when AES-NI is
   absent.  Each assist step yields 192 key bits (1.5 round keys), so
   _mm_shuffle_pd is used to splice the 64 bit halves into consecutive
   128 bit round-key slots.  Unrolled because the rcon immediates must
   be literal constants. */
AES_RETURN aes_ni(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
	__m128i t1, t2, t3;
	__m128i *ks = (__m128i*)cx->ks;

	if(!has_aes_ni())
	{
		aes_xi(encrypt_key192)(key, cx);
		return EXIT_SUCCESS;
	}

	t1 = _mm_loadu_si128((__m128i*)key);
	t3 = _mm_loadu_si128((__m128i*)(key + 16));	/* only low 64 bits are key material */

	ks[0] = t1;
	ks[1] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x1);
	aes_192_assist(&t1, &t2, &t3);

	/* pack: low half of ks[1] kept, high half from t1; ks[2] spliced
	   from the high half of t1 and the low half of t3 */
	ks[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[1]), _mm_castsi128_pd(t1), 0));
	ks[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

	t2 = _mm_aeskeygenassist_si128(t3, 0x2);
	aes_192_assist(&t1, &t2, &t3);
	ks[3] = t1;
	ks[4] = t3;	/* high half overwritten by the next splice */

	t2 = _mm_aeskeygenassist_si128(t3, 0x4);
	aes_192_assist(&t1, &t2, &t3);
	ks[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[4]), _mm_castsi128_pd(t1), 0));
	ks[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

	t2 = _mm_aeskeygenassist_si128(t3, 0x8);
	aes_192_assist(&t1, &t2, &t3);
	ks[6] = t1;
	ks[7] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x10);
	aes_192_assist(&t1, &t2, &t3);
	ks[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[7]), _mm_castsi128_pd(t1), 0));
	ks[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

	t2 = _mm_aeskeygenassist_si128(t3, 0x20);
	aes_192_assist(&t1, &t2, &t3);
	ks[9] = t1;
	ks[10] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x40);
	aes_192_assist(&t1, &t2, &t3);
	ks[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[10]), _mm_castsi128_pd(t1), 0));
	ks[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

	t2 = _mm_aeskeygenassist_si128(t3, 0x80);
	aes_192_assist(&t1, &t2, &t3);
	ks[12] = t1;

	cx->inf.l = 0;
	cx->inf.b[0] = 12 * 16;	/* round count recorded as rounds * 16 */
	return EXIT_SUCCESS;
}
191 
aes_256_assist1(__m128i * t1,__m128i * t2)192 __inline void aes_256_assist1(__m128i* t1, __m128i * t2)
193 {
194 	__m128i t4;
195 	*t2 = _mm_shuffle_epi32(*t2, 0xff);
196 	t4 = _mm_slli_si128(*t1, 0x4);
197 	*t1 = _mm_xor_si128(*t1, t4);
198 	t4 = _mm_slli_si128(t4, 0x4);
199 	*t1 = _mm_xor_si128(*t1, t4);
200 	t4 = _mm_slli_si128(t4, 0x4);
201 	*t1 = _mm_xor_si128(*t1, t4);
202 	*t1 = _mm_xor_si128(*t1, *t2);
203 }
204 
/* AES-256 expansion step for odd round keys: apply SubWord to the top
   word of *t1 (keygen-assist with rcon 0), broadcast it, and fold it
   into *t3 together with three byte-shifted copies of *t3. */
__inline void aes_256_assist2(__m128i* t1, __m128i * t3)
{
	__m128i sub, s, c = *t3;

	sub = _mm_aeskeygenassist_si128(*t1, 0x0);
	sub = _mm_shuffle_epi32(sub, 0xaa);
	s = _mm_slli_si128(c, 0x4);
	c = _mm_xor_si128(c, s);
	s = _mm_slli_si128(s, 0x4);
	c = _mm_xor_si128(c, s);
	s = _mm_slli_si128(s, 0x4);
	c = _mm_xor_si128(c, s);
	*t3 = _mm_xor_si128(c, sub);
}
218 
/* AES-256 key schedule using AES-NI.  Expands the 32 byte key into the
   15 round keys in cx->ks, falling back to aes_xi when AES-NI is
   absent.  Even round keys come from assist1 (with rcon), odd ones
   from assist2 (SubWord only).  Unrolled because the rcon immediates
   must be literal constants. */
AES_RETURN aes_ni(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
	__m128i t1, t2, t3;
	__m128i *ks = (__m128i*)cx->ks;

	if(!has_aes_ni())
	{
		aes_xi(encrypt_key256)(key, cx);
		return EXIT_SUCCESS;
	}

	t1 = _mm_loadu_si128((__m128i*)key);
	t3 = _mm_loadu_si128((__m128i*)(key + 16));

	/* rounds 0 and 1 are the two halves of the cipher key */
	ks[0] = t1;
	ks[1] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x01);
	aes_256_assist1(&t1, &t2);
	ks[2] = t1;
	aes_256_assist2(&t1, &t3);
	ks[3] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x02);
	aes_256_assist1(&t1, &t2);
	ks[4] = t1;
	aes_256_assist2(&t1, &t3);
	ks[5] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x04);
	aes_256_assist1(&t1, &t2);
	ks[6] = t1;
	aes_256_assist2(&t1, &t3);
	ks[7] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x08);
	aes_256_assist1(&t1, &t2);
	ks[8] = t1;
	aes_256_assist2(&t1, &t3);
	ks[9] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x10);
	aes_256_assist1(&t1, &t2);
	ks[10] = t1;
	aes_256_assist2(&t1, &t3);
	ks[11] = t3;

	t2 = _mm_aeskeygenassist_si128(t3, 0x20);
	aes_256_assist1(&t1, &t2);
	ks[12] = t1;
	aes_256_assist2(&t1, &t3);
	ks[13] = t3;

	/* final round key needs no assist2 step */
	t2 = _mm_aeskeygenassist_si128(t3, 0x40);
	aes_256_assist1(&t1, &t2);
	ks[14] = t1;

	cx->inf.l = 0;
	cx->inf.b[0] = 14 * 16;	/* round count recorded as rounds * 16 */
	return EXIT_SUCCESS;
}
280 
/* Convert an encryption key schedule to a decryption schedule in place
   by applying InvMixColumns (aesimc) to every round key except the
   first and the last. */
__inline void enc_to_dec(aes_decrypt_ctx cx[1])
{
	__m128i *rk = (__m128i*)cx->ks;
	int i = 1, last = cx->inf.b[0] >> 4;

	while(i < last)
	{
		rk[i] = _mm_aesimc_si128(rk[i]);
		++i;
	}
}
289 
aes_ni(decrypt_key128)290 AES_RETURN aes_ni(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1])
291 {
292 	if(!has_aes_ni())
293 	{
294 		aes_xi(decrypt_key128)(key, cx);
295 		return EXIT_SUCCESS;
296 	}
297 
298 	if(aes_ni(encrypt_key128)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
299 	{
300 		enc_to_dec(cx);
301 		return EXIT_SUCCESS;
302 	}
303 	else
304 		return EXIT_FAILURE;
305 
306 }
307 
aes_ni(decrypt_key192)308 AES_RETURN aes_ni(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1])
309 {
310 	if(!has_aes_ni())
311 	{
312 		aes_xi(decrypt_key192)(key, cx);
313 		return EXIT_SUCCESS;
314 	}
315 
316 	if(aes_ni(encrypt_key192)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
317 	{
318 		enc_to_dec(cx);
319 		return EXIT_SUCCESS;
320 	}
321 	else
322 		return EXIT_FAILURE;
323 }
324 
aes_ni(decrypt_key256)325 AES_RETURN aes_ni(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1])
326 {
327 	if(!has_aes_ni())
328 	{
329 		aes_xi(decrypt_key256)(key, cx);
330 		return EXIT_SUCCESS;
331 	}
332 
333 	if(aes_ni(encrypt_key256)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
334 	{
335 		enc_to_dec(cx);
336 		return EXIT_SUCCESS;
337 	}
338 	else
339 		return EXIT_FAILURE;
340 }
341 
/* Encrypt one 16 byte block with AES-NI.  The round count comes from
   cx->inf.b[0] (rounds * 16) as set by the key schedule; the switch
   cases deliberately fall through so that 14-, 12- and 10-round keys
   share the common tail.  Returns EXIT_FAILURE for an unkeyed or
   corrupt context. */
AES_RETURN aes_ni(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
{
	__m128i *key = (__m128i*)cx->ks, t;

	/* reject a context whose round count was never set */
	if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
		return EXIT_FAILURE;

	if(!has_aes_ni())
	{
		aes_xi(encrypt)(in, out, cx);
		return EXIT_SUCCESS;
	}

	/* initial whitening with round key 0 */
	t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);

	switch(cx->inf.b[0])
	{
	case 14 * 16:
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		/* fall through */
	case 12 * 16:
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		/* fall through */
	case 10 * 16:
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenc_si128(t, *(__m128i*)++key);
		t = _mm_aesenclast_si128(t, *(__m128i*)++key);
	}

	_mm_storeu_si128(&((__m128i*)out)[0], t);
	return EXIT_SUCCESS;
}
381 
/* Decrypt one 16 byte block with AES-NI.  Walks the (aesimc
   transformed, see enc_to_dec) key schedule backwards from the last
   round key; the switch cases deliberately fall through so that 14-,
   12- and 10-round keys share the common tail.
   NOTE(review): the key pointer is computed from cx->inf.b[0] before
   the validity check below — harmless address arithmetic for the
   accepted values, but worth confirming for a corrupt context. */
AES_RETURN aes_ni(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
{
	__m128i *key = (__m128i*)cx->ks + (cx->inf.b[0] >> 4), t;

	/* reject a context whose round count was never set */
	if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
		return EXIT_FAILURE;

	if(!has_aes_ni())
	{
		aes_xi(decrypt)(in, out, cx);
		return EXIT_SUCCESS;
	}

	/* initial whitening with the final round key */
	t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *(__m128i*)key);

	switch(cx->inf.b[0])
	{
	case 14 * 16:
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		/* fall through */
	case 12 * 16:
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		/* fall through */
	case 10 * 16:
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdec_si128(t, *(__m128i*)--key);
		t = _mm_aesdeclast_si128(t, *(__m128i*)--key);
	}

	_mm_storeu_si128((__m128i*)out, t);
	return EXIT_SUCCESS;
}
421 
/* CBC-encrypt 'length' bytes from in to out with the expanded key
   'key' (whitening key followed by number_of_rounds round keys) and
   the 16 byte IV in ivec.
   NOTE(review): a partial final block is rounded UP to a whole block,
   so the caller must provide buffers padded to a 16 byte multiple.
   Fix: the block counter was 'int' while 'length' is unsigned long,
   a signed/unsigned comparison that breaks for very large inputs. */
void aes_CBC_encrypt(const unsigned char *in,
	unsigned char *out,
	unsigned char ivec[16],
	unsigned long length,
	unsigned char *key,
	int number_of_rounds)
{
	__m128i feedback, data;
	unsigned long i;	/* must match the type of 'length' */
	int j;

	/* convert the byte count into a whole number of 16 byte blocks */
	if(length % 16)
		length = length / 16 + 1;
	else
		length /= 16;

	feedback = _mm_loadu_si128((__m128i*)ivec);
	for(i = 0; i < length; i++)
	{
		data = _mm_loadu_si128(&((__m128i*)in)[i]);
		/* chain: xor plaintext with previous ciphertext (or the IV) */
		feedback = _mm_xor_si128(data, feedback);
		feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
		for(j = 1; j < number_of_rounds; j++)
			feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
		feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
		_mm_storeu_si128(&((__m128i*)out)[i], feedback);
	}
}
446 
/* CBC-decrypt 'length' bytes from in to out with the expanded
   decryption key 'key' and the 16 byte IV in ivec.
   NOTE(review): a partial final block is rounded UP to a whole block,
   so the caller must provide buffers padded to a 16 byte multiple.
   Fix: the block counter was 'int' while 'length' is unsigned long,
   a signed/unsigned comparison that breaks for very large inputs. */
void aes_CBC_decrypt(const unsigned char *in,
	unsigned char *out,
	unsigned char ivec[16],
	unsigned long length,
	unsigned char *key,
	int number_of_rounds)
{
	__m128i data, feedback, last_in;
	unsigned long i;	/* must match the type of 'length' */
	int j;

	/* convert the byte count into a whole number of 16 byte blocks */
	if(length % 16)
		length = length / 16 + 1;
	else
		length /= 16;

	feedback = _mm_loadu_si128((__m128i*)ivec);
	for(i = 0; i < length; i++)
	{
		last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
		data = _mm_xor_si128(last_in, ((__m128i*)key)[0]);
		for(j = 1; j < number_of_rounds; j++)
			data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
		data = _mm_aesdeclast_si128(data, ((__m128i*)key)[j]);
		/* unchain: xor with previous ciphertext (or the IV) */
		data = _mm_xor_si128(data, feedback);
		_mm_storeu_si128(&((__m128i*)out)[i], data);
		feedback = last_in;	/* keep ciphertext for the next block */
	}
}
474 
/* CTR-mode encrypt 'length' bytes from in to out.  The counter block
   is assembled from the 4 byte nonce and 8 byte IV, byte-swapped to
   big-endian per 64 bit lane, and incremented before each block.
   NOTE(review): a partial final block is rounded UP to a whole block,
   so the caller must provide buffers padded to a 16 byte multiple.
   Fixes: the block counter was 'int' while 'length' is unsigned long
   (signed/unsigned mismatch for very large inputs), and the IV/nonce
   were read through misaligned pointer casts — '*(long*)nonce' also
   over-reads 8 bytes on LP64 platforms where long is 64 bits; both
   reads now go through memcpy. */
void AES_CTR_encrypt(const unsigned char *in,
	unsigned char *out,
	const unsigned char ivec[8],
	const unsigned char nonce[4],
	unsigned long length,
	const unsigned char *key,
	int number_of_rounds)
{
	__m128i ctr_block = { 0 }, tmp, ONE, BSWAP_EPI64;
	long long iv_qword;
	int nonce_dword;
	unsigned long i;	/* must match the type of 'length' */
	int j;

	/* convert the byte count into a whole number of 16 byte blocks */
	if(length % 16)
		length = length / 16 + 1;
	else
		length /= 16;

	ONE = _mm_set_epi32(0, 1, 0, 0);
	BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

	/* safe unaligned loads of the caller's IV and nonce */
	memcpy(&iv_qword, ivec, 8);
	memcpy(&nonce_dword, nonce, 4);
	ctr_block = _mm_insert_epi64(ctr_block, iv_qword, 1);
	ctr_block = _mm_insert_epi32(ctr_block, nonce_dword, 1);
	ctr_block = _mm_srli_si128(ctr_block, 4);
	ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
	ctr_block = _mm_add_epi64(ctr_block, ONE);

	for(i = 0; i < length; i++)
	{
		tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
		ctr_block = _mm_add_epi64(ctr_block, ONE);
		tmp = _mm_xor_si128(tmp, ((__m128i*)key)[0]);
		for(j = 1; j < number_of_rounds; j++)
			tmp = _mm_aesenc_si128(tmp, ((__m128i*)key)[j]);
		tmp = _mm_aesenclast_si128(tmp, ((__m128i*)key)[j]);
		tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
		_mm_storeu_si128(&((__m128i*)out)[i], tmp);
	}
}
509 
510 #endif
511