xref: /freebsd/sys/crypto/aesni/aesni_ghash.c (revision 315ee00f)
1 /*-
2  * Copyright (c) 2014 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by John-Mark Gurney under
6  * the sponsorship of the FreeBSD Foundation and
7  * Rubicon Communications, LLC (Netgate).
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1.  Redistributions of source code must retain the above copyright
12  *     notice, this list of conditions and the following disclaimer.
13  * 2.  Redistributions in binary form must reproduce the above copyright
14  *     notice, this list of conditions and the following disclaimer in the
15  *     documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *
30  */
31 
32 /*
33  * Figure 5, 8 and 12 are copied from the Intel white paper:
34  * Intel® Carry-Less Multiplication Instruction and its Usage for
35  * Computing the GCM Mode
36  *
37  * and as such are:
38  * Copyright © 2010 Intel Corporation.
39  * All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *   * Redistributions of source code must retain the above copyright
45  *     notice, this list of conditions and the following disclaimer.
46  *   * Redistributions in binary form must reproduce the above copyright
47  *     notice, this list of conditions and the following disclaimer in the
48  *     documentation and/or other materials provided with the distribution.
49  *   * Neither the name of Intel Corporation nor the
50  *     names of its contributors may be used to endorse or promote products
51  *     derived from this software without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
54  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
55  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
56  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
57  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
58  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
59  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
60  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
61  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
62  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
63  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  */
65 
#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#include <string.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
76 
/*
 * Compare two 128-bit vectors for equality without branching on the data.
 *
 * Returns 1 when every bit of a and b matches, 0 otherwise.
 */
static inline int
m128icmp(__m128i a, __m128i b)
{
	/* Each matching byte yields 0xff, i.e. one set bit in the mask. */
	const int eq_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));

	return (eq_mask == 0xffff);
}
86 
#ifdef __i386__
/*
 * 32-bit stand-in for the _mm_insert_epi64 intrinsic: the 64-bit value b is
 * written into a as two 32-bit halves.
 *
 * ndx == 0 replaces the low quadword (dwords 0 and 1); any other value
 * replaces the high quadword (dwords 2 and 3).  The dword indices are
 * spelled as literals because _mm_insert_epi32 is called with constants only.
 */
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (ndx != 0) {
		a = _mm_insert_epi32(a, b, 2);
		return _mm_insert_epi32(a, b >> 32, 3);
	}

	a = _mm_insert_epi32(a, b, 0);
	return _mm_insert_epi32(a, b >> 32, 1);
}
#endif
103 
104 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
105 
/*
 * Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C)
 *
 * Multiply a by b in GF(2^128) as used by GHASH and store the reduced
 * 128-bit product in *res.  Callers in this file pass operands that have
 * already been byte-reversed, and res may point at one of the inputs
 * because both are taken by value.
 */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i lo, hi, mid;
	__m128i carry_lo, carry_hi, carry_x;
	__m128i fold, fold_hi, red;

	/* Four 64x64 carry-less partial products. */
	lo = _mm_clmulepi64_si128(a, b, 0x00);
	mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
	    _mm_clmulepi64_si128(a, b, 0x01));
	hi = _mm_clmulepi64_si128(a, b, 0x11);

	/* Spread the middle term over the 256-bit product hi:lo. */
	lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/*
	 * Shift hi:lo left by one bit.  The per-dword shifts drop the top bit
	 * of every 32-bit lane, so those bits are collected first and moved
	 * up one lane (and from the top of lo into the bottom of hi).
	 */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	carry_x = _mm_srli_si128(carry_lo, 12);
	carry_hi = _mm_slli_si128(carry_hi, 4);
	carry_lo = _mm_slli_si128(carry_lo, 4);
	lo = _mm_or_si128(lo, carry_lo);
	hi = _mm_or_si128(hi, carry_hi);
	hi = _mm_or_si128(hi, carry_x);

	/* Reduction, first phase: fold with left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	fold_hi = _mm_srli_si128(fold, 4);
	lo = _mm_xor_si128(lo, _mm_slli_si128(fold, 12));

	/* Reduction, second phase: right shifts by 1, 2 and 7. */
	red = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	red = _mm_xor_si128(red, _mm_srli_epi32(lo, 7));
	red = _mm_xor_si128(red, fold_hi);
	lo = _mm_xor_si128(lo, red);

	*res = _mm_xor_si128(hi, lo);
}
156 
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 *
 * Compute H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4 in the same field as gfmul() and
 * store the result in *res.  The four 256-bit products are accumulated
 * first, so the shift and reduction are done once instead of four times.
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i lo, hi, mid;
	__m128i hsum, xsum;
	__m128i carry_lo, carry_hi, carry_x;
	__m128i fold, fold_hi, red;

	/* Sum of the low-half products. */
	lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H2, X2, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H3, X3, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H4, X4, 0x00));

	/* Sum of the high-half products. */
	hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H2, X2, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H3, X3, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H4, X4, 0x11));

	/*
	 * Karatsuba middle terms: shuffle control 78 swaps the two quadwords,
	 * so the XOR leaves (low ^ high) in the low quadword of each operand.
	 */
	hsum = _mm_xor_si128(_mm_shuffle_epi32(H1, 78), H1);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X1, 78), X1);
	mid = _mm_clmulepi64_si128(hsum, xsum, 0x00);

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H2, 78), H2);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X2, 78), X2);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H3, 78), H3);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X3, 78), X3);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H4, 78), H4);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X4, 78), X4);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	mid = _mm_xor_si128(mid, lo);
	mid = _mm_xor_si128(mid, hi);

	/* Spread the middle term over the 256-bit accumulator hi:lo. */
	lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/* Shift hi:lo left by one bit, carrying across the 32-bit lanes. */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	carry_x = _mm_srli_si128(carry_lo, 12);
	carry_hi = _mm_slli_si128(carry_hi, 4);
	carry_lo = _mm_slli_si128(carry_lo, 4);
	lo = _mm_or_si128(lo, carry_lo);
	hi = _mm_or_si128(hi, carry_hi);
	hi = _mm_or_si128(hi, carry_x);

	/* Reduction, first phase: fold with left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	fold_hi = _mm_srli_si128(fold, 4);
	lo = _mm_xor_si128(lo, _mm_slli_si128(fold, 12));

	/* Reduction, second phase: right shifts by 1, 2 and 7. */
	red = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	red = _mm_xor_si128(red, _mm_srli_epi32(lo, 7));
	red = _mm_xor_si128(red, fold_hi);
	lo = _mm_xor_si128(lo, red);

	*res = _mm_xor_si128(hi, lo);
}
259 
260 /*
261  * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
262  * Every Four Blocks
263  */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^36-32 bytes.
 */
/*
 * AES-GCM authenticated encryption using AES-NI and PCLMULQDQ.
 *
 * in, out:  plaintext source and ciphertext destination, nbytes each.
 * addt:     additional authenticated data, abytes long.
 * ivec:     IV, ibytes long.  A 12-byte (96-bit) IV is turned into the
 *           initial counter block directly; any other length is hashed
 *           with GHASH to obtain it.
 * tag:      receives the 16-byte authentication tag.
 * key:      expanded key schedule, read as nr + 1 round keys by
 *           dereferencing a __m128i pointer, so it has to be 16-byte
 *           aligned.
 * nr:       number of AES rounds.  The loops that consume two round keys
 *           per iteration are only correct when nr is even.
 *
 * Block indices are kept in uint32_t and byte offsets are formed from the
 * uint32_t lengths, so no signed index arithmetic can overflow for large
 * nbytes/abytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	uint32_t i, k;
	int j, n;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	/* Added to dword 2, which holds the 32-bit block counter (see below). */
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	/* Byte-reverses each 64-bit half. */
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	/* Byte-reverses the whole 128-bit block. */
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	const uint32_t arem = abytes % 16;
	const uint32_t nrem = nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0x00000001.  Only the 12 IV bytes are read; a
		 * 16-byte load here would run past the end of the IV.
		 */
		Y = _mm_setzero_si128();
		memcpy(&Y, ivec, 96/8);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Y0 = GHASH(IV padded to a block || 0^64 || bit length) */
		for (n = 0; n < ibytes/16; n++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[n]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* last_block is still all-zero from its initializer */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[n*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH over the additional authenticated data. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (arem) {
		last_block = _mm_setzero_si128();
		memcpy(&last_block, addt + (abytes - arem), arem);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/*
	 * After BSWAP_EPI64 the big-endian 32-bit block counter sits in
	 * dword 2 in native byte order.  It is stepped with a 32-bit add so
	 * that it wraps modulo 2^32 without carrying into the rest of the
	 * counter block.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi32(ctr1, ONE);
	ctr2 = _mm_add_epi32(ctr1, ONE);
	ctr3 = _mm_add_epi32(ctr2, ONE);
	ctr4 = _mm_add_epi32(ctr3, ONE);
	ctr5 = _mm_add_epi32(ctr4, ONE);
	ctr6 = _mm_add_epi32(ctr5, ONE);
	ctr7 = _mm_add_epi32(ctr6, ONE);
	ctr8 = _mm_add_epi32(ctr7, ONE);

	/* Eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi32(ctr1, EIGHT);
		ctr2 = _mm_add_epi32(ctr2, EIGHT);
		ctr3 = _mm_add_epi32(ctr3, EIGHT);
		ctr4 = _mm_add_epi32(ctr4, EIGHT);
		ctr5 = _mm_add_epi32(ctr5, EIGHT);
		ctr6 = _mm_add_epi32(ctr6, EIGHT);
		ctr7 = _mm_add_epi32(ctr7, EIGHT);
		ctr8 = _mm_add_epi32(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		/* All eight inputs are loaded before any output is stored. */
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* Fold the fresh ciphertext into the hash, four at a time. */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi32(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Trailing partial block, if any. */
	if (nrem) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nrem);
		last_block = _mm_xor_si128(last_block, tmp1);
		memcpy(&((__m128i *)out)[k], &last_block, nrem);
		/* Bytes past nrem hold raw keystream; zero them for GHASH. */
		for (j = (int)nrem; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Length block: AAD bit count in the high half, text in the low. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
527 
/*
 * AES-GCM authenticated decryption; derived from AES_GCM_encrypt.
 *
 * The tag is recomputed over addt and the ciphertext in `in` and compared
 * with `tag` before anything is decrypted.  On mismatch the function
 * returns 0 and `out` is not written; on success nbytes of plaintext are
 * stored to `out` and 1 is returned.
 *
 * in, out:  ciphertext source and plaintext destination, nbytes each.
 * addt:     additional authenticated data, abytes long.
 * ivec:     IV, ibytes long.  A 12-byte (96-bit) IV is turned into the
 *           initial counter block directly; any other length is hashed
 *           with GHASH to obtain it.
 * tag:      16-byte tag to verify.
 * key:      expanded key schedule, read as nr + 1 round keys by
 *           dereferencing a __m128i pointer, so it has to be 16-byte
 *           aligned.
 * nr:       number of AES rounds.  The loops that consume two round keys
 *           per iteration are only correct when nr is even.
 *
 * Block indices are kept in uint32_t and byte offsets are formed from the
 * uint32_t lengths, so no signed index arithmetic can overflow for large
 * nbytes/abytes.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	uint32_t i, k;
	int j, n;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	/* Added to dword 2, which holds the 32-bit block counter (see below). */
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	/* Byte-reverses each 64-bit half. */
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	/* Byte-reverses the whole 128-bit block. */
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	const uint32_t arem = abytes % 16;
	const uint32_t nrem = nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0x00000001.  Only the 12 IV bytes are read; a
		 * 16-byte load here would run past the end of the IV.
		 */
		Y = _mm_setzero_si128();
		memcpy(&Y, ivec, 96/8);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Y0 = GHASH(IV padded to a block || 0^64 || bit length) */
		for (n = 0; n < ibytes/16; n++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[n]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* last_block is still all-zero from its initializer */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[n*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH over the additional authenticated data. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (arem) {
		last_block = _mm_setzero_si128();
		memcpy(&last_block, addt + (abytes - arem), arem);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nrem) {
		last_block = _mm_setzero_si128();
		memcpy(&last_block, in + (nbytes - nrem), nrem);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Length block: AAD bit count in the high half, text in the low. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; /* authentication failed; out is left untouched */

	/*
	 * After BSWAP_EPI64 the big-endian 32-bit block counter sits in
	 * dword 2 in native byte order.  It is stepped with a 32-bit add so
	 * that it wraps modulo 2^32 without carrying into the rest of the
	 * counter block.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi32(ctr1, ONE);
	ctr2 = _mm_add_epi32(ctr1, ONE);
	ctr3 = _mm_add_epi32(ctr2, ONE);
	ctr4 = _mm_add_epi32(ctr3, ONE);
	ctr5 = _mm_add_epi32(ctr4, ONE);
	ctr6 = _mm_add_epi32(ctr5, ONE);
	ctr7 = _mm_add_epi32(ctr6, ONE);
	ctr8 = _mm_add_epi32(ctr7, ONE);

	/* Eight blocks per iteration. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi32(ctr1, EIGHT);
		ctr2 = _mm_add_epi32(ctr2, EIGHT);
		ctr3 = _mm_add_epi32(ctr3, EIGHT);
		ctr4 = _mm_add_epi32(ctr4, EIGHT);
		ctr5 = _mm_add_epi32(ctr5, EIGHT);
		ctr6 = _mm_add_epi32(ctr6, EIGHT);
		ctr7 = _mm_add_epi32(ctr7, EIGHT);
		ctr8 = _mm_add_epi32(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		/* All eight inputs are loaded before any output is stored. */
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);
	}
	/* Remaining whole blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi32(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	/* Trailing partial block, if any. */
	if (nrem) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr-1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nrem);
		last_block = _mm_xor_si128(tmp1, last_block);
		/* Only the nrem valid bytes are copied out. */
		memcpy(&((__m128i *)out)[k], &last_block, nrem);
	}
	return 1; /* authentication succeeded and out holds the plaintext */
}
809