1 /* AES-NI accelerated AES for Libgcrypt
2 * Copyright (C) 2000, 2001, 2002, 2003, 2007,
3 * 2008, 2011, 2012 Free Software Foundation, Inc.
4 *
5 * This file is part of Libgcrypt.
6 *
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
11 *
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <config.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h> /* for memcmp() */
25
26 #include "types.h" /* for byte and u32 typedefs */
27 #include "g10lib.h"
28 #include "cipher.h"
29 #include "bufhelp.h"
30 #include "cipher-selftest.h"
31 #include "rijndael-internal.h"
32 #include "./cipher-internal.h"
33
34
35 #ifdef USE_AESNI
36
37
38 #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
39 /* Prevent compiler from issuing SSE instructions between asm blocks. */
40 # pragma GCC target("no-sse")
41 #endif
42 #if __clang__
43 # pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
44 #endif
45
46
47 #define ALWAYS_INLINE inline __attribute__((always_inline))
48 #define NO_INLINE __attribute__((noinline))
49 #define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
50
51 #define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
52 #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
53 #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
54
55
/* 128-bit block type used for copying AES round keys.  'packed' plus
   'aligned(1)' permits loads/stores from unaligned addresses, and
   'may_alias' allows casting the key-schedule byte arrays to this type
   without violating strict-aliasing rules.  */
typedef struct u128_s
{
  u32 a, b, c, d;
} __attribute__((packed, aligned(1), may_alias)) u128_t;
60
61
/* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
   because of 'pragma target'.

   Returns the OCB offset value L[ntz(N)] for block number N.  */
static ASM_FUNC_ATTR_INLINE const unsigned char *
aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
{
  unsigned long ntz;

  /* Assumes that N != 0.  'rep;bsf' is TZCNT on CPUs that support it
     and plain BSF otherwise; both count trailing zeros for N != 0.
     Only the low 32 bits of N are examined on 32-bit builds — assumes
     ntz(N) always fits there (TODO confirm; OCB L[] table is small).  */
  asm ("rep;bsfl %k[low], %k[ntz]\n\t"
       : [ntz] "=r" (ntz)
       : [low] "r" ((unsigned long)n)
       : "cc");

  return c->u_mode.ocb.L[ntz];
}
77
78
/* Two macros to be called before and after the use of AES-NI
   instructions.  There should be no external function calls between
   the use of these macros.  Their purpose is to make sure that the
   SSE registers are cleared and won't reveal any information about
   the key or the data.  */
84 #ifdef __WIN64__
85 /* XMM6-XMM15 are callee-saved registers on WIN64. */
86 # define aesni_prepare_2_7_variable char win64tmp[16 * 2]
87 # define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8]
88 # define aesni_prepare() do { } while (0)
89 # define aesni_prepare_2_7() \
90 do { asm volatile ("movdqu %%xmm6, %0\n\t" \
91 "movdqu %%xmm7, %1\n\t" \
92 : "=m" (*win64tmp), "=m" (*(win64tmp+16)) \
93 : \
94 : "memory"); \
95 } while (0)
96 # define aesni_prepare_8_15() \
97 do { asm volatile ("movdqu %%xmm8, 0*16(%0)\n\t" \
98 "movdqu %%xmm9, 1*16(%0)\n\t" \
99 "movdqu %%xmm10, 2*16(%0)\n\t" \
100 "movdqu %%xmm11, 3*16(%0)\n\t" \
101 "movdqu %%xmm12, 4*16(%0)\n\t" \
102 "movdqu %%xmm13, 5*16(%0)\n\t" \
103 "movdqu %%xmm14, 6*16(%0)\n\t" \
104 "movdqu %%xmm15, 7*16(%0)\n\t" \
105 : \
106 : "r" (win64tmp8_15) \
107 : "memory"); \
108 } while (0)
109 # define aesni_cleanup() \
110 do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
111 "pxor %%xmm1, %%xmm1\n" :: ); \
112 } while (0)
113 # define aesni_cleanup_2_7() \
114 do { asm volatile ("movdqu %0, %%xmm6\n\t" \
115 "movdqu %1, %%xmm7\n\t" \
116 "pxor %%xmm2, %%xmm2\n" \
117 "pxor %%xmm3, %%xmm3\n" \
118 "pxor %%xmm4, %%xmm4\n" \
119 "pxor %%xmm5, %%xmm5\n" \
120 : \
121 : "m" (*win64tmp), "m" (*(win64tmp+16)) \
122 : "memory"); \
123 } while (0)
124 # define aesni_cleanup_8_15() \
125 do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t" \
126 "movdqu 1*16(%0), %%xmm9\n\t" \
127 "movdqu 2*16(%0), %%xmm10\n\t" \
128 "movdqu 3*16(%0), %%xmm11\n\t" \
129 "movdqu 4*16(%0), %%xmm12\n\t" \
130 "movdqu 5*16(%0), %%xmm13\n\t" \
131 "movdqu 6*16(%0), %%xmm14\n\t" \
132 "movdqu 7*16(%0), %%xmm15\n\t" \
133 : \
134 : "r" (win64tmp8_15) \
135 : "memory"); \
136 } while (0)
137 #else
138 # define aesni_prepare_2_7_variable
139 # define aesni_prepare() do { } while (0)
140 # define aesni_prepare_2_7() do { } while (0)
141 # define aesni_cleanup() \
142 do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
143 "pxor %%xmm1, %%xmm1\n" :: ); \
144 } while (0)
145 # define aesni_cleanup_2_7() \
146 do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \
147 "pxor %%xmm2, %%xmm2\n\t" \
148 "pxor %%xmm3, %%xmm3\n" \
149 "pxor %%xmm4, %%xmm4\n" \
150 "pxor %%xmm5, %%xmm5\n" \
151 "pxor %%xmm6, %%xmm6\n":: ); \
152 } while (0)
153 # ifdef __x86_64__
154 # define aesni_prepare_8_15_variable
155 # define aesni_prepare_8_15() do { } while (0)
156 # define aesni_cleanup_8_15() \
157 do { asm volatile ("pxor %%xmm8, %%xmm8\n" \
158 "pxor %%xmm9, %%xmm9\n" \
159 "pxor %%xmm10, %%xmm10\n" \
160 "pxor %%xmm11, %%xmm11\n" \
161 "pxor %%xmm12, %%xmm12\n" \
162 "pxor %%xmm13, %%xmm13\n" \
163 "pxor %%xmm14, %%xmm14\n" \
164 "pxor %%xmm15, %%xmm15\n":: ); \
165 } while (0)
166 # endif
167 #endif
168
/* Expand KEY into the per-round encryption key schedule stored in
 * ctx->keyschenc, using the AESKEYGENASSIST instruction.  The key size
 * is selected through ctx->rounds: <12 means AES-128 (10 rounds),
 * ==12 means AES-192, >12 means AES-256 (14 rounds).  The key schedule
 * is stored with 'movdqa', so ctx->keyschenc must be 16-byte aligned.
 * AESKEYGENASSIST is emitted as raw '.byte' sequences so the file also
 * assembles with binutils that predate AES-NI support.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
{
  aesni_prepare_2_7_variable;

  aesni_prepare();
  aesni_prepare_2_7();

  if (ctx->rounds < 12)
    {
      /* 128-bit key */
#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
      /* One round of the AES-128 key expansion: xmm2 holds the
         AESKEYGENASSIST output, xmm1 the previous round key; the
         repeated pslldq/pxor steps compute the running prefix-XOR of
         the four key words, producing the next round key in xmm1.  */
#define AESKEY_EXPAND128 \
	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm1, %%xmm3\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm1\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm1\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm2\n\t" \
	"pxor   %%xmm2, %%xmm1\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key   */
                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x01)    /* round constant 0x01 */
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x02)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x04)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x08)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x10)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x20)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x40)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x80)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x1b)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x36)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm1_xmm2
#undef AESKEY_EXPAND128
    }
  else if (ctx->rounds == 12)
    {
      /* 192-bit key.  AES-192 expands six words at a time, so every
         pair of expansion steps yields three 128-bit round keys; the
         shufpd instructions stitch the 64-bit halves together.  */
#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
#define AESKEY_EXPAND192 \
	"pshufd $0x55, %%xmm2, %%xmm2\n\t" \
	"movdqu %%xmm1, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pxor %%xmm2, %%xmm1\n\t" \
	"pshufd $0xff, %%xmm1, %%xmm2\n\t" \
	"movdqu %%xmm3, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pxor %%xmm2, %%xmm3\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
                    "movq 16(%[key]), %%xmm3\n\t"      /* xmm3 := key[16..23]  */
                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x01)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x02)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x04)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x08)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x10)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x20)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x40)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x80)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm3_xmm2
#undef AESKEY_EXPAND192
    }
  else if (ctx->rounds > 12)
    {
      /* 256-bit key.  Two interleaved expansion steps: EXPAND256_A
         derives the next even round key (with a round constant),
         EXPAND256_B the next odd one (constant 0x00, using the 0xaa
         shuffle to pick the SubWord result instead of RotWord).  */
#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
#define AESKEY_EXPAND256_A \
	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm1, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pxor %%xmm2, %%xmm1\n\t"
#define AESKEY_EXPAND256_B \
	"pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm3, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pxor %%xmm2, %%xmm3\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
                    "movdqu 16(%[key]), %%xmm3\n\t"    /* xmm3 := key[16..31]  */
                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
                    "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x01)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x02)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x04)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x08)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x10)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x20)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x40)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1  */

                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm1_xmm2
#undef AESKEYGENASSIST_xmm3_xmm2
#undef AESKEY_EXPAND256_A
#undef AESKEY_EXPAND256_B
    }

  /* Wipe the SSE registers that held key material.  */
  aesni_cleanup();
  aesni_cleanup_2_7();
}
400
401
/* Make a decryption key from an encryption key. */
static ASM_FUNC_ATTR_INLINE void
do_aesni_prepare_decryption (RIJNDAEL_context *ctx)
{
  /* The AES-NI decrypt instructions use the Equivalent Inverse
     Cipher, thus we can't use the standard decrypt key
     preparation.  Instead, the decryption schedule is the encryption
     schedule in reverse order, with AESIMC (InvMixColumns) applied to
     every round key except the first and last.  */
  u128_t *ekey = (u128_t *)ctx->keyschenc;
  u128_t *dkey = (u128_t *)ctx->keyschdec;
  int rr;
  int r;

  /* dkey[r] := InvMixColumns(ekey[rr]), via xmm1.  The AESIMC opcode
     is emitted as raw bytes for old assemblers.  */
#define DO_AESNI_AESIMC() \
  asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
                /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
                ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
                "movdqa %%xmm1, %[dkey]" \
                : [dkey] "=m" (dkey[r]) \
                : [ekey] "m" (ekey[rr]) \
                : "memory")

  /* First decryption round key is the last encryption round key,
     used as-is (no AESIMC).  */
  dkey[0] = ekey[ctx->rounds];
  r=1;
  rr=ctx->rounds-1;
  /* Unrolled: r counts up through dkey[], rr counts down through
     ekey[].  Nine middle rounds are common to all key sizes; AES-192
     adds two more and AES-256 another two.  */
  DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
  if (ctx->rounds > 10)
    {
      DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
      DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
      if (ctx->rounds > 12)
	{
	  DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
	  DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
	}
    }

  /* Last decryption round key is the first encryption round key.  */
  dkey[r] = ekey[0];

#undef DO_AESNI_AESIMC
}
450
/* Public entry point: derive ctx->keyschdec from ctx->keyschenc.
 * Wraps the inline worker between the prepare/cleanup macros so the
 * SSE registers used (xmm1) are wiped afterwards.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
{
  aesni_prepare();
  do_aesni_prepare_decryption (ctx);
  aesni_cleanup();
}
458
459
/* Encrypt one block using the Intel AES-NI instructions.  Block is input
 * and output through SSE register xmm0.  xmm1 is clobbered as round-key
 * scratch.  The round count (10/12/14) is checked at run time; the two
 * conditional jumps skip the extra AES-192/AES-256 rounds.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_enc (const RIJNDAEL_context *ctx)
{
/* AESENC/AESENCLAST encoded as raw bytes so the code assembles with
   binutils that predate AES-NI support.  */
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
  asm volatile ("movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                "\n"
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
511
512
/* Decrypt one block using the Intel AES-NI instructions.  Block is input
 * and output through SSE register xmm0.  xmm1 is clobbered as round-key
 * scratch.  Uses ctx->keyschdec, which must have been prepared by
 * do_aesni_prepare_decryption beforehand.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_dec (const RIJNDAEL_context *ctx)
{
/* AESDEC/AESDECLAST encoded as raw bytes for old assemblers.  */
#define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
#define aesdeclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
  asm volatile ("movdqa (%[key]), %%xmm1\n\t"
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Ldeclast%=:\n\t"
                aesdeclast_xmm1_xmm0
                "\n"
                :
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesdec_xmm1_xmm0
#undef aesdeclast_xmm1_xmm0
}
564
565
/* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
 * and output through SSE registers xmm1 to xmm4.  xmm0 is clobbered as
 * round-key scratch.  Note: the local label is named ".Ldeclast" even in
 * this encrypt variant (the "%=" suffix keeps it unique per asm block, so
 * this is harmless, just a leftover name from the decrypt version).  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
{
/* AESENC/AESENCLAST for xmm1..xmm4, encoded as raw bytes for old
   assemblers.  */
#define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
#define aesenc_xmm0_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
#define aesenc_xmm0_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
#define aesenc_xmm0_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
#define aesenclast_xmm0_xmm1  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
#define aesenclast_xmm0_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
#define aesenclast_xmm0_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
#define aesenclast_xmm0_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                aesenc_xmm0_xmm1
                aesenc_xmm0_xmm2
                aesenc_xmm0_xmm3
                aesenc_xmm0_xmm4
                "movdqa 0xe0(%[key]), %%xmm0\n"

                ".Ldeclast%=:\n\t"
                aesenclast_xmm0_xmm1
                aesenclast_xmm0_xmm2
                aesenclast_xmm0_xmm3
                aesenclast_xmm0_xmm4
                : /* no output */
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesenc_xmm0_xmm1
#undef aesenc_xmm0_xmm2
#undef aesenc_xmm0_xmm3
#undef aesenc_xmm0_xmm4
#undef aesenclast_xmm0_xmm1
#undef aesenclast_xmm0_xmm2
#undef aesenclast_xmm0_xmm3
#undef aesenclast_xmm0_xmm4
}
673
674
/* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
 * and output through SSE registers xmm1 to xmm4.  xmm0 is clobbered as
 * round-key scratch.  Uses ctx->keyschdec (Equivalent Inverse Cipher
 * schedule), which must be prepared beforehand.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
{
/* AESDEC/AESDECLAST for xmm1..xmm4, encoded as raw bytes for old
   assemblers.  */
#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xe0(%[key]), %%xmm0\n"

                ".Ldeclast%=:\n\t"
                aesdeclast_xmm0_xmm1
                aesdeclast_xmm0_xmm2
                aesdeclast_xmm0_xmm3
                aesdeclast_xmm0_xmm4
                : /* no output */
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesdec_xmm0_xmm1
#undef aesdec_xmm0_xmm2
#undef aesdec_xmm0_xmm3
#undef aesdec_xmm0_xmm4
#undef aesdeclast_xmm0_xmm1
#undef aesdeclast_xmm0_xmm2
#undef aesdeclast_xmm0_xmm3
#undef aesdeclast_xmm0_xmm4
}
782
783
#ifdef __x86_64__

/* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
 * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  xmm0 is
 * clobbered as round-key scratch.  x86-64 only (uses xmm8..xmm11), so plain
 * aesenc mnemonics are used here rather than '.byte' encodings.
 *
 * Note: key loads start at offset 0x10 and no AESENCLAST is issued — the
 * caller is expected to have XORed round key 0 into the blocks beforehand
 * and to apply the final round itself (xmm0 holds the last round key on
 * exit).  The single 'cmpl $12' early on sets the flags consumed by the
 * later 'jb'/'je' to skip the AES-192/AES-256 extra rounds.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
{
  asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                "cmpl $12, %[rounds]\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "jb .Ldeclast%=\n\t"     /* rounds < 12: AES-128, done */
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "je .Ldeclast%=\n\t"     /* rounds == 12: AES-192, done */
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                "aesenc %%xmm0, %%xmm1\n\t"
                "aesenc %%xmm0, %%xmm2\n\t"
                "aesenc %%xmm0, %%xmm3\n\t"
                "aesenc %%xmm0, %%xmm4\n\t"
                "aesenc %%xmm0, %%xmm8\n\t"
                "aesenc %%xmm0, %%xmm9\n\t"
                "aesenc %%xmm0, %%xmm10\n\t"
                "aesenc %%xmm0, %%xmm11\n\t"
                "movdqa 0xe0(%[key]), %%xmm0\n"

                ".Ldeclast%=:\n\t"
                : /* no output */
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
}
919
920
921 /* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input
922 * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
923 static ASM_FUNC_ATTR_INLINE void
do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
{
  /* Run AES decryption rounds 1..N-1 on eight blocks in parallel.

     Register contract (inferred from the code below -- maintained by the
     callers, not visible here; confirm against call sites):
       - The eight cipher states live in xmm1-xmm4 and xmm8-xmm11 and must
         already be XORed with round key 0, since the first instruction
         applies round key 1 directly.
       - xmm0 is clobbered as the round-key scratch register and holds the
         LAST round key on exit; the caller is expected to perform the
         final aesdeclast (and key XOR) itself.  */
  asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
                /* Set flags once; tested below after the 0xa0 load
                   (jb: 10 rounds / AES-128) and after the 0xc0 load
                   (je: 12 rounds / AES-192).  */
                "cmpl $12, %[rounds]\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "jb .Ldeclast%=\n\t"          /* rounds < 12: AES-128 done */
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "je .Ldeclast%=\n\t"          /* rounds == 12: AES-192 done */
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xe0(%[key]), %%xmm0\n" /* last round key -> xmm0 */

                ".Ldeclast%=:\n\t"
                : /* no output */
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
}
1054
1055 #endif /* __x86_64__ */
1056
1057
/* Perform a CTR encryption round using the counter CTR and the input
   block A.  Write the result to the output block B and update CTR.
   CTR needs to be a 16 byte aligned little-endian value.

   Register contract (set up by _gcry_aes_aesni_ctr_enc): xmm5 holds a
   copy of *CTR and xmm6 the byte-swap (big<->little endian) mask.
   xmm0 and xmm1 are clobbered.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr (const RIJNDAEL_context *ctx,
              unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
/* Raw opcode bytes for "aesenc %xmm1, %xmm0" / "aesenclast %xmm1, %xmm0",
   spelled out for assemblers that do not know the AES-NI mnemonics.  */
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"

  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5)  */
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 (low qword) */

                "pshufb %%xmm6, %%xmm5\n\t"     /* byte-swap to little endian */
                "psubq %%xmm1, %%xmm5\n\t"      /* xmm5++ (big endian) */

                /* detect if 64-bit carry handling is needed: the low
                   64 bits of the big-endian counter were all ones.  */
                "cmpl $0xffffffff, 8(%[ctr])\n\t"
                "jne .Lno_carry%=\n\t"
                "cmpl $0xffffffff, 12(%[ctr])\n\t"
                "jne .Lno_carry%=\n\t"

                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                "psubq %%xmm1, %%xmm5\n\t"      /* add carry to upper 64bits */

                ".Lno_carry%=:\n\t"

                "pshufb %%xmm6, %%xmm5\n\t"     /* back to big endian */
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */

                "pxor (%[key]), %%xmm0\n\t"     /* xmm0 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"            /* AES-128: 0xa0 is last key */
                aesenc_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"            /* AES-192: 0xc0 is last key */
                aesenc_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n" /* AES-256: 0xe0 is last key */

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                "movdqu %[src], %%xmm1\n\t"     /* xmm1 := input   */
                "pxor %%xmm1, %%xmm0\n\t"       /* EncCTR ^= input  */
                "movdqu %%xmm0, %[dst]"         /* Store EncCTR.    */

                : [dst] "=m" (*b)
                : [src] "m" (*a),
                  [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [rounds] "g" (ctx->rounds)
                : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
1137
1138
/* Four blocks at a time variant of do_aesni_ctr.

   Same register contract as do_aesni_ctr: xmm5 holds the current CTR
   and xmm6 the endian-swap mask on entry; on exit xmm5 holds CTR+4.
   The common case (no carry out of the low counter byte) avoids the
   byte-swap entirely by adding precomputed big-endian constants.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
  /* Big-endian +1..+4 addends for the fast path.  */
  static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) =
    {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
    };
  const void *bige_addb = bige_addb_const;
/* Raw opcode bytes for aesenc/aesenclast with fixed register operands,
   for assemblers without AES-NI mnemonic support.  */
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenc_xmm1_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
#define aesenc_xmm1_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
#define aesenc_xmm1_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
#define aesenclast_xmm1_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
#define aesenclast_xmm1_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
#define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"

  /* Register usage:
      [key] keyschedule
      xmm0  CTR-0
      xmm1  temp / round key
      xmm2  CTR-1
      xmm3  CTR-2
      xmm4  CTR-3
      xmm5  copy of *ctr
      xmm6  endian swapping mask
   */

  asm volatile (/* detect if 8-bit carry handling is needed */
                "addb $4, 15(%[ctr])\n\t"
                "jc .Ladd32bit%=\n\t"

                /* Fast path: add big-endian constants directly.  */
                "movdqa %%xmm5, %%xmm0\n\t"         /* xmm0 := CTR (xmm5) */
                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
                "movdqa 3*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(4) */
                "paddb %%xmm0, %%xmm2\n\t"          /* xmm2 := be(1) + CTR (xmm0) */
                "paddb %%xmm0, %%xmm3\n\t"          /* xmm3 := be(2) + CTR (xmm0) */
                "paddb %%xmm0, %%xmm4\n\t"          /* xmm4 := be(3) + CTR (xmm0) */
                "paddb %%xmm0, %%xmm5\n\t"          /* xmm5 := be(4) + CTR (xmm0) */
                "movdqa (%[key]), %%xmm1\n\t"       /* xmm1 := key[0] */
                "jmp .Ldone_ctr%=\n\t"

                /* Slow path: full little-endian 128-bit increment with
                   explicit 64-bit carry propagation.  */
                ".Ladd32bit%=:\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR (undo addb). */
                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                "movdqa %%xmm0, %%xmm2\n\t"
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 (low qword) */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
                "psubq %%xmm1, %%xmm2\n\t"      /* xmm2++           */
                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
                "psubq %%xmm1, %%xmm3\n\t"      /* xmm3++           */
                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
                "psubq %%xmm1, %%xmm4\n\t"      /* xmm4++           */
                "movdqa %%xmm4, %%xmm5\n\t"     /* xmm5 := xmm4     */
                "psubq %%xmm1, %%xmm5\n\t"      /* xmm5++           */

                /* detect if 64-bit carry handling is needed */
                "cmpl $0xffffffff, 8(%[ctr])\n\t"
                "jne .Lno_carry%=\n\t"
                "movl 12(%[ctr]), %%esi\n\t"
                "bswapl %%esi\n\t"
                "cmpl $0xfffffffc, %%esi\n\t"
                "jb .Lno_carry%=\n\t"           /* no carry */

                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                /* Dispatch on how many of the four counters wrap the low
                   qword; the fall-through chain applies the carry to all
                   counters at or after the wrap point.  */
                "je .Lcarry_xmm5%=\n\t"         /* esi == 0xfffffffc */
                "cmpl $0xfffffffe, %%esi\n\t"
                "jb .Lcarry_xmm4%=\n\t"         /* esi == 0xfffffffd */
                "je .Lcarry_xmm3%=\n\t"         /* esi == 0xfffffffe */
                /* esi == 0xffffffff */

                "psubq %%xmm1, %%xmm2\n\t"
                ".Lcarry_xmm3%=:\n\t"
                "psubq %%xmm1, %%xmm3\n\t"
                ".Lcarry_xmm4%=:\n\t"
                "psubq %%xmm1, %%xmm4\n\t"
                ".Lcarry_xmm5%=:\n\t"
                "psubq %%xmm1, %%xmm5\n\t"

                ".Lno_carry%=:\n\t"
                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */

                ".Ldone_ctr%=:\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
                :
                : [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [addb] "r" (bige_addb)
                : "%esi", "cc", "memory");

  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"            /* AES-128 */
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"            /* AES-192 */
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xe0(%[key]), %%xmm1\n" /* AES-256 */

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                aesenclast_xmm1_xmm2
                aesenclast_xmm1_xmm3
                aesenclast_xmm1_xmm4
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");

  /* XOR the four keystream blocks with the input.  */
  asm volatile ("movdqu (%[src]), %%xmm1\n\t"    /* Get block 1.      */
                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR-1 ^= input */
                "movdqu %%xmm0, (%[dst])\n\t"    /* Store block 1     */

                "movdqu 16(%[src]), %%xmm1\n\t"  /* Get block 2.      */
                "pxor %%xmm1, %%xmm2\n\t"        /* EncCTR-2 ^= input */
                "movdqu %%xmm2, 16(%[dst])\n\t"  /* Store block 2.    */

                "movdqu 32(%[src]), %%xmm1\n\t"  /* Get block 3.      */
                "pxor %%xmm1, %%xmm3\n\t"        /* EncCTR-3 ^= input */
                "movdqu %%xmm3, 32(%[dst])\n\t"  /* Store block 3.    */

                "movdqu 48(%[src]), %%xmm1\n\t"  /* Get block 4.      */
                "pxor %%xmm1, %%xmm4\n\t"        /* EncCTR-4 ^= input */
                "movdqu %%xmm4, 48(%[dst])"      /* Store block 4.    */
                :
                : [src] "r" (a),
                  [dst] "r" (b)
                : "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
#undef aesenc_xmm1_xmm3
#undef aesenc_xmm1_xmm4
#undef aesenclast_xmm1_xmm0
#undef aesenclast_xmm1_xmm2
#undef aesenclast_xmm1_xmm3
#undef aesenclast_xmm1_xmm4
}
1357
1358
1359 #ifdef __x86_64__
1360
/* Eight blocks at a time variant of do_aesni_ctr.

   Same register contract as do_aesni_ctr (xmm5 = CTR, xmm6 = endian
   mask on entry).  The first two AES rounds are interleaved with the
   counter setup; the final round fuses the last-round-key XOR with the
   plaintext XOR by feeding (plaintext ^ lastkey) as the aesenclast
   source operand, so the aesenclast result is already the ciphertext.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
                unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
  /* Big-endian +1..+8 addends for the fast path.  */
  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
    {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
    };
  const void *bige_addb = bige_addb_const;

  /* Register usage:
      [key] keyschedule
      xmm0  CTR-0
      xmm1  temp / round key
      xmm2  CTR-1
      xmm3  CTR-2
      xmm4  CTR-3
      xmm5  copy of *ctr
      xmm6  endian swapping mask
      xmm8  CTR-4
      xmm9  CTR-5
      xmm10 CTR-6
      xmm11 CTR-7
      xmm12 temp
      xmm13 temp
      xmm14 temp
      xmm15 temp
   */

  asm volatile (/* detect if 8-bit carry handling is needed */
                "addb $8, 15(%[ctr])\n\t"
                "jc .Ladd32bit%=\n\t"

                /* Fast path: big-endian adds, with AES rounds 0 and 1
                   interleaved to hide latency.  */
                "movdqa (%[key]), %%xmm1\n\t"       /* xmm1 := key[0] */
                "movdqa 16(%[key]), %%xmm7\n\t"     /* xmm7 := key[1] */

                "movdqa %%xmm5, %%xmm0\n\t"         /* xmm0 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm2\n\t"         /* xmm2 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm3\n\t"         /* xmm3 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm4\n\t"         /* xmm4 := CTR (xmm5) */
                "paddb 0*16(%[addb]), %%xmm2\n\t"   /* xmm2 := be(1) + CTR */
                "paddb 1*16(%[addb]), %%xmm3\n\t"   /* xmm3 := be(2) + CTR */
                "paddb 2*16(%[addb]), %%xmm4\n\t"   /* xmm4 := be(3) + CTR */
                "pxor   %%xmm1, %%xmm0\n\t"         /* xmm0 ^= key[0] */
                "pxor   %%xmm1, %%xmm2\n\t"         /* xmm2 ^= key[0] */
                "pxor   %%xmm1, %%xmm3\n\t"         /* xmm3 ^= key[0] */
                "pxor   %%xmm1, %%xmm4\n\t"         /* xmm4 ^= key[0] */
                "aesenc %%xmm7, %%xmm0\n\t"
                "aesenc %%xmm7, %%xmm2\n\t"
                "aesenc %%xmm7, %%xmm3\n\t"
                "aesenc %%xmm7, %%xmm4\n\t"
                "movdqa %%xmm5, %%xmm8\n\t"         /* xmm8 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm9\n\t"         /* xmm9 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm10\n\t"        /* xmm10 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm11\n\t"        /* xmm11 := CTR (xmm5) */
                "paddb 3*16(%[addb]), %%xmm8\n\t"   /* xmm8 := be(4) + CTR */
                "paddb 4*16(%[addb]), %%xmm9\n\t"   /* xmm9 := be(5) + CTR */
                "paddb 5*16(%[addb]), %%xmm10\n\t"  /* xmm10 := be(6) + CTR */
                "paddb 6*16(%[addb]), %%xmm11\n\t"  /* xmm11 := be(7) + CTR */
                "pxor   %%xmm1, %%xmm8\n\t"         /* xmm8 ^= key[0] */
                "pxor   %%xmm1, %%xmm9\n\t"         /* xmm9 ^= key[0] */
                "pxor   %%xmm1, %%xmm10\n\t"        /* xmm10 ^= key[0] */
                "pxor   %%xmm1, %%xmm11\n\t"        /* xmm11 ^= key[0] */
                "aesenc %%xmm7, %%xmm8\n\t"
                "aesenc %%xmm7, %%xmm9\n\t"
                "aesenc %%xmm7, %%xmm10\n\t"
                "aesenc %%xmm7, %%xmm11\n\t"

                "paddb 7*16(%[addb]), %%xmm5\n\t"   /* xmm5 := be(8) + CTR */

                "jmp .Ldone_ctr%=\n\t"

                /* Slow path: little-endian 128-bit increments with
                   64-bit carry propagation.  */
                ".Ladd32bit%=:\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR (undo addb). */
                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                "movdqa %%xmm0, %%xmm2\n\t"
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 (low qword) */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2)  */
                "psubq %%xmm1, %%xmm2\n\t"      /* xmm2++            */
                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2      */
                "psubq %%xmm1, %%xmm3\n\t"      /* xmm3++            */
                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3      */
                "psubq %%xmm1, %%xmm4\n\t"      /* xmm4++            */
                "movdqa %%xmm4, %%xmm8\n\t"     /* xmm8 := xmm4      */
                "psubq %%xmm1, %%xmm8\n\t"      /* xmm8++            */
                "movdqa %%xmm8, %%xmm9\n\t"     /* xmm9 := xmm8      */
                "psubq %%xmm1, %%xmm9\n\t"      /* xmm9++            */
                "movdqa %%xmm9, %%xmm10\n\t"    /* xmm10 := xmm9     */
                "psubq %%xmm1, %%xmm10\n\t"     /* xmm10++           */
                "movdqa %%xmm10, %%xmm11\n\t"   /* xmm11 := xmm10    */
                "psubq %%xmm1, %%xmm11\n\t"     /* xmm11++           */
                "movdqa %%xmm11, %%xmm5\n\t"    /* xmm5 := xmm11     */
                "psubq %%xmm1, %%xmm5\n\t"      /* xmm5++            */

                /* detect if 64-bit carry handling is needed */
                "cmpl $0xffffffff, 8(%[ctr])\n\t"
                "jne .Lno_carry%=\n\t"
                "movl 12(%[ctr]), %%esi\n\t"
                "bswapl %%esi\n\t"
                "cmpl $0xfffffff8, %%esi\n\t"
                "jb .Lno_carry%=\n\t"           /* no carry */

                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                /* Dispatch on the wrap point; fall-through applies the
                   carry to every counter at or after it.  */
                "je .Lcarry_xmm5%=\n\t"         /* esi == 0xfffffff8 */
                "cmpl $0xfffffffa, %%esi\n\t"
                "jb .Lcarry_xmm11%=\n\t"        /* esi == 0xfffffff9 */
                "je .Lcarry_xmm10%=\n\t"        /* esi == 0xfffffffa */
                "cmpl $0xfffffffc, %%esi\n\t"
                "jb .Lcarry_xmm9%=\n\t"         /* esi == 0xfffffffb */
                "je .Lcarry_xmm8%=\n\t"         /* esi == 0xfffffffc */
                "cmpl $0xfffffffe, %%esi\n\t"
                "jb .Lcarry_xmm4%=\n\t"         /* esi == 0xfffffffd */
                "je .Lcarry_xmm3%=\n\t"         /* esi == 0xfffffffe */
                /* esi == 0xffffffff */

                "psubq %%xmm1, %%xmm2\n\t"
                ".Lcarry_xmm3%=:\n\t"
                "psubq %%xmm1, %%xmm3\n\t"
                ".Lcarry_xmm4%=:\n\t"
                "psubq %%xmm1, %%xmm4\n\t"
                ".Lcarry_xmm8%=:\n\t"
                "psubq %%xmm1, %%xmm8\n\t"
                ".Lcarry_xmm9%=:\n\t"
                "psubq %%xmm1, %%xmm9\n\t"
                ".Lcarry_xmm10%=:\n\t"
                "psubq %%xmm1, %%xmm10\n\t"
                ".Lcarry_xmm11%=:\n\t"
                "psubq %%xmm1, %%xmm11\n\t"
                ".Lcarry_xmm5%=:\n\t"
                "psubq %%xmm1, %%xmm5\n\t"

                ".Lno_carry%=:\n\t"
                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
                "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1]    */

                /* Swap back to big endian and run rounds 0 and 1.  */
                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2)  */
                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3)  */
                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4)  */
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
                "aesenc %%xmm7, %%xmm0\n\t"
                "aesenc %%xmm7, %%xmm2\n\t"
                "aesenc %%xmm7, %%xmm3\n\t"
                "aesenc %%xmm7, %%xmm4\n\t"
                "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8)  */
                "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9)  */
                "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
                "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
                "aesenc %%xmm7, %%xmm8\n\t"
                "aesenc %%xmm7, %%xmm9\n\t"
                "aesenc %%xmm7, %%xmm10\n\t"
                "aesenc %%xmm7, %%xmm11\n\t"

                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5)  */
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem). */

                ".align 16\n\t"
                ".Ldone_ctr%=:\n\t"
                :
                : [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [addb] "r" (bige_addb)
                : "%esi", "cc", "memory");

  /* Rounds 2..N-1 on all eight blocks; the first five input blocks are
     loaded early into xmm12-xmm15/xmm7 to overlap with the rounds.  */
  asm volatile ("movdqa 0x20(%[key]), %%xmm1\n\t"
                "movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
                "movdqu 4*16(%[src]), %%xmm7\n\t"  /* Get block 5.      */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                /* Flags tested far below: jb -> AES-128, je -> AES-192.  */
                "cmpl $12, %[rounds]\n\t"
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "jb .Lenclast%=\n\t"            /* rounds < 12: AES-128  */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "je .Lenclast%=\n\t"            /* rounds == 12: AES-192 */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xe0(%[key]), %%xmm1\n" /* last round key -> xmm1 */

                ".Lenclast%=:\n\t"
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds),
                  [src] "r" (a)
                : "cc", "memory");

  /* Final round: aesenclast computes
     SubBytes(ShiftRows(state)) ^ src, so feeding src = plaintext ^
     lastkey yields the ciphertext (keystream ^ plaintext) directly.  */
  asm volatile ("pxor %%xmm1, %%xmm12\n\t"        /* block1 ^= lastkey */
                "pxor %%xmm1, %%xmm13\n\t"        /* block2 ^= lastkey */
                "pxor %%xmm1, %%xmm14\n\t"        /* block3 ^= lastkey */
                "pxor %%xmm1, %%xmm15\n\t"        /* block4 ^= lastkey */
                "aesenclast %%xmm12, %%xmm0\n\t"
                "aesenclast %%xmm13, %%xmm2\n\t"
                "aesenclast %%xmm14, %%xmm3\n\t"
                "aesenclast %%xmm15, %%xmm4\n\t"
                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1.    */
                "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
                "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
                "pxor %%xmm1, %%xmm7\n\t"          /* block5 ^= lastkey */
                "pxor %%xmm1, %%xmm12\n\t"         /* block6 ^= lastkey */
                "pxor %%xmm1, %%xmm13\n\t"         /* block7 ^= lastkey */
                "pxor %%xmm1, %%xmm14\n\t"         /* block8 ^= lastkey */
                "aesenclast %%xmm7, %%xmm8\n\t"
                "aesenclast %%xmm12, %%xmm9\n\t"
                "aesenclast %%xmm13, %%xmm10\n\t"
                "aesenclast %%xmm14, %%xmm11\n\t"
                "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 5.    */
                "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 6.    */
                "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 7.    */
                "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 8.    */
                :
                : [src] "r" (a),
                  [dst] "r" (b)
                : "memory");
}
1698
1699 #endif /* __x86_64__ */
1700
1701
unsigned int ASM_FUNC_ATTR
_gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  /* Encrypt the single 16-byte block SRC into DST with AES-NI.
     The block travels through do_aesni_enc () in xmm0; the asm blocks
     must stay in this exact order.  Returns 0 -- presumably the "extra
     stack burn depth" value of the cipher backend interface; confirm
     against cipher.h.  */
  aesni_prepare ();
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_aesni_enc (ctx);
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  aesni_cleanup ();
  return 0;
}
1719
1720
void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  /* CFB encryption of NBLOCKS blocks: C[i] = P[i] ^ Enc(C[i-1]),
     with C[-1] = IV.  Inherently serial -- each ciphertext block is
     the next encryption input -- so no parallel fast path exists here.
     xmm0 carries the chaining value across loop iterations and through
     do_aesni_enc ().  IV is updated in place on exit.  */
  aesni_prepare ();

  asm volatile ("movdqu %[iv], %%xmm0\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      /* xmm0 := Enc(chaining value).  */
      do_aesni_enc (ctx);

      /* XOR with plaintext; the result is both the ciphertext and the
         next chaining value (left in xmm0).  */
      asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
}
1755
1756
void ASM_FUNC_ATTR
_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks, int cbc_mac)
{
  /* CBC encryption of NBLOCKS blocks: C[i] = Enc(P[i] ^ C[i-1]),
     with C[-1] = IV.  Serial by construction.  xmm5 carries the
     chaining value across iterations; xmm0 carries the block through
     do_aesni_enc ().  When CBC_MAC is set, OUTBUF is not advanced, so
     only the final block (the MAC) remains there.  IV is updated in
     place on exit.  */
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      /* xmm0 := plaintext ^ chaining value.  */
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    : /* No output */
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      /* Ciphertext becomes the next chaining value.  */
      asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      if (!cbc_mac)
        outbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
1801
1802
void ASM_FUNC_ATTR
_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  /* CTR mode bulk encryption: process 8 blocks at a time (x86-64
     only), then 4, then single blocks.  Sets up the register contract
     the do_aesni_ctr* helpers rely on: xmm5 = current counter,
     xmm6 = byte-reversal mask.  NOTE: the "movdqa %[ctr]" load
     requires CTR to be 16-byte aligned.  CTR is updated in place.  */
  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
                "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
                : /* No output */
                : [mask] "m" (*be_mask),
                  [ctr] "m" (*ctr)
                : "memory");

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
          outbuf += 8*BLOCKSIZE;
          inbuf  += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }
  for ( ;nblocks; nblocks-- )
    {
      do_aesni_ctr (ctx, ctr, outbuf, inbuf);
      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }
  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
1855
1856
unsigned int ASM_FUNC_ATTR
_gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  /* Decrypt the single 16-byte block SRC into DST with AES-NI.
     Mirrors _gcry_aes_aesni_encrypt: the block travels through
     do_aesni_dec () in xmm0, so the asm blocks must stay in this exact
     order.  Returns 0 (same convention as the encrypt entry point).  */
  aesni_prepare ();
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_aesni_dec (ctx);
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  aesni_cleanup ();
  return 0;
}
1874
1875
/* CFB-mode decryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.
 * IV holds the 16-byte feedback block on entry and is updated to the
 * last ciphertext block on return.  CFB decryption uses only the AES
 * *encryption* primitive (the keystream block is E(K, C_{i-1})), so no
 * decryption key schedule is needed, and since each keystream block
 * depends only on already-known ciphertext, the blocks are independent
 * and can be processed in parallel.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  /* Keep the CFB feedback block (previous ciphertext) in xmm6.  */
  asm volatile ("movdqu %[iv], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  /* CFB decryption can be parallelized */

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      /* 8-way parallel loop; needs xmm8-xmm15 and hence x86-64.  */
      for ( ;nblocks >= 8; nblocks -= 8)
        {
          /* Load the eight keystream inputs (the IV plus ciphertext
             blocks 0..6), advance the feedback block to ciphertext
             block 7, keep copies of blocks 0..3 in xmm12-xmm15 for the
             combined last-round/xor step below, and apply the
             whitening key (round key 0) to all eight states.  */
          asm volatile
            ("movdqa (%[key]), %%xmm0\n\t"

             "movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
             "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
             "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
             "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
             "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
             "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
             "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
             "movdqu 6*16(%[inbuf]), %%xmm11\n\t"

             "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */

             "movdqa %%xmm2, %%xmm12\n\t"
             "movdqa %%xmm3, %%xmm13\n\t"
             "movdqa %%xmm4, %%xmm14\n\t"
             "movdqa %%xmm8, %%xmm15\n\t"

             "pxor %%xmm0, %%xmm1\n\t"   /* xmm1 ^= key[0] */
             "pxor %%xmm0, %%xmm2\n\t"   /* xmm2 ^= key[0] */
             "pxor %%xmm0, %%xmm3\n\t"   /* xmm3 ^= key[0] */
             "pxor %%xmm0, %%xmm4\n\t"   /* xmm4 ^= key[0] */
             "pxor %%xmm0, %%xmm8\n\t"   /* xmm8 ^= key[0] */
             "pxor %%xmm0, %%xmm9\n\t"   /* xmm9 ^= key[0] */
             "pxor %%xmm0, %%xmm10\n\t"  /* xmm10 ^= key[0] */
             "pxor %%xmm0, %%xmm11\n\t"  /* xmm11 ^= key[0] */
             : /* No output */
             : [inbuf] "r" (inbuf),
               [key] "r" (ctx->keyschenc)
             : "memory");

          /* Run all rounds but the last on the eight states; leaves the
             last round key in xmm0 (see do_aesni_enc_vec8).  */
          do_aesni_enc_vec8 (ctx);

          asm volatile
            (
             /* Each aesenclast operand is C_i ^ key[last], so the
                instruction completes the final AES round and xors the
                ciphertext into the keystream in one step, yielding the
                plaintext directly.  */
             "pxor %%xmm0, %%xmm12\n\t"
             "pxor %%xmm0, %%xmm13\n\t"
             "pxor %%xmm0, %%xmm14\n\t"
             "pxor %%xmm0, %%xmm15\n\t"
             "aesenclast %%xmm12, %%xmm1\n\t"
             "aesenclast %%xmm13, %%xmm2\n\t"
             "aesenclast %%xmm14, %%xmm3\n\t"
             "aesenclast %%xmm15, %%xmm4\n\t"

             /* Reload ciphertext blocks 4..7 for the remaining four
                states (their copies could not be kept in registers).  */
             "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
             "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
             "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
             "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
             "pxor %%xmm0, %%xmm12\n\t"
             "pxor %%xmm0, %%xmm13\n\t"
             "pxor %%xmm0, %%xmm14\n\t"
             "pxor %%xmm0, %%xmm15\n\t"

             "aesenclast %%xmm12, %%xmm8\n\t"
             "aesenclast %%xmm13, %%xmm9\n\t"
             "aesenclast %%xmm14, %%xmm10\n\t"
             "aesenclast %%xmm15, %%xmm11\n\t"

             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"

             "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
             "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
             "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
             "movdqu %%xmm11, 7*16(%[outbuf])\n\t"

             : /* No output */
             : [inbuf] "r" (inbuf),
               [outbuf] "r" (outbuf)
             : "memory");

          outbuf += 8*BLOCKSIZE;
          inbuf  += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  /* 4-way parallel loop using only xmm0-xmm7 (works on i386 too).  */
  for ( ;nblocks >= 4; nblocks -= 4)
    {
      asm volatile
        ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
         "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
         "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
         "movdqu 2*16(%[inbuf]), %%xmm4\n\t"

         "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */
         : /* No output */
         : [inbuf] "r" (inbuf)
         : "memory");

      /* Encrypt the four keystream inputs in xmm1-xmm4.  */
      do_aesni_enc_vec4 (ctx);

      /* Plaintext_i = keystream_i ^ C_i; reload each ciphertext block
         via xmm5 as scratch.  */
      asm volatile
        ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm1\n\t"
         "movdqu %%xmm1, 0*16(%[outbuf])\n\t"

         "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm2\n\t"
         "movdqu %%xmm2, 1*16(%[outbuf])\n\t"

         "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm3\n\t"
         "movdqu %%xmm3, 2*16(%[outbuf])\n\t"

         "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm4\n\t"
         "movdqu %%xmm4, 3*16(%[outbuf])\n\t"

         : /* No output */
         : [inbuf] "r" (inbuf),
           [outbuf] "r" (outbuf)
         : "memory");

      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }

  /* Move feedback block to xmm0, which do_aesni_enc uses as its
     input/output state register.  */
  asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc");

  for ( ;nblocks; nblocks-- )
    {
      /* Encrypt the feedback block in xmm0.  */
      do_aesni_enc (ctx);

      /* Save keystream to xmm6, load next ciphertext into xmm0 (the
         new feedback block), xor to produce the plaintext.  */
      asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "movdqu %%xmm6, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  /* Store final feedback block back to IV for the next call.  */
  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
2051
2052
/* CBC-mode decryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.
 * IV holds the previous ciphertext block on entry and is updated to the
 * last ciphertext block on return.  P_i = D(K, C_i) ^ C_{i-1}: each
 * block's decryption is independent, so 4-way/8-way parallel paths are
 * used.  Lazily builds the decryption key schedule on first use.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  /* CBC decryption needs the inverse key schedule; derive it from the
     encryption schedule once per context.  */
  if ( !ctx->decryption_prepared )
    {
      do_aesni_prepare_decryption ( ctx );
      ctx->decryption_prepared = 1;
    }

  asm volatile
    ("movdqu %[iv], %%xmm5\n\t"	/* use xmm5 as fast IV storage */
     : /* No output */
     : [iv] "m" (*iv)
     : "memory");

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      /* 8-way parallel loop; needs xmm8-xmm15 and hence x86-64.  */
      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          /* Load ciphertext blocks 0..7 into xmm1-4/xmm8-11, keep
             copies of blocks 0..3 in xmm12-15 (they become the
             chaining values for blocks 1..4), and apply the inverse
             whitening key to all eight states.  */
          asm volatile
            ("movdqa (%[key]), %%xmm0\n\t"

             "movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
             "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
             "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
             "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
             "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
             "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
             "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
             "movdqu 7*16(%[inbuf]), %%xmm11\n\t"

             "movdqa %%xmm1, %%xmm12\n\t"
             "movdqa %%xmm2, %%xmm13\n\t"
             "movdqa %%xmm3, %%xmm14\n\t"
             "movdqa %%xmm4, %%xmm15\n\t"

             "pxor %%xmm0, %%xmm1\n\t"   /* xmm1 ^= key[0] */
             "pxor %%xmm0, %%xmm2\n\t"   /* xmm2 ^= key[0] */
             "pxor %%xmm0, %%xmm3\n\t"   /* xmm3 ^= key[0] */
             "pxor %%xmm0, %%xmm4\n\t"   /* xmm4 ^= key[0] */
             "pxor %%xmm0, %%xmm8\n\t"   /* xmm8 ^= key[0] */
             "pxor %%xmm0, %%xmm9\n\t"   /* xmm9 ^= key[0] */
             "pxor %%xmm0, %%xmm10\n\t"  /* xmm10 ^= key[0] */
             "pxor %%xmm0, %%xmm11\n\t"  /* xmm11 ^= key[0] */

             : /* No output */
             : [inbuf] "r" (inbuf),
               [key] "r" (ctx->keyschdec)
             : "memory");

          /* Run all rounds but the last on the eight states; leaves the
             last round key in xmm0 (see do_aesni_dec_vec8).  */
          do_aesni_dec_vec8 (ctx);

          asm volatile
            (
             /* Each aesdeclast operand is C_{i-1} ^ key[last]; the
                instruction thus merges the final AddRoundKey with the
                CBC chaining xor.  */
             "pxor %%xmm0, %%xmm5\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm15\n\t"			/* xor IV with key */

             "aesdeclast %%xmm5, %%xmm1\n\t"
             "aesdeclast %%xmm12, %%xmm2\n\t"
             "aesdeclast %%xmm13, %%xmm3\n\t"
             "aesdeclast %%xmm14, %%xmm4\n\t"

             /* Reload chaining blocks 4..6 and stash block 7 in xmm5:
                it is the IV for the next iteration and must stay
                un-xored with the key.  */
             "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
             "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
             "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
             "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
             "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
             "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */

             "aesdeclast %%xmm15, %%xmm8\n\t"
             "aesdeclast %%xmm12, %%xmm9\n\t"
             "aesdeclast %%xmm13, %%xmm10\n\t"
             "aesdeclast %%xmm14, %%xmm11\n\t"

             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
             "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
             "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
             "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
             "movdqu %%xmm11, 7*16(%[outbuf])\n\t"

             : /* No output */
             : [inbuf] "r" (inbuf),
               [outbuf] "r" (outbuf)
             : "memory");

          outbuf += 8*BLOCKSIZE;
          inbuf  += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  /* 4-way parallel loop using only xmm0-xmm7.  */
  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      asm volatile
        ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
         "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
         "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
         "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
         : /* No output */
         : [inbuf] "r" (inbuf)
         : "memory");

      /* Decrypt the four states in xmm1-xmm4.  */
      do_aesni_dec_vec4 (ctx);

      /* Chain: xor each result with the preceding ciphertext block,
         reloading that block into xmm5 (which thereby ends up holding
         ciphertext block 3 — the next IV).  */
      asm volatile
        ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
         "movdqu 0*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
         "movdqu %%xmm1, 0*16(%[outbuf])\n\t"

         "pxor %%xmm5, %%xmm2\n\t"		/* xor IV with output */
         "movdqu 1*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
         "movdqu %%xmm2, 1*16(%[outbuf])\n\t"

         "pxor %%xmm5, %%xmm3\n\t"		/* xor IV with output */
         "movdqu 2*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
         "movdqu %%xmm3, 2*16(%[outbuf])\n\t"

         "pxor %%xmm5, %%xmm4\n\t"		/* xor IV with output */
         "movdqu 3*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
         "movdqu %%xmm4, 3*16(%[outbuf])\n\t"

         : /* No output */
         : [inbuf] "r" (inbuf),
           [outbuf] "r" (outbuf)
         : "memory");

      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }

  for ( ;nblocks; nblocks-- )
    {
      /* Keep an unmodified copy of the ciphertext block in xmm2; it
         becomes the IV for the next block.  */
      asm volatile
        ("movdqu %[inbuf], %%xmm0\n\t"
         "movdqa %%xmm0, %%xmm2\n\t"    /* use xmm2 as savebuf */
         : /* No output */
         : [inbuf] "m" (*inbuf)
         : "memory");

      /* uses only xmm0 and xmm1 */
      do_aesni_dec (ctx);

      asm volatile
        ("pxor %%xmm5, %%xmm0\n\t"	/* xor IV with output */
         "movdqu %%xmm0, %[outbuf]\n\t"
         "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
         : [outbuf] "=m" (*outbuf)
         :
         : "memory");

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  asm volatile
    ("movdqu %%xmm5, %[iv]\n\t"	/* store IV */
     : /* No output */
     : [iv] "m" (*iv)
     : "memory");

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
2237
2238
/* Update the OCB plaintext checksum in c->u_ctr.ctr by xoring in
 * NBLOCKS 16-byte blocks of PLAINTEXT.  Uses four independent xmm
 * accumulators (xmm6, xmm1-xmm3; widened to ymm on AVX/AVX2-capable
 * CPUs) to break the xor dependency chain, then folds them into a
 * single 128-bit value at the end.
 *
 * NOTE(review): callers are expected to have already entered the
 * aesni_prepare* region — this helper saves/restores no xmm state
 * itself and clobbers xmm0-xmm7.  */
static ASM_FUNC_ATTR_INLINE void
aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
                    size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;

  /* Calculate checksum */
  asm volatile ("movdqu %[checksum], %%xmm6\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                :
                :[checksum] "m" (*c->u_ctr.ctr)
                : "memory" );

  /* Dispatch on CPU features; only one of the wide paths runs, and all
     fall through to the SSE tail loops below for remaining blocks.  */
  if (0) {}
#if defined(HAVE_GCC_INLINE_ASM_AVX2)
  else if (nblocks >= 16 && ctx->use_avx2)
    {
      /* Use wider 256-bit registers for fast xoring of plaintext. */
      asm volatile ("vzeroupper\n\t"
                    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
                    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
                    "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
                    "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
                    :
                    :
                    : "memory");

      /* Eight ymm accumulators, two blocks each: 16 blocks/iteration. */
      for (;nblocks >= 16; nblocks -= 16)
        {
          asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
                        "vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
                        "vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
                        "vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
                        :
                        : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
                          [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
                          [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
                          [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
                        : "memory" );
          asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
                        "vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
                        "vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
                        "vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
                        :
                        : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
                          [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
                          [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
                          [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
                        : "memory" );
          plaintext += BLOCKSIZE * 16;
        }

      /* Fold eight ymm accumulators down to four xmm accumulators
         (high lane xored into low lane), then leave AVX state.  */
      asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
                    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
                    "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
                    "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
                    "vextracti128 $1, %%ymm6, %%xmm0\n\t"
                    "vextracti128 $1, %%ymm1, %%xmm4\n\t"
                    "vextracti128 $1, %%ymm2, %%xmm5\n\t"
                    "vextracti128 $1, %%ymm3, %%xmm7\n\t"
                    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
                    "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
                    "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
                    "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
                    "vzeroupper\n\t"
                    :
                    :
                    : "memory" );
    }
#endif
#if defined(HAVE_GCC_INLINE_ASM_AVX)
  else if (nblocks >= 16 && ctx->use_avx)
    {
      /* Same as AVX2, except using 256-bit floating point instructions. */
      asm volatile ("vzeroupper\n\t"
                    "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
                    "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
                    "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
                    "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
                    :
                    :
                    : "memory");

      for (;nblocks >= 16; nblocks -= 16)
        {
          asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
                        "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
                        "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
                        "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
                        :
                        : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
                          [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
                          [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
                          [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
                        : "memory" );
          asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
                        "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
                        "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
                        "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
                        :
                        : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
                          [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
                          [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
                          [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
                        : "memory" );
          plaintext += BLOCKSIZE * 16;
        }

      asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
                    "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
                    "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
                    "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
                    "vextractf128 $1, %%ymm6, %%xmm0\n\t"
                    "vextractf128 $1, %%ymm1, %%xmm4\n\t"
                    "vextractf128 $1, %%ymm2, %%xmm5\n\t"
                    "vextractf128 $1, %%ymm3, %%xmm7\n\t"
                    "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
                    "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
                    "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
                    "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
                    "vzeroupper\n\t"
                    :
                    :
                    : "memory" );
    }
#endif

  /* SSE tail: four accumulators, four blocks per iteration.  */
  for (;nblocks >= 4; nblocks -= 4)
    {
      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
                    "movdqu %[ptr1], %%xmm4\n\t"
                    "movdqu %[ptr2], %%xmm5\n\t"
                    "movdqu %[ptr3], %%xmm7\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "pxor %%xmm4, %%xmm1\n\t"
                    "pxor %%xmm5, %%xmm2\n\t"
                    "pxor %%xmm7, %%xmm3\n\t"
                    :
                    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
                      [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
                      [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
                      [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
                    : "memory" );
      plaintext += BLOCKSIZE * 4;
    }

  /* Remaining single blocks.  */
  for (;nblocks >= 1; nblocks -= 1)
    {
      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    :
                    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
                    : "memory" );
      plaintext += BLOCKSIZE;
    }

  /* Fold the four accumulators into xmm6 and store the checksum.  */
  asm volatile ("pxor %%xmm1, %%xmm6\n\t"
                "pxor %%xmm2, %%xmm6\n\t"
                "pxor %%xmm3, %%xmm6\n\t"
                "movdqu %%xmm6, %[checksum]\n\t"
                : [checksum] "=m" (*c->u_ctr.ctr)
                :
                : "memory" );
}
2405
2406
/* Bulk OCB encryption of NBLOCKS blocks from INBUF_ARG to OUTBUF_ARG.
 * c->u_iv.iv carries the running Offset and c->u_ctr.ctr the plaintext
 * Checksum; both are read at entry and written back at exit, and
 * c->u_mode.ocb.data_nblocks is advanced by NBLOCKS.  Returns 0 (no
 * extra stack burning needed; sensitive temporaries are wiped here).
 *
 * Register conventions inside this function: xmm5 = Offset_i,
 * xmm7 = Checksum, xmm6 = L[0] (in the 8-way path).  */
static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
               const void *inbuf_arg, size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  u64 n = c->u_mode.ocb.data_nblocks;
  const unsigned char *l;
  /* Scratch for three offset values; over-allocated by 15 bytes so it
     can be aligned to 16 for movdqa.  */
  byte tmpbuf_store[3 * 16 + 15];
  byte *tmpbuf;
  aesni_prepare_2_7_variable;

  /* Empty asm hides the pointer's origin from the optimizer, then
     align it up to a 16-byte boundary.  */
  asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
  tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Preload Offset */
  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                "movdqu %[ctr], %%xmm7\n\t"
                : /* No output */
                : [iv] "m" (*c->u_iv.iv),
                  [ctr] "m" (*c->u_ctr.ctr)
                : "memory" );

  /* Process single blocks until the block counter N is a multiple of
     four, so the parallel paths below can use fixed L values.  */
  for ( ;nblocks && n % 4; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
                    "pxor   %%xmm0,   %%xmm7\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      /* 16-byte-aligned scratch for (last round key ^ first round
         key); see below for how it is used.  */
      unsigned char last_xor_first_key_store[16 + 15];
      unsigned char *lxf_key;
      aesni_prepare_8_15_variable;

      asm volatile (""
                    : "=r" (lxf_key)
                    : "0" (last_xor_first_key_store)
                    : "memory");
      lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);

      aesni_prepare_8_15();

      /* Trick: keep Offset pre-xored with key[0] in xmm5, and stash
         key[last]^key[0] in lxf_key.  Then (Offset_i ^ key[0]) xored
         into the plaintext performs OCB pre-whitening and AES round-0
         whitening at once, and the same value xored with lxf_key is
         Offset_i ^ key[last] — the perfect aesenclast operand, merging
         the final AddRoundKey with OCB post-whitening.  */
      asm volatile ("movdqu %[l0], %%xmm6\n\t"
                    "movdqa %[last_key], %%xmm0\n\t"
                    "pxor %[first_key], %%xmm5\n\t"
                    "pxor %[first_key], %%xmm0\n\t"
                    "movdqa %%xmm0, %[lxfkey]\n\t"
                    : [lxfkey] "=m" (*lxf_key)
                    : [l0] "m" (*c->u_mode.ocb.L[0]),
                      [last_key] "m" (ctx->keyschenc[ctx->rounds][0][0]),
                      [first_key] "m" (ctx->keyschenc[0][0][0])
                    : "memory" );

      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          n += 4;
          l = aes_ocb_get_l(c, n);

          /* L-value usage per block i in this batch of 8:
             L[0], L[0]^L[1], L[1], L_{ntz(n)}, then the same pattern
             again ending with L_{ntz(n+4)} ("l7" below).  */
          asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
                        "movdqu %[l1],   %%xmm11\n\t"
                        "movdqu %[l3],   %%xmm15\n\t"
                        :
                        : [l0l1] "m" (*c->u_mode.ocb.L0L1),
                          [l1] "m" (*c->u_mode.ocb.L[1]),
                          [l3] "m" (*l)
                        : "memory" );

          n += 4;
          l = aes_ocb_get_l(c, n);

          /* Checksum_i = Checksum_{i-1} xor P_i  */
          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
          /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i)  */
          asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
                        "movdqu %[inbuf1], %%xmm2\n\t"
                        "movdqu %[inbuf2], %%xmm3\n\t"
                        :
                        : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
                          [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
                          [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                        : "memory" );
          asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
                        "movdqu %[inbuf4], %%xmm8\n\t"
                        "movdqu %[inbuf5], %%xmm9\n\t"
                        :
                        : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
                          [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
                          [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
                        : "memory" );
          /* For blocks 0-3: compute Offset_i^key[0] into xmm12-15,
             accumulate checksum into xmm7, whiten the plaintexts, and
             convert xmm12-15 into the aesenclast operands by xoring
             lxf_key.  Blocks 4-6 store their operands in tmpbuf since
             registers run out; block 7 is handled below.  */
          asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
                        "movdqa %%xmm6, %%xmm12\n\t"
                        "pxor   %%xmm5, %%xmm12\n\t"
                        "pxor   %%xmm1, %%xmm7\n\t"
                        "pxor   %%xmm12, %%xmm1\n\t"
                        "pxor   %%xmm0, %%xmm12\n\t"

                        "movdqa %%xmm10, %%xmm13\n\t"
                        "pxor   %%xmm5, %%xmm13\n\t"
                        "pxor   %%xmm2, %%xmm7\n\t"
                        "pxor   %%xmm13, %%xmm2\n\t"
                        "pxor   %%xmm0, %%xmm13\n\t"

                        "movdqa %%xmm11, %%xmm14\n\t"
                        "pxor   %%xmm5, %%xmm14\n\t"
                        "pxor   %%xmm3, %%xmm7\n\t"
                        "pxor   %%xmm14, %%xmm3\n\t"
                        "pxor   %%xmm0, %%xmm14\n\t"

                        "pxor   %%xmm11, %%xmm5\n\t"
                        "pxor   %%xmm15, %%xmm5\n\t"
                        "pxor   %%xmm4, %%xmm7\n\t"
                        "pxor   %%xmm5, %%xmm4\n\t"
                        "movdqa %%xmm5, %%xmm15\n\t"
                        "pxor   %%xmm0, %%xmm15\n\t"

                        "movdqa %%xmm5, %%xmm0\n\t"
                        "pxor   %%xmm6, %%xmm0\n\t"
                        "pxor   %%xmm8, %%xmm7\n\t"
                        "pxor   %%xmm0, %%xmm8\n\t"
                        "pxor   %[lxfkey], %%xmm0\n\t"
                        "movdqa %%xmm0, %[tmpbuf0]\n\t"

                        "movdqa %%xmm10, %%xmm0\n\t"
                        "pxor   %%xmm5, %%xmm0\n\t"
                        "pxor   %%xmm9, %%xmm7\n\t"
                        "pxor   %%xmm0, %%xmm9\n\t"
                        "pxor   %[lxfkey], %%xmm0\n"
                        "movdqa %%xmm0, %[tmpbuf1]\n\t"
                        : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
                          [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
                        : [lxfkey] "m" (*lxf_key)
                        : "memory" );
          asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
                        "movdqa %%xmm11, %%xmm0\n\t"
                        "pxor   %%xmm5, %%xmm0\n\t"
                        "pxor   %%xmm10, %%xmm7\n\t"
                        "pxor   %%xmm0, %%xmm10\n\t"
                        "pxor   %[lxfkey], %%xmm0\n\t"
                        "movdqa %%xmm0, %[tmpbuf2]\n\t"
                        : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
                        : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
                          [lxfkey] "m" (*lxf_key)
                        : "memory" );
          /* Block 7: advance the running Offset and load the first
             real round key (0x10 offset) into xmm0 for the round
             sequence below.  */
          asm volatile ("movdqu %[l7], %%xmm0\n\t"
                        "pxor   %%xmm11, %%xmm5\n\t"
                        "pxor   %%xmm0, %%xmm5\n\t"
                        "movdqa 0x10(%[key]), %%xmm0\n\t"
                        "movdqu %[inbuf7], %%xmm11\n\t"
                        "pxor   %%xmm11, %%xmm7\n\t"
                        "pxor   %%xmm5, %%xmm11\n\t"
                        :
                        : [l7] "m" (*l),
                          [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
                          [key] "r" (ctx->keyschenc)
                        : "memory" );

          /* Rounds 1..9 always run; the cmpl/jb skips rounds 10-11 for
             AES-128 (rounds < 12) and the je skips rounds 12-13 for
             AES-192 (rounds == 12).  */
          asm volatile ("cmpl $12, %[rounds]\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x20(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x30(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x40(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x50(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x60(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x70(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x80(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x90(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "jb .Ldeclast%=\n\t"
                        "movdqa 0xa0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xb0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "je .Ldeclast%=\n\t"
                        "movdqa 0xc0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xd0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"

                        /* NOTE(review): label name kept from the
                           decryption variant; this is the encryption
                           path.  */
                        ".Ldeclast%=:\n\t"
                        :
                        : [key] "r" (ctx->keyschenc),
                          [rounds] "r" (ctx->rounds)
                        : "cc", "memory");

          /* Final round: each operand is Offset_i ^ key[last], so the
             output is already the OCB ciphertext.  */
          asm volatile ("aesenclast %%xmm12,   %%xmm1\n\t"
                        "aesenclast %%xmm13,   %%xmm2\n\t"
                        "aesenclast %%xmm14,   %%xmm3\n\t"
                        "aesenclast %%xmm15,   %%xmm4\n\t"
                        "aesenclast %[tmpbuf0],%%xmm8\n\t"
                        "aesenclast %[tmpbuf1],%%xmm9\n\t"
                        "aesenclast %[tmpbuf2],%%xmm10\n\t"
                        :
                        : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
                          [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
                          [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)),
                          [lxfkey] "m" (*lxf_key)
                        : "memory" );
          /* Block 7 uses xmm5 (= Offset ^ key[0]) as the operand, so
             an extra xor with lxf_key corrects it to Offset^key[last]
             after the fact.  */
          asm volatile ("aesenclast %%xmm5, %%xmm11\n\t"
                        "pxor   %[lxfkey], %%xmm11\n\t"
                        "movdqu %%xmm1, %[outbuf0]\n\t"
                        "movdqu %%xmm2, %[outbuf1]\n\t"
                        : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
                          [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
                        : [lxfkey] "m" (*lxf_key)
                        : "memory" );
          asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t"
                        "movdqu %%xmm4, %[outbuf3]\n\t"
                        "movdqu %%xmm8, %[outbuf4]\n\t"
                        : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
                          [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
                          [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
                        :
                        : "memory" );
          asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t"
                        "movdqu %%xmm10, %[outbuf6]\n\t"
                        "movdqu %%xmm11, %[outbuf7]\n\t"
                        : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
                          [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
                          [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
                        :
                        : "memory" );

          outbuf += 8*BLOCKSIZE;
          inbuf  += 8*BLOCKSIZE;
        }

      /* Undo the key[0] pre-xor on the Offset and wipe the key-derived
         scratch value.  */
      asm volatile ("pxor %[first_key], %%xmm5\n\t"
                    "pxor %%xmm0, %%xmm0\n\t"
                    "movdqu %%xmm0, %[lxfkey]\n\t"
                    : [lxfkey] "=m" (*lxf_key)
                    : [first_key] "m" (ctx->keyschenc[0][0][0])
                    : "memory" );

      aesni_cleanup_8_15();
    }
#endif

  /* 4-way path using only xmm0-xmm7; offsets for blocks 0-2 are kept
     in tmpbuf, block 3's stays in xmm5 as the running Offset.  */
  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      n += 4;
      l = aes_ocb_get_l(c, n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
                    "movdqu %[inbuf0], %%xmm1\n\t"
                    "movdqu %[l0l1],   %%xmm3\n\t"
                    :
                    : [l0] "m" (*c->u_mode.ocb.L[0]),
                      [l0l1] "m" (*c->u_mode.ocb.L0L1),
                      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("movdqu %[l1],     %%xmm4\n\t"
                    "movdqu %[l3],     %%xmm6\n\t"
                    "pxor   %%xmm5,    %%xmm0\n\t"
                    "pxor   %%xmm1,    %%xmm7\n\t"
                    "pxor   %%xmm0,    %%xmm1\n\t"
                    "movdqa %%xmm0,    %[tmpbuf0]\n\t"
                    : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
                    : [l1] "m" (*c->u_mode.ocb.L[1]),
                      [l3] "m" (*l)
                    : "memory" );
      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
                    "pxor   %%xmm5,    %%xmm3\n\t"
                    "pxor   %%xmm2,    %%xmm7\n\t"
                    "pxor   %%xmm3,    %%xmm2\n\t"
                    "movdqa %%xmm3,    %[tmpbuf1]\n\t"
                    : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
                    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
                    "movdqu %[inbuf2], %%xmm3\n\t"
                    "pxor   %%xmm5,    %%xmm0\n\t"
                    "pxor   %%xmm3,    %%xmm7\n\t"
                    "pxor   %%xmm0,    %%xmm3\n\t"
                    "movdqa %%xmm0,    %[tmpbuf2]\n\t"
                    : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
                    :
                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
                    "pxor   %%xmm4,    %%xmm5\n\t"
                    "movdqu %[inbuf3], %%xmm4\n\t"
                    "pxor   %%xmm4,    %%xmm7\n\t"
                    "pxor   %%xmm5,    %%xmm4\n\t"
                    :
                    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
                    : "memory" );

      do_aesni_enc_vec4 (ctx);

      /* Post-whiten each ciphertext with its Offset and store.  */
      asm volatile ("pxor   %[tmpbuf0],%%xmm1\n\t"
                    "movdqu %%xmm1,    %[outbuf0]\n\t"
                    "pxor   %[tmpbuf1],%%xmm2\n\t"
                    "movdqu %%xmm2,    %[outbuf1]\n\t"
                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
                      [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
                    : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
                      [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("pxor   %[tmpbuf2],%%xmm3\n\t"
                    "movdqu %%xmm3,    %[outbuf2]\n\t"
                    "pxor   %%xmm5,    %%xmm4\n\t"
                    "movdqu %%xmm4,    %[outbuf3]\n\t"
                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
                      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
                    : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
                    : "memory" );

      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }

  for ( ;nblocks; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
                    "pxor   %%xmm0,   %%xmm7\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

  /* Write back the advanced block counter, Offset and Checksum.  */
  c->u_mode.ocb.data_nblocks = n;
  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                "movdqu %%xmm7, %[ctr]\n\t"
                : [iv] "=m" (*c->u_iv.iv),
                  [ctr] "=m" (*c->u_ctr.ctr)
                :
                : "memory" );

  /* Wipe the Offset values left in the stack scratch buffer.  */
  asm volatile ("pxor   %%xmm0, %%xmm0\n\t"
                "movdqa %%xmm0, %[tmpbuf0]\n\t"
                "movdqa %%xmm0, %[tmpbuf1]\n\t"
                "movdqa %%xmm0, %[tmpbuf2]\n\t"
                : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
                  [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
                  [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();

  return 0;
}
2900
2901
2902 static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_dec(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks_arg)2903 aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
2904 const void *inbuf_arg, size_t nblocks_arg)
2905 {
2906 RIJNDAEL_context *ctx = (void *)&c->context.c;
2907 unsigned char *outbuf = outbuf_arg;
2908 const unsigned char *inbuf = inbuf_arg;
2909 u64 n = c->u_mode.ocb.data_nblocks;
2910 const unsigned char *l;
2911 size_t nblocks = nblocks_arg;
2912 byte tmpbuf_store[3 * 16 + 15];
2913 byte *tmpbuf;
2914 aesni_prepare_2_7_variable;
2915
2916 asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
2917 tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);
2918
2919 aesni_prepare ();
2920 aesni_prepare_2_7 ();
2921
2922 if ( !ctx->decryption_prepared )
2923 {
2924 do_aesni_prepare_decryption ( ctx );
2925 ctx->decryption_prepared = 1;
2926 }
2927
2928 /* Preload Offset */
2929 asm volatile ("movdqu %[iv], %%xmm5\n\t"
2930 : /* No output */
2931 : [iv] "m" (*c->u_iv.iv)
2932 : "memory" );
2933
2934 for ( ;nblocks && n % 4; nblocks-- )
2935 {
2936 l = aes_ocb_get_l(c, ++n);
2937
2938 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
2939 /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
2940 asm volatile ("movdqu %[l], %%xmm1\n\t"
2941 "movdqu %[inbuf], %%xmm0\n\t"
2942 "pxor %%xmm1, %%xmm5\n\t"
2943 "pxor %%xmm5, %%xmm0\n\t"
2944 :
2945 : [l] "m" (*l),
2946 [inbuf] "m" (*inbuf)
2947 : "memory" );
2948
2949 do_aesni_dec (ctx);
2950
2951 asm volatile ("pxor %%xmm5, %%xmm0\n\t"
2952 "movdqu %%xmm0, %[outbuf]\n\t"
2953 : [outbuf] "=m" (*outbuf)
2954 :
2955 : "memory" );
2956
2957 inbuf += BLOCKSIZE;
2958 outbuf += BLOCKSIZE;
2959 }
2960
2961 #ifdef __x86_64__
2962 if (nblocks >= 8)
2963 {
2964 unsigned char last_xor_first_key_store[16 + 15];
2965 unsigned char *lxf_key;
2966 aesni_prepare_8_15_variable;
2967
2968 asm volatile (""
2969 : "=r" (lxf_key)
2970 : "0" (last_xor_first_key_store)
2971 : "memory");
2972 lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);
2973
2974 aesni_prepare_8_15();
2975
2976 asm volatile ("movdqu %[l0], %%xmm6\n\t"
2977 "movdqa %[last_key], %%xmm0\n\t"
2978 "pxor %[first_key], %%xmm5\n\t"
2979 "pxor %[first_key], %%xmm0\n\t"
2980 "movdqa %%xmm0, %[lxfkey]\n\t"
2981 : [lxfkey] "=m" (*lxf_key)
2982 : [l0] "m" (*c->u_mode.ocb.L[0]),
2983 [last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
2984 [first_key] "m" (ctx->keyschdec[0][0][0])
2985 : "memory" );
2986
2987 for ( ;nblocks >= 8 ; nblocks -= 8 )
2988 {
2989 n += 4;
2990 l = aes_ocb_get_l(c, n);
2991
2992 asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
2993 "movdqu %[l1], %%xmm11\n\t"
2994 "movdqu %[l3], %%xmm15\n\t"
2995 :
2996 : [l0l1] "m" (*c->u_mode.ocb.L0L1),
2997 [l1] "m" (*c->u_mode.ocb.L[1]),
2998 [l3] "m" (*l)
2999 : "memory" );
3000
3001 n += 4;
3002 l = aes_ocb_get_l(c, n);
3003
3004 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3005 /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
3006 asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
3007 "movdqu %[inbuf1], %%xmm2\n\t"
3008 "movdqu %[inbuf2], %%xmm3\n\t"
3009 :
3010 : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
3011 [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
3012 [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
3013 : "memory" );
3014 asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
3015 "movdqu %[inbuf4], %%xmm8\n\t"
3016 "movdqu %[inbuf5], %%xmm9\n\t"
3017 :
3018 : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
3019 [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
3020 [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
3021 : "memory" );
3022 asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
3023 "movdqa %%xmm6, %%xmm12\n\t"
3024 "pxor %%xmm5, %%xmm12\n\t"
3025 "pxor %%xmm12, %%xmm1\n\t"
3026 "pxor %%xmm0, %%xmm12\n\t"
3027
3028 "movdqa %%xmm10, %%xmm13\n\t"
3029 "pxor %%xmm5, %%xmm13\n\t"
3030 "pxor %%xmm13, %%xmm2\n\t"
3031 "pxor %%xmm0, %%xmm13\n\t"
3032
3033 "movdqa %%xmm11, %%xmm14\n\t"
3034 "pxor %%xmm5, %%xmm14\n\t"
3035 "pxor %%xmm14, %%xmm3\n\t"
3036 "pxor %%xmm0, %%xmm14\n\t"
3037
3038 "pxor %%xmm11, %%xmm5\n\t"
3039 "pxor %%xmm15, %%xmm5\n\t"
3040 "pxor %%xmm5, %%xmm4\n\t"
3041 "movdqa %%xmm5, %%xmm15\n\t"
3042 "pxor %%xmm0, %%xmm15\n\t"
3043
3044 "movdqa %%xmm5, %%xmm0\n\t"
3045 "pxor %%xmm6, %%xmm0\n\t"
3046 "pxor %%xmm0, %%xmm8\n\t"
3047 "pxor %[lxfkey], %%xmm0\n\t"
3048 "movdqa %%xmm0, %[tmpbuf0]\n\t"
3049
3050 "movdqa %%xmm10, %%xmm0\n\t"
3051 "pxor %%xmm5, %%xmm0\n\t"
3052 "pxor %%xmm0, %%xmm9\n\t"
3053 "pxor %[lxfkey], %%xmm0\n"
3054 "movdqa %%xmm0, %[tmpbuf1]\n\t"
3055 : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
3056 [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
3057 : [lxfkey] "m" (*lxf_key)
3058 : "memory" );
3059 asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
3060 "movdqa %%xmm11, %%xmm0\n\t"
3061 "pxor %%xmm5, %%xmm0\n\t"
3062 "pxor %%xmm0, %%xmm10\n\t"
3063 "pxor %[lxfkey], %%xmm0\n\t"
3064 "movdqa %%xmm0, %[tmpbuf2]\n\t"
3065 : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3066 : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
3067 [lxfkey] "m" (*lxf_key)
3068 : "memory" );
3069 asm volatile ("movdqu %[l7], %%xmm0\n\t"
3070 "pxor %%xmm11, %%xmm5\n\t"
3071 "pxor %%xmm0, %%xmm5\n\t"
3072 "movdqa 0x10(%[key]), %%xmm0\n\t"
3073 "movdqu %[inbuf7], %%xmm11\n\t"
3074 "pxor %%xmm5, %%xmm11\n\t"
3075 :
3076 : [l7] "m" (*l),
3077 [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
3078 [key] "r" (ctx->keyschdec)
3079 : "memory" );
3080
3081 asm volatile ("cmpl $12, %[rounds]\n\t"
3082 "aesdec %%xmm0, %%xmm1\n\t"
3083 "aesdec %%xmm0, %%xmm2\n\t"
3084 "aesdec %%xmm0, %%xmm3\n\t"
3085 "aesdec %%xmm0, %%xmm4\n\t"
3086 "aesdec %%xmm0, %%xmm8\n\t"
3087 "aesdec %%xmm0, %%xmm9\n\t"
3088 "aesdec %%xmm0, %%xmm10\n\t"
3089 "aesdec %%xmm0, %%xmm11\n\t"
3090 "movdqa 0x20(%[key]), %%xmm0\n\t"
3091 "aesdec %%xmm0, %%xmm1\n\t"
3092 "aesdec %%xmm0, %%xmm2\n\t"
3093 "aesdec %%xmm0, %%xmm3\n\t"
3094 "aesdec %%xmm0, %%xmm4\n\t"
3095 "aesdec %%xmm0, %%xmm8\n\t"
3096 "aesdec %%xmm0, %%xmm9\n\t"
3097 "aesdec %%xmm0, %%xmm10\n\t"
3098 "aesdec %%xmm0, %%xmm11\n\t"
3099 "movdqa 0x30(%[key]), %%xmm0\n\t"
3100 "aesdec %%xmm0, %%xmm1\n\t"
3101 "aesdec %%xmm0, %%xmm2\n\t"
3102 "aesdec %%xmm0, %%xmm3\n\t"
3103 "aesdec %%xmm0, %%xmm4\n\t"
3104 "aesdec %%xmm0, %%xmm8\n\t"
3105 "aesdec %%xmm0, %%xmm9\n\t"
3106 "aesdec %%xmm0, %%xmm10\n\t"
3107 "aesdec %%xmm0, %%xmm11\n\t"
3108 "movdqa 0x40(%[key]), %%xmm0\n\t"
3109 "aesdec %%xmm0, %%xmm1\n\t"
3110 "aesdec %%xmm0, %%xmm2\n\t"
3111 "aesdec %%xmm0, %%xmm3\n\t"
3112 "aesdec %%xmm0, %%xmm4\n\t"
3113 "aesdec %%xmm0, %%xmm8\n\t"
3114 "aesdec %%xmm0, %%xmm9\n\t"
3115 "aesdec %%xmm0, %%xmm10\n\t"
3116 "aesdec %%xmm0, %%xmm11\n\t"
3117 "movdqa 0x50(%[key]), %%xmm0\n\t"
3118 "aesdec %%xmm0, %%xmm1\n\t"
3119 "aesdec %%xmm0, %%xmm2\n\t"
3120 "aesdec %%xmm0, %%xmm3\n\t"
3121 "aesdec %%xmm0, %%xmm4\n\t"
3122 "aesdec %%xmm0, %%xmm8\n\t"
3123 "aesdec %%xmm0, %%xmm9\n\t"
3124 "aesdec %%xmm0, %%xmm10\n\t"
3125 "aesdec %%xmm0, %%xmm11\n\t"
3126 "movdqa 0x60(%[key]), %%xmm0\n\t"
3127 "aesdec %%xmm0, %%xmm1\n\t"
3128 "aesdec %%xmm0, %%xmm2\n\t"
3129 "aesdec %%xmm0, %%xmm3\n\t"
3130 "aesdec %%xmm0, %%xmm4\n\t"
3131 "aesdec %%xmm0, %%xmm8\n\t"
3132 "aesdec %%xmm0, %%xmm9\n\t"
3133 "aesdec %%xmm0, %%xmm10\n\t"
3134 "aesdec %%xmm0, %%xmm11\n\t"
3135 "movdqa 0x70(%[key]), %%xmm0\n\t"
3136 "aesdec %%xmm0, %%xmm1\n\t"
3137 "aesdec %%xmm0, %%xmm2\n\t"
3138 "aesdec %%xmm0, %%xmm3\n\t"
3139 "aesdec %%xmm0, %%xmm4\n\t"
3140 "aesdec %%xmm0, %%xmm8\n\t"
3141 "aesdec %%xmm0, %%xmm9\n\t"
3142 "aesdec %%xmm0, %%xmm10\n\t"
3143 "aesdec %%xmm0, %%xmm11\n\t"
3144 "movdqa 0x80(%[key]), %%xmm0\n\t"
3145 "aesdec %%xmm0, %%xmm1\n\t"
3146 "aesdec %%xmm0, %%xmm2\n\t"
3147 "aesdec %%xmm0, %%xmm3\n\t"
3148 "aesdec %%xmm0, %%xmm4\n\t"
3149 "aesdec %%xmm0, %%xmm8\n\t"
3150 "aesdec %%xmm0, %%xmm9\n\t"
3151 "aesdec %%xmm0, %%xmm10\n\t"
3152 "aesdec %%xmm0, %%xmm11\n\t"
3153 "movdqa 0x90(%[key]), %%xmm0\n\t"
3154 "aesdec %%xmm0, %%xmm1\n\t"
3155 "aesdec %%xmm0, %%xmm2\n\t"
3156 "aesdec %%xmm0, %%xmm3\n\t"
3157 "aesdec %%xmm0, %%xmm4\n\t"
3158 "aesdec %%xmm0, %%xmm8\n\t"
3159 "aesdec %%xmm0, %%xmm9\n\t"
3160 "aesdec %%xmm0, %%xmm10\n\t"
3161 "aesdec %%xmm0, %%xmm11\n\t"
3162 "jb .Ldeclast%=\n\t"
3163 "movdqa 0xa0(%[key]), %%xmm0\n\t"
3164 "aesdec %%xmm0, %%xmm1\n\t"
3165 "aesdec %%xmm0, %%xmm2\n\t"
3166 "aesdec %%xmm0, %%xmm3\n\t"
3167 "aesdec %%xmm0, %%xmm4\n\t"
3168 "aesdec %%xmm0, %%xmm8\n\t"
3169 "aesdec %%xmm0, %%xmm9\n\t"
3170 "aesdec %%xmm0, %%xmm10\n\t"
3171 "aesdec %%xmm0, %%xmm11\n\t"
3172 "movdqa 0xb0(%[key]), %%xmm0\n\t"
3173 "aesdec %%xmm0, %%xmm1\n\t"
3174 "aesdec %%xmm0, %%xmm2\n\t"
3175 "aesdec %%xmm0, %%xmm3\n\t"
3176 "aesdec %%xmm0, %%xmm4\n\t"
3177 "aesdec %%xmm0, %%xmm8\n\t"
3178 "aesdec %%xmm0, %%xmm9\n\t"
3179 "aesdec %%xmm0, %%xmm10\n\t"
3180 "aesdec %%xmm0, %%xmm11\n\t"
3181 "je .Ldeclast%=\n\t"
3182 "movdqa 0xc0(%[key]), %%xmm0\n\t"
3183 "aesdec %%xmm0, %%xmm1\n\t"
3184 "aesdec %%xmm0, %%xmm2\n\t"
3185 "aesdec %%xmm0, %%xmm3\n\t"
3186 "aesdec %%xmm0, %%xmm4\n\t"
3187 "aesdec %%xmm0, %%xmm8\n\t"
3188 "aesdec %%xmm0, %%xmm9\n\t"
3189 "aesdec %%xmm0, %%xmm10\n\t"
3190 "aesdec %%xmm0, %%xmm11\n\t"
3191 "movdqa 0xd0(%[key]), %%xmm0\n\t"
3192 "aesdec %%xmm0, %%xmm1\n\t"
3193 "aesdec %%xmm0, %%xmm2\n\t"
3194 "aesdec %%xmm0, %%xmm3\n\t"
3195 "aesdec %%xmm0, %%xmm4\n\t"
3196 "aesdec %%xmm0, %%xmm8\n\t"
3197 "aesdec %%xmm0, %%xmm9\n\t"
3198 "aesdec %%xmm0, %%xmm10\n\t"
3199 "aesdec %%xmm0, %%xmm11\n\t"
3200
3201 ".Ldeclast%=:\n\t"
3202 :
3203 : [key] "r" (ctx->keyschdec),
3204 [rounds] "r" (ctx->rounds)
3205 : "cc", "memory");
3206
3207 asm volatile ("aesdeclast %%xmm12, %%xmm1\n\t"
3208 "aesdeclast %%xmm13, %%xmm2\n\t"
3209 "aesdeclast %%xmm14, %%xmm3\n\t"
3210 "aesdeclast %%xmm15, %%xmm4\n\t"
3211 "aesdeclast %[tmpbuf0],%%xmm8\n\t"
3212 "aesdeclast %[tmpbuf1],%%xmm9\n\t"
3213 "aesdeclast %[tmpbuf2],%%xmm10\n\t"
3214 :
3215 : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
3216 [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
3217 [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
3218 : "memory" );
3219 asm volatile ("aesdeclast %%xmm5, %%xmm11\n\t"
3220 "pxor %[lxfkey], %%xmm11\n\t"
3221 "movdqu %%xmm1, %[outbuf0]\n\t"
3222 "movdqu %%xmm2, %[outbuf1]\n\t"
3223 : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
3224 [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
3225 : [lxfkey] "m" (*lxf_key)
3226 : "memory" );
3227 asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t"
3228 "movdqu %%xmm4, %[outbuf3]\n\t"
3229 "movdqu %%xmm8, %[outbuf4]\n\t"
3230 : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
3231 [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
3232 [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
3233 :
3234 : "memory" );
3235 asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t"
3236 "movdqu %%xmm10, %[outbuf6]\n\t"
3237 "movdqu %%xmm11, %[outbuf7]\n\t"
3238 : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
3239 [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
3240 [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
3241 :
3242 : "memory" );
3243
3244 outbuf += 8*BLOCKSIZE;
3245 inbuf += 8*BLOCKSIZE;
3246 }
3247
3248 asm volatile ("pxor %[first_key], %%xmm5\n\t"
3249 "pxor %%xmm0, %%xmm0\n\t"
3250 "movdqu %%xmm0, %[lxfkey]\n\t"
3251 : [lxfkey] "=m" (*lxf_key)
3252 : [first_key] "m" (ctx->keyschdec[0][0][0])
3253 : "memory" );
3254
3255 aesni_cleanup_8_15();
3256 }
3257 #endif
3258
3259 for ( ;nblocks >= 4 ; nblocks -= 4 )
3260 {
3261 n += 4;
3262 l = aes_ocb_get_l(c, n);
3263
3264 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3265 /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */
3266 asm volatile ("movdqu %[l0], %%xmm0\n\t"
3267 "movdqu %[inbuf0], %%xmm1\n\t"
3268 "movdqu %[l0l1], %%xmm3\n\t"
3269 :
3270 : [l0] "m" (*c->u_mode.ocb.L[0]),
3271 [l0l1] "m" (*c->u_mode.ocb.L0L1),
3272 [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
3273 : "memory" );
3274 asm volatile ("movdqu %[l1], %%xmm4\n\t"
3275 "movdqu %[l3], %%xmm6\n\t"
3276 "pxor %%xmm5, %%xmm0\n\t"
3277 "pxor %%xmm0, %%xmm1\n\t"
3278 "movdqa %%xmm0, %[tmpbuf0]\n\t"
3279 : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
3280 : [l1] "m" (*c->u_mode.ocb.L[1]),
3281 [l3] "m" (*l)
3282 : "memory" );
3283 asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
3284 "pxor %%xmm5, %%xmm3\n\t"
3285 "pxor %%xmm3, %%xmm2\n\t"
3286 "movdqa %%xmm3, %[tmpbuf1]\n\t"
3287 : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
3288 : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
3289 : "memory" );
3290 asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
3291 "movdqu %[inbuf2], %%xmm3\n\t"
3292 "pxor %%xmm5, %%xmm0\n\t"
3293 "pxor %%xmm0, %%xmm3\n\t"
3294 "movdqa %%xmm0, %[tmpbuf2]\n\t"
3295 : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3296 :
3297 [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
3298 : "memory" );
3299 asm volatile ("pxor %%xmm6, %%xmm5\n\t"
3300 "pxor %%xmm4, %%xmm5\n\t"
3301 "movdqu %[inbuf3], %%xmm4\n\t"
3302 "pxor %%xmm5, %%xmm4\n\t"
3303 :
3304 : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
3305 : "memory" );
3306
3307 do_aesni_dec_vec4 (ctx);
3308
3309 asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t"
3310 "movdqu %%xmm1, %[outbuf0]\n\t"
3311 "pxor %[tmpbuf1],%%xmm2\n\t"
3312 "movdqu %%xmm2, %[outbuf1]\n\t"
3313 : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
3314 [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
3315 : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
3316 [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
3317 : "memory" );
3318 asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t"
3319 "movdqu %%xmm3, %[outbuf2]\n\t"
3320 "pxor %%xmm5, %%xmm4\n\t"
3321 "movdqu %%xmm4, %[outbuf3]\n\t"
3322 : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
3323 [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
3324 : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
3325 : "memory" );
3326
3327 outbuf += 4*BLOCKSIZE;
3328 inbuf += 4*BLOCKSIZE;
3329 }
3330
3331 for ( ;nblocks; nblocks-- )
3332 {
3333 l = aes_ocb_get_l(c, ++n);
3334
3335 /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3336 /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
3337 /* Checksum_i = Checksum_{i-1} xor P_i */
3338 asm volatile ("movdqu %[l], %%xmm1\n\t"
3339 "movdqu %[inbuf], %%xmm0\n\t"
3340 "pxor %%xmm1, %%xmm5\n\t"
3341 "pxor %%xmm5, %%xmm0\n\t"
3342 :
3343 : [l] "m" (*l),
3344 [inbuf] "m" (*inbuf)
3345 : "memory" );
3346
3347 do_aesni_dec (ctx);
3348
3349 asm volatile ("pxor %%xmm5, %%xmm0\n\t"
3350 "movdqu %%xmm0, %[outbuf]\n\t"
3351 : [outbuf] "=m" (*outbuf)
3352 :
3353 : "memory" );
3354
3355 inbuf += BLOCKSIZE;
3356 outbuf += BLOCKSIZE;
3357 }
3358
3359 c->u_mode.ocb.data_nblocks = n;
3360 asm volatile ("movdqu %%xmm5, %[iv]\n\t"
3361 : [iv] "=m" (*c->u_iv.iv)
3362 :
3363 : "memory" );
3364
3365 asm volatile ("pxor %%xmm0, %%xmm0\n\t"
3366 "movdqa %%xmm0, %[tmpbuf0]\n\t"
3367 "movdqa %%xmm0, %[tmpbuf1]\n\t"
3368 "movdqa %%xmm0, %[tmpbuf2]\n\t"
3369 : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
3370 [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
3371 [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3372 :
3373 : "memory" );
3374
3375 aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
3376
3377 aesni_cleanup ();
3378 aesni_cleanup_2_7 ();
3379
3380 return 0;
3381 }
3382
3383
3384 size_t ASM_FUNC_ATTR
_gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks,int encrypt)3385 _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
3386 const void *inbuf_arg, size_t nblocks, int encrypt)
3387 {
3388 if (encrypt)
3389 return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
3390 else
3391 return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
3392 }
3393
3394
/* Authenticate NBLOCKS blocks of associated data from ABUF_ARG for
 * OCB mode.  Maintains the running AAD offset in xmm5 and the running
 * AAD checksum (Sum) in xmm6 throughout; both are written back to the
 * cipher handle at the end.  Always returns 0.  */
size_t ASM_FUNC_ATTR
_gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                          size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  const unsigned char *abuf = abuf_arg;
  u64 n = c->u_mode.ocb.aad_nblocks;
  const unsigned char *l;
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Preload Offset and Sum */
  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                "movdqu %[ctr], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "m" (*c->u_mode.ocb.aad_sum)
                : "memory" );

  /* Handle blocks one at a time until the AAD block counter N is a
   * multiple of four, so that the unrolled loops below can use the
   * precomputed L[0], L[1] and L0L1 values.  */
  for ( ;nblocks && n % 4; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
      asm volatile ("movdqu %[l], %%xmm1\n\t"
                    "movdqu %[abuf], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm5\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [abuf] "m" (*abuf)
                    : "memory" );

      /* Encrypt the block in xmm0 in place.  */
      do_aesni_enc (ctx);

      /* Fold the ciphertext into the Sum.  */
      asm volatile ("pxor %%xmm0, %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += BLOCKSIZE;
    }

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      /* Cache the offset deltas for an 8-block stride:
       * xmm7 = L[0], xmm12 = L[0]^L[1], xmm13 = L[1].  */
      asm volatile ("movdqu %[l0], %%xmm7\n\t"
                    "movdqu %[l0l1], %%xmm12\n\t"
                    "movdqu %[l1], %%xmm13\n\t"
                    :
                    : [l0] "m" (*c->u_mode.ocb.L[0]),
                      [l0l1] "m" (*c->u_mode.ocb.L0L1),
                      [l1] "m" (*c->u_mode.ocb.L[1])
                    : "memory" );

      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          n += 4;
          l = aes_ocb_get_l(c, n);

          /* xmm0 = L_{ntz(n)} ^ L[1]: combined offset delta for
           * block 3 of this 8-block group.  */
          asm volatile ("movdqu %[l3], %%xmm0\n\t"
                        "pxor %%xmm13, %%xmm0\n\t"
                        :
                        : [l3] "m" (*l)
                        : "memory" );

          n += 4;
          l = aes_ocb_get_l(c, n);

          /* xmm14 = L_{ntz(n)} ^ L[1]: combined delta for block 7.  */
          asm volatile ("movdqu %[l7], %%xmm14\n\t"
                        "pxor %%xmm13, %%xmm14\n\t"
                        :
                        : [l7] "m" (*l)
                        : "memory" );

          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
          /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
          asm volatile ("movdqu %[abuf0], %%xmm1\n\t"
                        "movdqu %[abuf1], %%xmm2\n\t"
                        "movdqu %[abuf2], %%xmm3\n\t"
                        "movdqu %[abuf3], %%xmm4\n\t"
                        :
                        : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)),
                          [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)),
                          [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)),
                          [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
                        : "memory" );
          asm volatile ("movdqu %[abuf4], %%xmm8\n\t"
                        "movdqu %[abuf5], %%xmm9\n\t"
                        "movdqu %[abuf6], %%xmm10\n\t"
                        "movdqu %[abuf7], %%xmm11\n\t"
                        :
                        : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)),
                          [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)),
                          [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)),
                          [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
                        : "memory" );
          /* XOR each block with its Offset and with round key 0,
           * advancing the running offset (xmm5) as we go.  The first
           * AddRoundKey is merged here so do_aesni_enc_vec8 can start
           * with the round-1 aesenc directly.  */
          asm volatile ("pxor %%xmm7, %%xmm1\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"

                        "pxor %%xmm12, %%xmm2\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"

                        "pxor %%xmm13, %%xmm3\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"

                        "pxor %%xmm0, %%xmm5\n\t"
                        "movdqa (%[key]), %%xmm0\n\t"
                        "pxor %%xmm5, %%xmm4\n\t"

                        "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
                        "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
                        "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
                        "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */

                        "pxor %%xmm7, %%xmm8\n\t"
                        "pxor %%xmm5, %%xmm8\n\t"

                        "pxor %%xmm12, %%xmm9\n\t"
                        "pxor %%xmm5, %%xmm9\n\t"

                        "pxor %%xmm13, %%xmm10\n\t"
                        "pxor %%xmm5, %%xmm10\n\t"

                        "pxor %%xmm14, %%xmm5\n\t"
                        "pxor %%xmm5, %%xmm11\n\t"

                        "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
                        "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
                        "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
                        "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
                        :
                        : [key] "r" (ctx->keyschenc)
                        : "memory" );

          /* Run middle rounds on xmm1-4/xmm8-11; leaves the final
           * round key in xmm0.  */
          do_aesni_enc_vec8 (ctx);

          /* Finish the last round and fold all eight ciphertext
           * blocks into the Sum (xmm6).  */
          asm volatile (
                        "aesenclast %%xmm0, %%xmm1\n\t"
                        "aesenclast %%xmm0, %%xmm2\n\t"
                        "aesenclast %%xmm0, %%xmm3\n\t"
                        "aesenclast %%xmm0, %%xmm4\n\t"
                        "aesenclast %%xmm0, %%xmm8\n\t"
                        "aesenclast %%xmm0, %%xmm9\n\t"
                        "aesenclast %%xmm0, %%xmm10\n\t"
                        "aesenclast %%xmm0, %%xmm11\n\t"
                        "pxor %%xmm2, %%xmm1\n\t"
                        "pxor %%xmm3, %%xmm1\n\t"
                        "pxor %%xmm4, %%xmm1\n\t"
                        "pxor %%xmm8, %%xmm1\n\t"
                        "pxor %%xmm9, %%xmm6\n\t"
                        "pxor %%xmm10, %%xmm6\n\t"
                        "pxor %%xmm11, %%xmm6\n\t"
                        "pxor %%xmm1, %%xmm6\n\t"
                        :
                        :
                        : "memory" );

          abuf += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  /* Four blocks at a time using do_aesni_enc_vec4.  */
  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      n += 4;
      l = aes_ocb_get_l(c, n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
      asm volatile ("movdqu %[l0], %%xmm0\n\t"
                    "movdqu %[abuf0], %%xmm1\n\t"
                    "movdqu %[l0l1], %%xmm3\n\t"
                    :
                    : [l0] "m" (*c->u_mode.ocb.L[0]),
                      [l0l1] "m" (*c->u_mode.ocb.L0L1),
                      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("movdqu %[l1], %%xmm4\n\t"
                    "movdqu %[l3], %%xmm7\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm1\n\t"
                    :
                    : [l1] "m" (*c->u_mode.ocb.L[1]),
                      [l3] "m" (*l)
                    : "memory" );
      asm volatile ("movdqu %[abuf1], %%xmm2\n\t"
                    "pxor %%xmm5, %%xmm3\n\t"
                    "pxor %%xmm3, %%xmm2\n\t"
                    :
                    : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
                    "movdqu %[abuf2], %%xmm3\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm3\n\t"
                    :
                    : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
                    : "memory" );
      asm volatile ("pxor %%xmm7, %%xmm5\n\t"
                    "pxor %%xmm4, %%xmm5\n\t"
                    "movdqu %[abuf3], %%xmm4\n\t"
                    "pxor %%xmm5, %%xmm4\n\t"
                    :
                    : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
                    : "memory" );

      do_aesni_enc_vec4 (ctx);

      /* Fold the four ciphertext blocks into the Sum.  */
      asm volatile ("pxor %%xmm1, %%xmm6\n\t"
                    "pxor %%xmm2, %%xmm6\n\t"
                    "pxor %%xmm3, %%xmm6\n\t"
                    "pxor %%xmm4, %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += 4*BLOCKSIZE;
    }

  /* Remaining blocks, one at a time.  */
  for ( ;nblocks; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
      asm volatile ("movdqu %[l], %%xmm1\n\t"
                    "movdqu %[abuf], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm5\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [abuf] "m" (*abuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor %%xmm0, %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += BLOCKSIZE;
    }

  /* Store the updated block counter, Offset and Sum back into the
   * cipher handle.  */
  c->u_mode.ocb.aad_nblocks = n;
  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                "movdqu %%xmm6, %[ctr]\n\t"
                : [iv] "=m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "=m" (*c->u_mode.ocb.aad_sum)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();

  return 0;
}
3662
3663
/* Mask constants for the XTS tweak update (multiplication by x in
 * GF(2^128)): after the sign-extended carry mask is ANDed with this
 * vector, 0x87 (the reduction polynomial) is XORed into the low
 * quadword and 0x01 propagates the bit-63 carry into the high
 * quadword.  Must be 16-byte aligned for use with movdqa.  */
static const u64 xts_gfmul_const[2] __attribute__ ((aligned (16))) =
  { 0x87, 0x01 };
3666
3667
/* XTS bulk encryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.
 * TWEAK holds the current 128-bit tweak on entry and is updated on
 * return.  The running tweak lives in xmm5 and the GF(2^128) doubling
 * constant in xmm6 for the whole function.  The repeated
 * pshufd/psrad/paddq/pand/pxor sequence multiplies the tweak by x in
 * GF(2^128) (doubling with reduction by 0x87).  */
static void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Preload Tweak */
  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
                "movdqa %[gfmul], %%xmm6\n\t"
                :
                : [tweak] "m" (*tweak),
                  [gfmul] "m" (*xts_gfmul_const)
                : "memory" );

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          /* Per-block pattern below: XOR plaintext block i with the
           * current tweak, save that tweak for the final XOR
           * (blocks 0-4 in xmm7/xmm12-15; blocks 5-7 spilled to
           * outbuf, which is rewritten later), then advance the
           * tweak by one GF doubling.  xmm11 carries the replicated
           * carry words across blocks.  */
          asm volatile ("pshufd $0x13, %%xmm5, %%xmm11\n\t"
                        "movdqu %[inbuf0], %%xmm1\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "movdqa %%xmm5, %%xmm7\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf0] "m" (*(inbuf + 0 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "movdqa %%xmm5, %%xmm12\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf1] "m" (*(inbuf + 1 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "movdqa %%xmm5, %%xmm13\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf2] "m" (*(inbuf + 2 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
                        "pxor %%xmm5, %%xmm4\n\t"
                        "movdqa %%xmm5, %%xmm14\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf3] "m" (*(inbuf + 3 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
                        "pxor %%xmm5, %%xmm8\n\t"
                        "movdqa %%xmm5, %%xmm15\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf4] "m" (*(inbuf + 4 * 16))
                        : "memory" );

          /* No spare xmm registers: stash the tweaks for blocks 5-7
           * in the output buffer; they are overwritten with
           * ciphertext below.  */
          asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
                        "pxor %%xmm5, %%xmm9\n\t"
                        "movdqu %%xmm5, %[outbuf5]\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf5] "=m" (*(outbuf + 5 * 16))
                        : [inbuf5] "m" (*(inbuf + 5 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
                        "pxor %%xmm5, %%xmm10\n\t"
                        "movdqu %%xmm5, %[outbuf6]\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf6] "=m" (*(outbuf + 6 * 16))
                        : [inbuf6] "m" (*(inbuf + 6 * 16))
                        : "memory" );

          asm volatile ("movdqa %%xmm11, %%xmm0\n\t"
                        "movdqu %[inbuf7], %%xmm11\n\t"
                        "pxor %%xmm5, %%xmm11\n\t"
                        "movdqu %%xmm5, %[outbuf7]\n\t"

                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf7] "=m" (*(outbuf + 7 * 16))
                        : [inbuf7] "m" (*(inbuf + 7 * 16))
                        : "memory" );

          /* Encrypt xmm1-4/xmm8-11 through all but the final AES
           * round.  The cmpl/jb/je pair skips the extra rounds for
           * 128-bit (10 rounds) and 192-bit (12 rounds) keys; at
           * .Lenclast, xmm0 holds the final round key.  */
          asm volatile ("cmpl $12, %[rounds]\n\t"
                        "movdqa (%[key]), %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm1\n\t"
                        "pxor %%xmm0, %%xmm2\n\t"
                        "pxor %%xmm0, %%xmm3\n\t"
                        "pxor %%xmm0, %%xmm4\n\t"
                        "pxor %%xmm0, %%xmm8\n\t"
                        "pxor %%xmm0, %%xmm9\n\t"
                        "pxor %%xmm0, %%xmm10\n\t"
                        "pxor %%xmm0, %%xmm11\n\t"
                        "movdqa 0x10(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x20(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x30(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x40(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x50(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x60(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x70(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x80(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0x90(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xa0(%[key]), %%xmm0\n\t"
                        "jb .Lenclast%=\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xb0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xc0(%[key]), %%xmm0\n\t"
                        "je .Lenclast%=\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xd0(%[key]), %%xmm0\n\t"
                        "aesenc %%xmm0, %%xmm1\n\t"
                        "aesenc %%xmm0, %%xmm2\n\t"
                        "aesenc %%xmm0, %%xmm3\n\t"
                        "aesenc %%xmm0, %%xmm4\n\t"
                        "aesenc %%xmm0, %%xmm8\n\t"
                        "aesenc %%xmm0, %%xmm9\n\t"
                        "aesenc %%xmm0, %%xmm10\n\t"
                        "aesenc %%xmm0, %%xmm11\n\t"
                        "movdqa 0xe0(%[key]), %%xmm0\n\t"

                        ".Lenclast%=:\n\t"
                        :
                        : [key] "r" (ctx->keyschenc),
                          [rounds] "rm" (ctx->rounds)
                        : "cc", "memory");

          /* Merge the final AddRoundKey into each saved tweak
           * (tweak ^ last-round-key) so a single aesenclast performs
           * both the last round and the closing tweak XOR; the tweaks
           * for blocks 5-7 are reloaded from the output buffer where
           * they were stashed above, then all ciphertext is stored.  */
          asm volatile ("pxor %%xmm0, %%xmm7\n\t"
                        "pxor %%xmm0, %%xmm12\n\t"
                        "pxor %%xmm0, %%xmm13\n\t"
                        "pxor %%xmm0, %%xmm14\n\t"
                        "aesenclast %%xmm7, %%xmm1\n\t"
                        "aesenclast %%xmm12, %%xmm2\n\t"
                        "aesenclast %%xmm13, %%xmm3\n\t"
                        "aesenclast %%xmm14, %%xmm4\n\t"
                        "movdqu 5*16(%[outbuf]), %%xmm12\n\t"
                        "movdqu 6*16(%[outbuf]), %%xmm13\n\t"
                        "movdqu 7*16(%[outbuf]), %%xmm14\n\t"
                        "pxor %%xmm0, %%xmm15\n\t"
                        "pxor %%xmm0, %%xmm12\n\t"
                        "pxor %%xmm0, %%xmm13\n\t"
                        "pxor %%xmm0, %%xmm14\n\t"
                        "aesenclast %%xmm15, %%xmm8\n\t"
                        "aesenclast %%xmm12, %%xmm9\n\t"
                        "aesenclast %%xmm13, %%xmm10\n\t"
                        "aesenclast %%xmm14, %%xmm11\n\t"
                        "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
                        "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
                        "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
                        "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
                        "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
                        "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
                        "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
                        "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
                        :
                        : [outbuf] "r" (outbuf)
                        : "memory" );

          outbuf += 8*BLOCKSIZE;
          inbuf += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  /* Four blocks at a time; tweaks are stashed in the output buffer
   * and XORed back in after do_aesni_enc_vec4.  */
  for ( ;nblocks >= 4; nblocks -= 4 )
    {
      asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
                    "movdqu %[inbuf0], %%xmm1\n\t"
                    "pxor %%xmm5, %%xmm1\n\t"
                    "movdqu %%xmm5, %[outbuf0]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf0] "=m" (*(outbuf + 0 * 16))
                    : [inbuf0] "m" (*(inbuf + 0 * 16))
                    : "memory" );

      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
                    "pxor %%xmm5, %%xmm2\n\t"
                    "movdqu %%xmm5, %[outbuf1]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf1] "=m" (*(outbuf + 1 * 16))
                    : [inbuf1] "m" (*(inbuf + 1 * 16))
                    : "memory" );

      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
                    "pxor %%xmm5, %%xmm3\n\t"
                    "movdqu %%xmm5, %[outbuf2]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf2] "=m" (*(outbuf + 2 * 16))
                    : [inbuf2] "m" (*(inbuf + 2 * 16))
                    : "memory" );

      asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf3], %%xmm4\n\t"
                    "pxor %%xmm5, %%xmm4\n\t"
                    "movdqu %%xmm5, %[outbuf3]\n\t"

                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf3] "=m" (*(outbuf + 3 * 16))
                    : [inbuf3] "m" (*(inbuf + 3 * 16))
                    : "memory" );

      do_aesni_enc_vec4 (ctx);

      /* XOR the stashed tweaks back in and store the ciphertext.  */
      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm1\n\t"
                    "movdqu %[outbuf1], %%xmm0\n\t"
                    "movdqu %%xmm1, %[outbuf0]\n\t"
                    "movdqu %[outbuf2], %%xmm1\n\t"
                    "pxor %%xmm0, %%xmm2\n\t"
                    "movdqu %[outbuf3], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm3\n\t"
                    "pxor %%xmm0, %%xmm4\n\t"
                    "movdqu %%xmm2, %[outbuf1]\n\t"
                    "movdqu %%xmm3, %[outbuf2]\n\t"
                    "movdqu %%xmm4, %[outbuf3]\n\t"
                    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
                      [outbuf1] "+m" (*(outbuf + 1 * 16)),
                      [outbuf2] "+m" (*(outbuf + 2 * 16)),
                      [outbuf3] "+m" (*(outbuf + 3 * 16))
                    :
                    : "memory" );

      outbuf += BLOCKSIZE * 4;
      inbuf += BLOCKSIZE * 4;
    }

  /* Remaining blocks, one at a time; xmm4 keeps the block's tweak
   * across do_aesni_enc.  */
  for ( ;nblocks; nblocks-- )
    {
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    "movdqa %%xmm5, %%xmm4\n\t"

                    "pshufd $0x13, %%xmm5, %%xmm1\n\t"
                    "psrad $31, %%xmm1\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor %%xmm4, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf += BLOCKSIZE;
    }

  /* Write back the advanced tweak for the next call.  */
  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
                : [tweak] "=m" (*tweak)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
4101
4102
static void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  /* AES-XTS decryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.
     TWEAK points to the 16-byte XTS tweak on entry and is updated on
     exit so a split operation can be continued.  Wide path processes
     8 blocks per iteration (x86-64 only), then 4, then single blocks.

     Register conventions inside this function (SSE state is carried
     across separate asm statements; the file-level "no-sse" pragma keeps
     the compiler from touching xmm registers in between):
       xmm5 = current tweak
       xmm6 = GF(2^128) reduction constant (xts_gfmul_const)  */
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Decryption needs the inverse key schedule; build it lazily on
     first use.  */
  if ( !ctx->decryption_prepared )
    {
      do_aesni_prepare_decryption ( ctx );
      ctx->decryption_prepared = 1;
    }

  /* Preload Tweak */
  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
                "movdqa %[gfmul], %%xmm6\n\t"
                :
                : [tweak] "m" (*tweak),
                  [gfmul] "m" (*xts_gfmul_const)
                : "memory" );

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8 ; nblocks -= 8 )
        {
          /* Load 8 ciphertext blocks into xmm1..xmm4/xmm8..xmm11, XOR
             each with its per-block tweak, and advance the tweak eight
             times.  Each step multiplies the tweak by x in GF(2^128):
             the 128-bit value is doubled (paddq) and the reduction
             constant is XORed in when the top bit carries out; the
             carry mask comes from the sign words kept in xmm11
             (pshufd $0x13 / psrad $31 / pand).  Tweaks for blocks 0-4
             are parked in xmm7/xmm12/xmm13/xmm14/xmm15; tweaks for
             blocks 5-7 are spilled temporarily into the output buffer
             and reloaded after the cipher rounds.  */
          asm volatile ("pshufd $0x13, %%xmm5, %%xmm11\n\t"
                        "movdqu %[inbuf0], %%xmm1\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "movdqa %%xmm5, %%xmm7\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf0] "m" (*(inbuf + 0 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "movdqa %%xmm5, %%xmm12\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf1] "m" (*(inbuf + 1 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "movdqa %%xmm5, %%xmm13\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf2] "m" (*(inbuf + 2 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
                        "pxor %%xmm5, %%xmm4\n\t"
                        "movdqa %%xmm5, %%xmm14\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf3] "m" (*(inbuf + 3 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
                        "pxor %%xmm5, %%xmm8\n\t"
                        "movdqa %%xmm5, %%xmm15\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        :
                        : [inbuf4] "m" (*(inbuf + 4 * 16))
                        : "memory" );

          /* Tweaks 5-7 are spilled to outbuf (the blocks there are not
             yet written, so the space is free scratch).  */
          asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
                        "pxor %%xmm5, %%xmm9\n\t"
                        "movdqu %%xmm5, %[outbuf5]\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf5] "=m" (*(outbuf + 5 * 16))
                        : [inbuf5] "m" (*(inbuf + 5 * 16))
                        : "memory" );

          asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
                        "pxor %%xmm5, %%xmm10\n\t"
                        "movdqu %%xmm5, %[outbuf6]\n\t"

                        "movdqa %%xmm11, %%xmm0\n\t"
                        "paddd %%xmm11, %%xmm11\n\t"
                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf6] "=m" (*(outbuf + 6 * 16))
                        : [inbuf6] "m" (*(inbuf + 6 * 16))
                        : "memory" );

          /* xmm11 is recycled from carry helper to ciphertext block 7
             here; its old value is consumed into xmm0 first.  */
          asm volatile ("movdqa %%xmm11, %%xmm0\n\t"
                        "movdqu %[inbuf7], %%xmm11\n\t"
                        "pxor %%xmm5, %%xmm11\n\t"
                        "movdqu %%xmm5, %[outbuf7]\n\t"

                        "psrad $31, %%xmm0\n\t"
                        "paddq %%xmm5, %%xmm5\n\t"
                        "pand %%xmm6, %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm5\n\t"
                        : [outbuf7] "=m" (*(outbuf + 7 * 16))
                        : [inbuf7] "m" (*(inbuf + 7 * 16))
                        : "memory" );

          /* Run all but the last AES decryption round on the 8 blocks.
             The cmpl + jb/je pair selects 10, 12 or 14 rounds from
             ctx->rounds (AES-128/192/256).  On exit from the label,
             xmm0 holds the final round key for the next asm block.  */
          asm volatile ("cmpl $12, %[rounds]\n\t"
                        "movdqa (%[key]), %%xmm0\n\t"
                        "pxor %%xmm0, %%xmm1\n\t"
                        "pxor %%xmm0, %%xmm2\n\t"
                        "pxor %%xmm0, %%xmm3\n\t"
                        "pxor %%xmm0, %%xmm4\n\t"
                        "pxor %%xmm0, %%xmm8\n\t"
                        "pxor %%xmm0, %%xmm9\n\t"
                        "pxor %%xmm0, %%xmm10\n\t"
                        "pxor %%xmm0, %%xmm11\n\t"
                        "movdqa 0x10(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x20(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x30(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x40(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x50(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x60(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x70(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x80(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0x90(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0xa0(%[key]), %%xmm0\n\t"
                        "jb .Ldeclast%=\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0xb0(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0xc0(%[key]), %%xmm0\n\t"
                        "je .Ldeclast%=\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0xd0(%[key]), %%xmm0\n\t"
                        "aesdec %%xmm0, %%xmm1\n\t"
                        "aesdec %%xmm0, %%xmm2\n\t"
                        "aesdec %%xmm0, %%xmm3\n\t"
                        "aesdec %%xmm0, %%xmm4\n\t"
                        "aesdec %%xmm0, %%xmm8\n\t"
                        "aesdec %%xmm0, %%xmm9\n\t"
                        "aesdec %%xmm0, %%xmm10\n\t"
                        "aesdec %%xmm0, %%xmm11\n\t"
                        "movdqa 0xe0(%[key]), %%xmm0\n\t"

                        ".Ldeclast%=:\n\t"
                        :
                        : [key] "r" (ctx->keyschdec),
                          [rounds] "rm" (ctx->rounds)
                        : "cc", "memory");

          /* Fold each block's tweak into the last round key: AESDECLAST
             XORs its round-key operand into the state, so using
             (tweak ^ last-key) applies the final round and the XTS
             output XOR in one instruction.  Tweaks 5-7 are reloaded
             from the outbuf scratch area before the plaintext is
             stored over it.  */
          asm volatile ("pxor %%xmm0, %%xmm7\n\t"
                        "pxor %%xmm0, %%xmm12\n\t"
                        "pxor %%xmm0, %%xmm13\n\t"
                        "pxor %%xmm0, %%xmm14\n\t"
                        "aesdeclast %%xmm7, %%xmm1\n\t"
                        "aesdeclast %%xmm12, %%xmm2\n\t"
                        "aesdeclast %%xmm13, %%xmm3\n\t"
                        "aesdeclast %%xmm14, %%xmm4\n\t"
                        "movdqu 5*16(%[outbuf]), %%xmm12\n\t"
                        "movdqu 6*16(%[outbuf]), %%xmm13\n\t"
                        "movdqu 7*16(%[outbuf]), %%xmm14\n\t"
                        "pxor %%xmm0, %%xmm15\n\t"
                        "pxor %%xmm0, %%xmm12\n\t"
                        "pxor %%xmm0, %%xmm13\n\t"
                        "pxor %%xmm0, %%xmm14\n\t"
                        "aesdeclast %%xmm15, %%xmm8\n\t"
                        "aesdeclast %%xmm12, %%xmm9\n\t"
                        "aesdeclast %%xmm13, %%xmm10\n\t"
                        "aesdeclast %%xmm14, %%xmm11\n\t"
                        "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
                        "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
                        "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
                        "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
                        "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
                        "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
                        "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
                        "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
                        :
                        : [outbuf] "r" (outbuf)
                        : "memory" );

          outbuf += 8*BLOCKSIZE;
          inbuf  += 8*BLOCKSIZE;
        }

      aesni_cleanup_8_15();
    }
#endif

  /* Four blocks at a time.  All four per-block tweaks are spilled to
     the output buffer before do_aesni_dec_vec4() and XORed back into
     the results afterwards.  xmm4 carries the sign words for the
     GF(2^128) carry mask between iterations of the tweak update.  */
  for ( ;nblocks >= 4; nblocks -= 4 )
    {
      asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
                    "movdqu %[inbuf0], %%xmm1\n\t"
                    "pxor %%xmm5, %%xmm1\n\t"
                    "movdqu %%xmm5, %[outbuf0]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf0] "=m" (*(outbuf + 0 * 16))
                    : [inbuf0] "m" (*(inbuf + 0 * 16))
                    : "memory" );

      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
                    "pxor %%xmm5, %%xmm2\n\t"
                    "movdqu %%xmm5, %[outbuf1]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf1] "=m" (*(outbuf + 1 * 16))
                    : [inbuf1] "m" (*(inbuf + 1 * 16))
                    : "memory" );

      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
                    "pxor %%xmm5, %%xmm3\n\t"
                    "movdqu %%xmm5, %[outbuf2]\n\t"

                    "movdqa %%xmm4, %%xmm0\n\t"
                    "paddd %%xmm4, %%xmm4\n\t"
                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf2] "=m" (*(outbuf + 2 * 16))
                    : [inbuf2] "m" (*(inbuf + 2 * 16))
                    : "memory" );

      /* xmm4 switches role here: old carry helper moves to xmm0,
         ciphertext block 3 is loaded into xmm4 as expected by
         do_aesni_dec_vec4.  */
      asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf3], %%xmm4\n\t"
                    "pxor %%xmm5, %%xmm4\n\t"
                    "movdqu %%xmm5, %[outbuf3]\n\t"

                    "psrad $31, %%xmm0\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm5\n\t"
                    : [outbuf3] "=m" (*(outbuf + 3 * 16))
                    : [inbuf3] "m" (*(inbuf + 3 * 16))
                    : "memory" );

      do_aesni_dec_vec4 (ctx);

      /* XOR the spilled tweaks back into the decrypted blocks and
         store the plaintext.  */
      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm1\n\t"
                    "movdqu %[outbuf1], %%xmm0\n\t"
                    "movdqu %%xmm1, %[outbuf0]\n\t"
                    "movdqu %[outbuf2], %%xmm1\n\t"
                    "pxor %%xmm0, %%xmm2\n\t"
                    "movdqu %[outbuf3], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm3\n\t"
                    "pxor %%xmm0, %%xmm4\n\t"
                    "movdqu %%xmm2, %[outbuf1]\n\t"
                    "movdqu %%xmm3, %[outbuf2]\n\t"
                    "movdqu %%xmm4, %[outbuf3]\n\t"
                    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
                      [outbuf1] "+m" (*(outbuf + 1 * 16)),
                      [outbuf2] "+m" (*(outbuf + 2 * 16)),
                      [outbuf3] "+m" (*(outbuf + 3 * 16))
                    :
                    : "memory" );

      outbuf += BLOCKSIZE * 4;
      inbuf  += BLOCKSIZE * 4;
    }

  /* Remaining blocks one at a time.  The current tweak is saved in
     xmm4 for the final XOR while xmm5 is advanced to the next tweak.  */
  for ( ;nblocks; nblocks-- )
    {
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    "movdqa %%xmm5, %%xmm4\n\t"

                    "pshufd $0x13, %%xmm5, %%xmm1\n\t"
                    "psrad $31, %%xmm1\n\t"
                    "paddq %%xmm5, %%xmm5\n\t"
                    "pand %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_dec (ctx);

      asm volatile ("pxor %%xmm4, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  /* Write the updated tweak back for the caller.  */
  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
                : [tweak] "=m" (*tweak)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
4542
4543
4544 void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_crypt(RIJNDAEL_context * ctx,unsigned char * tweak,unsigned char * outbuf,const unsigned char * inbuf,size_t nblocks,int encrypt)4545 _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
4546 unsigned char *outbuf, const unsigned char *inbuf,
4547 size_t nblocks, int encrypt)
4548 {
4549 if (encrypt)
4550 _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
4551 else
4552 _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
4553 }
4554
4555 #if __clang__
4556 # pragma clang attribute pop
4557 #endif
4558
4559 #endif /* USE_AESNI */
4560