1 /* AES-NI accelerated AES for Libgcrypt
2  * Copyright (C) 2000, 2001, 2002, 2003, 2007,
3  *               2008, 2011, 2012 Free Software Foundation, Inc.
4  *
5  * This file is part of Libgcrypt.
6  *
7  * Libgcrypt is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as
9  * published by the Free Software Foundation; either version 2.1 of
10  * the License, or (at your option) any later version.
11  *
12  * Libgcrypt is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include <config.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h> /* for memcmp() */
25 
26 #include "types.h"  /* for byte and u32 typedefs */
27 #include "g10lib.h"
28 #include "cipher.h"
29 #include "bufhelp.h"
30 #include "cipher-selftest.h"
31 #include "rijndael-internal.h"
32 #include "./cipher-internal.h"
33 
34 
#ifdef USE_AESNI


#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
#  pragma GCC target("no-sse")
#endif
#if __clang__
#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
#endif


/* Attribute helpers for the asm-heavy functions below.  They must not be
   instrumented: instrumentation inserts external function calls, which is
   forbidden between the aesni_prepare()/aesni_cleanup() pairs defined
   further down (SSE registers hold key material at that point).  */
#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
55 
/* A 128-bit quantity viewed as four 32-bit words.  Declared packed with
   byte alignment so it can overlay arbitrary (possibly unaligned) buffers,
   and may_alias so that casting byte pointers to u128_t* does not violate
   strict-aliasing rules.  */
typedef struct u128_s
{
  u32 a, b, c, d;
} __attribute__((packed, aligned(1), may_alias)) u128_t;
60 
61 
/* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
   because of 'pragma target'.  Returns the precomputed OCB offset
   L[ntz(n)] for block number N.  */
static ASM_FUNC_ATTR_INLINE const unsigned char *
aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
{
  unsigned long ntz;

  /* Assumes that N != 0 (BSF's result is undefined for a zero input).
     'rep;bsf' is the TZCNT encoding on CPUs with BMI1 and falls back to
     plain BSF on older ones; both yield the trailing-zero count here.  */
  asm ("rep;bsfl %k[low], %k[ntz]\n\t"
        : [ntz] "=r" (ntz)
        : [low] "r" ((unsigned long)n)
        : "cc");

  return c->u_mode.ocb.L[ntz];
}
77 
78 
/* Two macros to be called prior and after the use of AESNI
   instructions.  There should be no external function calls between
   the use of these macros.  Their purpose is to make sure that the
   SSE registers are cleared and won't reveal any information about
   the key or the data.  */
#ifdef __WIN64__
/* XMM6-XMM15 are callee-saved registers on WIN64, so on that ABI the
   prepare macros must spill them to stack buffers and the cleanup macros
   restore them (which also wipes any sensitive values they held).  */
# define aesni_prepare_2_7_variable char win64tmp[16 * 2]
# define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8]
# define aesni_prepare() do { } while (0)
# define aesni_prepare_2_7()                                            \
   do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
		      "movdqu %%xmm7, %1\n\t"                           \
                      : "=m" (*win64tmp), "=m" (*(win64tmp+16))         \
                      :                                                 \
                      : "memory");                                      \
   } while (0)
# define aesni_prepare_8_15()                                           \
   do { asm volatile ("movdqu %%xmm8,  0*16(%0)\n\t"                    \
                      "movdqu %%xmm9,  1*16(%0)\n\t"                    \
                      "movdqu %%xmm10, 2*16(%0)\n\t"                    \
                      "movdqu %%xmm11, 3*16(%0)\n\t"                    \
                      "movdqu %%xmm12, 4*16(%0)\n\t"                    \
                      "movdqu %%xmm13, 5*16(%0)\n\t"                    \
                      "movdqu %%xmm14, 6*16(%0)\n\t"                    \
                      "movdqu %%xmm15, 7*16(%0)\n\t"                    \
                      :                                                 \
                      : "r" (win64tmp8_15)                              \
                      : "memory");                                      \
   } while (0)
# define aesni_cleanup()                                                \
   do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                      "pxor %%xmm1, %%xmm1\n" :: );                     \
   } while (0)
# define aesni_cleanup_2_7()                                            \
   do { asm volatile ("movdqu %0,   %%xmm6\n\t"                         \
		      "movdqu %1,   %%xmm7\n\t"                         \
                      "pxor %%xmm2, %%xmm2\n"                           \
                      "pxor %%xmm3, %%xmm3\n"                           \
                      "pxor %%xmm4, %%xmm4\n"                           \
                      "pxor %%xmm5, %%xmm5\n"                           \
                      :                                                 \
                      : "m" (*win64tmp), "m" (*(win64tmp+16))           \
                      : "memory");                                      \
   } while (0)
# define aesni_cleanup_8_15()                                           \
   do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t"                     \
                      "movdqu 1*16(%0), %%xmm9\n\t"                     \
                      "movdqu 2*16(%0), %%xmm10\n\t"                    \
                      "movdqu 3*16(%0), %%xmm11\n\t"                    \
                      "movdqu 4*16(%0), %%xmm12\n\t"                    \
                      "movdqu 5*16(%0), %%xmm13\n\t"                    \
                      "movdqu 6*16(%0), %%xmm14\n\t"                    \
                      "movdqu 7*16(%0), %%xmm15\n\t"                    \
                      :                                                 \
                      : "r" (win64tmp8_15)                              \
                      : "memory");                                      \
   } while (0)
#else
/* On non-WIN64 ABIs no XMM register is callee-saved, so the prepare
   macros are no-ops and cleanup simply zeroizes the registers used.  */
# define aesni_prepare_2_7_variable
# define aesni_prepare() do { } while (0)
# define aesni_prepare_2_7() do { } while (0)
# define aesni_cleanup()                                                \
   do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                      "pxor %%xmm1, %%xmm1\n" :: );                     \
   } while (0)
# define aesni_cleanup_2_7()                                            \
   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
                      "pxor %%xmm2, %%xmm2\n\t"                         \
                      "pxor %%xmm3, %%xmm3\n"                           \
                      "pxor %%xmm4, %%xmm4\n"                           \
                      "pxor %%xmm5, %%xmm5\n"                           \
                      "pxor %%xmm6, %%xmm6\n":: );                      \
   } while (0)
# ifdef __x86_64__
#  define aesni_prepare_8_15_variable
#  define aesni_prepare_8_15() do { } while (0)
#  define aesni_cleanup_8_15()                                          \
   do { asm volatile ("pxor %%xmm8, %%xmm8\n"                           \
                      "pxor %%xmm9, %%xmm9\n"                           \
                      "pxor %%xmm10, %%xmm10\n"                         \
                      "pxor %%xmm11, %%xmm11\n"                         \
                      "pxor %%xmm12, %%xmm12\n"                         \
                      "pxor %%xmm13, %%xmm13\n"                         \
                      "pxor %%xmm14, %%xmm14\n"                         \
                      "pxor %%xmm15, %%xmm15\n":: );                    \
   } while (0)
# endif
#endif
168 
/* Expand the raw KEY into the encryption key schedule ctx->keyschenc
   using the AESKEYGENASSIST instruction.  ctx->rounds selects the
   variant: <12 => AES-128, ==12 => AES-192, >12 => AES-256.  The
   AESKEYGENASSIST opcodes are emitted as raw .byte sequences so that
   assemblers without AES-NI mnemonic support can still build this
   file.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
{
  aesni_prepare_2_7_variable;

  aesni_prepare();
  aesni_prepare_2_7();

  if (ctx->rounds < 12)
    {
      /* 128-bit key */
#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
/* One round of the AES-128 key expansion: xmm2 holds the
   AESKEYGENASSIST output (round constant already applied), xmm1 the
   previous round key; produces the next round key in xmm1.  */
#define AESKEY_EXPAND128 \
	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm1, %%xmm3\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm1\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm1\n\t" \
	"pslldq $4, %%xmm3\n\t" \
	"pxor   %%xmm3, %%xmm2\n\t" \
	"pxor   %%xmm2, %%xmm1\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key   */
                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x01)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x02)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x04)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x08)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x10)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x20)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x40)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x80)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x1b)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x36)
                    AESKEY_EXPAND128
                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm1_xmm2
#undef AESKEY_EXPAND128
    }
  else if (ctx->rounds == 12)
    {
      /* 192-bit key */
#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
/* One double-step of the AES-192 expansion: updates the 128-bit half
   in xmm1 and the 64-bit half in xmm3; the shufpd sequences in the asm
   below recombine them into 128-bit round-key slots.  */
#define AESKEY_EXPAND192 \
	"pshufd $0x55, %%xmm2, %%xmm2\n\t" \
	"movdqu %%xmm1, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pxor %%xmm2, %%xmm1\n\t" \
	"pshufd $0xff, %%xmm1, %%xmm2\n\t" \
	"movdqu %%xmm3, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pxor %%xmm2, %%xmm3\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
                    "movq 16(%[key]), %%xmm3\n\t"     /* xmm3 := key[16..23]  */
                    "movdqa %%xmm1, (%[ksch])\n\t"    /* ksch[0] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x01)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x02)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x04)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x08)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x10)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x20)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1  */
                    "movdqa %%xmm3, %%xmm5\n\t"

                    AESKEYGENASSIST_xmm3_xmm2(0x40)
                    AESKEY_EXPAND192
                    "shufpd $0, %%xmm1, %%xmm5\n\t"
                    "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5  */
                    "movdqa %%xmm1, %%xmm6\n\t"
                    "shufpd $1, %%xmm3, %%xmm6\n\t"
                    "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6  */
                    AESKEYGENASSIST_xmm3_xmm2(0x80)
                    AESKEY_EXPAND192
                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm3_xmm2
#undef AESKEY_EXPAND192
    }
  else if (ctx->rounds > 12)
    {
      /* 256-bit key */
#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
	".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
/* AES-256 expansion step A: derive the next even round key in xmm1
   from the assist word in xmm2.  */
#define AESKEY_EXPAND256_A \
	"pshufd $0xff, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm1, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm1\n\t" \
	"pxor %%xmm2, %%xmm1\n\t"
/* AES-256 expansion step B: derive the next odd round key in xmm3
   (uses the SubWord-only assist output, selected via pshufd $0xaa).  */
#define AESKEY_EXPAND256_B \
	"pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
	"movdqa %%xmm3, %%xmm4\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pslldq $4, %%xmm4\n\t" \
	"pxor %%xmm4, %%xmm3\n\t" \
	"pxor %%xmm2, %%xmm3\n\t"

      asm volatile ("movdqu (%[key]), %%xmm1\n\t"     /* xmm1 := key[0..15]   */
                    "movdqu 16(%[key]), %%xmm3\n\t"   /* xmm3 := key[16..31]  */
                    "movdqa %%xmm1, (%[ksch])\n\t"     /* ksch[0] := xmm1  */
                    "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x01)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x02)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x04)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x08)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x10)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x20)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1  */
                    AESKEYGENASSIST_xmm1_xmm2(0x00)
                    AESKEY_EXPAND256_B
                    "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3  */

                    AESKEYGENASSIST_xmm3_xmm2(0x40)
                    AESKEY_EXPAND256_A
                    "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1  */

                    :
                    : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
                    : "cc", "memory" );
#undef AESKEYGENASSIST_xmm1_xmm2
#undef AESKEYGENASSIST_xmm3_xmm2
#undef AESKEY_EXPAND256_A
#undef AESKEY_EXPAND256_B
    }

  aesni_cleanup();
  aesni_cleanup_2_7();
}
400 
401 
/* Make a decryption key schedule (ctx->keyschdec) from the encryption
   key schedule (ctx->keyschenc).  Caller must wrap this in
   aesni_prepare()/aesni_cleanup() since xmm1 is clobbered.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_prepare_decryption (RIJNDAEL_context *ctx)
{
  /* The AES-NI decrypt instructions use the Equivalent Inverse
     Cipher, thus we can't use the standard decrypt key
     preparation.  */
  u128_t *ekey = (u128_t *)ctx->keyschenc;
  u128_t *dkey = (u128_t *)ctx->keyschdec;
  int rr;  /* Index into EKEY, counting down from the last round.  */
  int r;   /* Index into DKEY, counting up from 1.  */

  /* DKEY[r] := InvMixColumns(EKEY[rr]).  The AESIMC instruction
     ("aesimc %xmm1, %xmm1") is emitted as raw bytes so assemblers
     without AES-NI mnemonics can still build this file.  */
#define DO_AESNI_AESIMC() \
  asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
                /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
                ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
                "movdqa %%xmm1, %[dkey]" \
                : [dkey] "=m" (dkey[r]) \
                : [ekey] "m" (ekey[rr]) \
                : "memory")

  /* First and last round keys are used as-is; the inner
     ctx->rounds - 1 keys pass through AESIMC in reverse order.  */
  dkey[0] = ekey[ctx->rounds];
  r=1;
  rr=ctx->rounds-1;
  DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
  DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
  if (ctx->rounds > 10)
    {
      DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
      DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
      if (ctx->rounds > 12)
        {
          DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
          DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
        }
    }

  dkey[r] = ekey[0];

#undef DO_AESNI_AESIMC
}
450 
/* Public entry point: build the decryption key schedule, with the
   SSE save/clear protocol wrapped around the actual work so no key
   material is left behind in xmm registers.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
{
  aesni_prepare();
  do_aesni_prepare_decryption (ctx);
  aesni_cleanup();
}
458 
459 
/* Encrypt one block using the Intel AES-NI instructions.  Block is input
 * and output through SSE register xmm0; xmm1 is clobbered as the
 * round-key scratch register.  The AESENC/AESENCLAST opcodes are raw
 * .byte sequences so old assemblers can build this file.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_enc (const RIJNDAEL_context *ctx)
{
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
  /* Rounds 1-9 are unconditional; the two cmpl/jz pairs skip the extra
     rounds for 128-bit (10 rounds) and 192-bit (12 rounds) keys, while
     256-bit keys (14 rounds) fall through the whole sequence.  */
  asm volatile ("movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                "\n"
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
511 
512 
/* Decrypt one block using the Intel AES-NI instructions.  Block is input
 * and output through SSE register xmm0; xmm1 is clobbered as the
 * round-key scratch register.  Uses ctx->keyschdec, which must have been
 * prepared by do_aesni_prepare_decryption() beforehand.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_dec (const RIJNDAEL_context *ctx)
{
#define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
#define aesdeclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
  /* Same round-count branching as do_aesni_enc: 9 unconditional rounds,
     then early exit for 10- and 12-round key sizes.  */
  asm volatile ("movdqa (%[key]), %%xmm1\n\t"
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"
                "jz .Ldeclast%=\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesdec_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Ldeclast%=:\n\t"
                aesdeclast_xmm1_xmm0
                "\n"
                :
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesdec_xmm1_xmm0
#undef aesdeclast_xmm1_xmm0
}
564 
565 
566 /* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
567  * and output through SSE registers xmm1 to xmm4.  */
568 static ASM_FUNC_ATTR_INLINE void
do_aesni_enc_vec4(const RIJNDAEL_context * ctx)569 do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
570 {
571 #define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
572 #define aesenc_xmm0_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
573 #define aesenc_xmm0_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
574 #define aesenc_xmm0_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
575 #define aesenclast_xmm0_xmm1  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
576 #define aesenclast_xmm0_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
577 #define aesenclast_xmm0_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
578 #define aesenclast_xmm0_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
579   asm volatile ("movdqa (%[key]), %%xmm0\n\t"
580                 "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
581                 "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
582                 "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
583                 "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
584                 "movdqa 0x10(%[key]), %%xmm0\n\t"
585                 aesenc_xmm0_xmm1
586                 aesenc_xmm0_xmm2
587                 aesenc_xmm0_xmm3
588                 aesenc_xmm0_xmm4
589                 "movdqa 0x20(%[key]), %%xmm0\n\t"
590                 aesenc_xmm0_xmm1
591                 aesenc_xmm0_xmm2
592                 aesenc_xmm0_xmm3
593                 aesenc_xmm0_xmm4
594                 "movdqa 0x30(%[key]), %%xmm0\n\t"
595                 aesenc_xmm0_xmm1
596                 aesenc_xmm0_xmm2
597                 aesenc_xmm0_xmm3
598                 aesenc_xmm0_xmm4
599                 "movdqa 0x40(%[key]), %%xmm0\n\t"
600                 aesenc_xmm0_xmm1
601                 aesenc_xmm0_xmm2
602                 aesenc_xmm0_xmm3
603                 aesenc_xmm0_xmm4
604                 "movdqa 0x50(%[key]), %%xmm0\n\t"
605                 aesenc_xmm0_xmm1
606                 aesenc_xmm0_xmm2
607                 aesenc_xmm0_xmm3
608                 aesenc_xmm0_xmm4
609                 "movdqa 0x60(%[key]), %%xmm0\n\t"
610                 aesenc_xmm0_xmm1
611                 aesenc_xmm0_xmm2
612                 aesenc_xmm0_xmm3
613                 aesenc_xmm0_xmm4
614                 "movdqa 0x70(%[key]), %%xmm0\n\t"
615                 aesenc_xmm0_xmm1
616                 aesenc_xmm0_xmm2
617                 aesenc_xmm0_xmm3
618                 aesenc_xmm0_xmm4
619                 "movdqa 0x80(%[key]), %%xmm0\n\t"
620                 aesenc_xmm0_xmm1
621                 aesenc_xmm0_xmm2
622                 aesenc_xmm0_xmm3
623                 aesenc_xmm0_xmm4
624                 "movdqa 0x90(%[key]), %%xmm0\n\t"
625                 aesenc_xmm0_xmm1
626                 aesenc_xmm0_xmm2
627                 aesenc_xmm0_xmm3
628                 aesenc_xmm0_xmm4
629                 "movdqa 0xa0(%[key]), %%xmm0\n\t"
630                 "cmpl $10, %[rounds]\n\t"
631                 "jz .Ldeclast%=\n\t"
632                 aesenc_xmm0_xmm1
633                 aesenc_xmm0_xmm2
634                 aesenc_xmm0_xmm3
635                 aesenc_xmm0_xmm4
636                 "movdqa 0xb0(%[key]), %%xmm0\n\t"
637                 aesenc_xmm0_xmm1
638                 aesenc_xmm0_xmm2
639                 aesenc_xmm0_xmm3
640                 aesenc_xmm0_xmm4
641                 "movdqa 0xc0(%[key]), %%xmm0\n\t"
642                 "cmpl $12, %[rounds]\n\t"
643                 "jz .Ldeclast%=\n\t"
644                 aesenc_xmm0_xmm1
645                 aesenc_xmm0_xmm2
646                 aesenc_xmm0_xmm3
647                 aesenc_xmm0_xmm4
648                 "movdqa 0xd0(%[key]), %%xmm0\n\t"
649                 aesenc_xmm0_xmm1
650                 aesenc_xmm0_xmm2
651                 aesenc_xmm0_xmm3
652                 aesenc_xmm0_xmm4
653                 "movdqa 0xe0(%[key]), %%xmm0\n"
654 
655                 ".Ldeclast%=:\n\t"
656                 aesenclast_xmm0_xmm1
657                 aesenclast_xmm0_xmm2
658                 aesenclast_xmm0_xmm3
659                 aesenclast_xmm0_xmm4
660                 : /* no output */
661                 : [key] "r" (ctx->keyschenc),
662                   [rounds] "r" (ctx->rounds)
663                 : "cc", "memory");
664 #undef aesenc_xmm0_xmm1
665 #undef aesenc_xmm0_xmm2
666 #undef aesenc_xmm0_xmm3
667 #undef aesenc_xmm0_xmm4
668 #undef aesenclast_xmm0_xmm1
669 #undef aesenclast_xmm0_xmm2
670 #undef aesenclast_xmm0_xmm3
671 #undef aesenclast_xmm0_xmm4
672 }
673 
674 
/* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
 * and output through SSE registers xmm1 to xmm4.  xmm0 is clobbered (used as
 * round-key scratch).  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
{
/* aesdec/aesdeclast are emitted as raw opcode bytes so that this file also
 * assembles with toolchains that predate AES-NI support.  */
#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
  /* Nine full rounds are always executed; 0, 2 or 4 additional rounds are
   * taken depending on ctx->rounds (10/12/14 for AES-128/192/256).  */
  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
                "movdqa 0x10(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "cmpl $10, %[rounds]\n\t"       /* AES-128: last round now.  */
                "jz .Ldeclast%=\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "cmpl $12, %[rounds]\n\t"       /* AES-192: last round now.  */
                "jz .Ldeclast%=\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                aesdec_xmm0_xmm1
                aesdec_xmm0_xmm2
                aesdec_xmm0_xmm3
                aesdec_xmm0_xmm4
                "movdqa 0xe0(%[key]), %%xmm0\n"

                ".Ldeclast%=:\n\t"
                aesdeclast_xmm0_xmm1
                aesdeclast_xmm0_xmm2
                aesdeclast_xmm0_xmm3
                aesdeclast_xmm0_xmm4
                : /* no output */
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
#undef aesdec_xmm0_xmm1
#undef aesdec_xmm0_xmm2
#undef aesdec_xmm0_xmm3
#undef aesdec_xmm0_xmm4
#undef aesdeclast_xmm0_xmm1
#undef aesdeclast_xmm0_xmm2
#undef aesdeclast_xmm0_xmm3
#undef aesdeclast_xmm0_xmm4
}
782 
783 
784 #ifdef __x86_64__
785 
786 /* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
787  * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
788 static ASM_FUNC_ATTR_INLINE void
do_aesni_enc_vec8(const RIJNDAEL_context * ctx)789 do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
790 {
791   asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
792                 "aesenc %%xmm0, %%xmm1\n\t"
793                 "aesenc %%xmm0, %%xmm2\n\t"
794                 "aesenc %%xmm0, %%xmm3\n\t"
795                 "aesenc %%xmm0, %%xmm4\n\t"
796                 "aesenc %%xmm0, %%xmm8\n\t"
797                 "aesenc %%xmm0, %%xmm9\n\t"
798                 "aesenc %%xmm0, %%xmm10\n\t"
799                 "aesenc %%xmm0, %%xmm11\n\t"
800                 "movdqa 0x20(%[key]), %%xmm0\n\t"
801                 "cmpl $12, %[rounds]\n\t"
802                 "aesenc %%xmm0, %%xmm1\n\t"
803                 "aesenc %%xmm0, %%xmm2\n\t"
804                 "aesenc %%xmm0, %%xmm3\n\t"
805                 "aesenc %%xmm0, %%xmm4\n\t"
806                 "aesenc %%xmm0, %%xmm8\n\t"
807                 "aesenc %%xmm0, %%xmm9\n\t"
808                 "aesenc %%xmm0, %%xmm10\n\t"
809                 "aesenc %%xmm0, %%xmm11\n\t"
810                 "movdqa 0x30(%[key]), %%xmm0\n\t"
811                 "aesenc %%xmm0, %%xmm1\n\t"
812                 "aesenc %%xmm0, %%xmm2\n\t"
813                 "aesenc %%xmm0, %%xmm3\n\t"
814                 "aesenc %%xmm0, %%xmm4\n\t"
815                 "aesenc %%xmm0, %%xmm8\n\t"
816                 "aesenc %%xmm0, %%xmm9\n\t"
817                 "aesenc %%xmm0, %%xmm10\n\t"
818                 "aesenc %%xmm0, %%xmm11\n\t"
819                 "movdqa 0x40(%[key]), %%xmm0\n\t"
820                 "aesenc %%xmm0, %%xmm1\n\t"
821                 "aesenc %%xmm0, %%xmm2\n\t"
822                 "aesenc %%xmm0, %%xmm3\n\t"
823                 "aesenc %%xmm0, %%xmm4\n\t"
824                 "aesenc %%xmm0, %%xmm8\n\t"
825                 "aesenc %%xmm0, %%xmm9\n\t"
826                 "aesenc %%xmm0, %%xmm10\n\t"
827                 "aesenc %%xmm0, %%xmm11\n\t"
828                 "movdqa 0x50(%[key]), %%xmm0\n\t"
829                 "aesenc %%xmm0, %%xmm1\n\t"
830                 "aesenc %%xmm0, %%xmm2\n\t"
831                 "aesenc %%xmm0, %%xmm3\n\t"
832                 "aesenc %%xmm0, %%xmm4\n\t"
833                 "aesenc %%xmm0, %%xmm8\n\t"
834                 "aesenc %%xmm0, %%xmm9\n\t"
835                 "aesenc %%xmm0, %%xmm10\n\t"
836                 "aesenc %%xmm0, %%xmm11\n\t"
837                 "movdqa 0x60(%[key]), %%xmm0\n\t"
838                 "aesenc %%xmm0, %%xmm1\n\t"
839                 "aesenc %%xmm0, %%xmm2\n\t"
840                 "aesenc %%xmm0, %%xmm3\n\t"
841                 "aesenc %%xmm0, %%xmm4\n\t"
842                 "aesenc %%xmm0, %%xmm8\n\t"
843                 "aesenc %%xmm0, %%xmm9\n\t"
844                 "aesenc %%xmm0, %%xmm10\n\t"
845                 "aesenc %%xmm0, %%xmm11\n\t"
846                 "movdqa 0x70(%[key]), %%xmm0\n\t"
847                 "aesenc %%xmm0, %%xmm1\n\t"
848                 "aesenc %%xmm0, %%xmm2\n\t"
849                 "aesenc %%xmm0, %%xmm3\n\t"
850                 "aesenc %%xmm0, %%xmm4\n\t"
851                 "aesenc %%xmm0, %%xmm8\n\t"
852                 "aesenc %%xmm0, %%xmm9\n\t"
853                 "aesenc %%xmm0, %%xmm10\n\t"
854                 "aesenc %%xmm0, %%xmm11\n\t"
855                 "movdqa 0x80(%[key]), %%xmm0\n\t"
856                 "aesenc %%xmm0, %%xmm1\n\t"
857                 "aesenc %%xmm0, %%xmm2\n\t"
858                 "aesenc %%xmm0, %%xmm3\n\t"
859                 "aesenc %%xmm0, %%xmm4\n\t"
860                 "aesenc %%xmm0, %%xmm8\n\t"
861                 "aesenc %%xmm0, %%xmm9\n\t"
862                 "aesenc %%xmm0, %%xmm10\n\t"
863                 "aesenc %%xmm0, %%xmm11\n\t"
864                 "movdqa 0x90(%[key]), %%xmm0\n\t"
865                 "aesenc %%xmm0, %%xmm1\n\t"
866                 "aesenc %%xmm0, %%xmm2\n\t"
867                 "aesenc %%xmm0, %%xmm3\n\t"
868                 "aesenc %%xmm0, %%xmm4\n\t"
869                 "aesenc %%xmm0, %%xmm8\n\t"
870                 "aesenc %%xmm0, %%xmm9\n\t"
871                 "aesenc %%xmm0, %%xmm10\n\t"
872                 "aesenc %%xmm0, %%xmm11\n\t"
873                 "movdqa 0xa0(%[key]), %%xmm0\n\t"
874                 "jb .Ldeclast%=\n\t"
875                 "aesenc %%xmm0, %%xmm1\n\t"
876                 "aesenc %%xmm0, %%xmm2\n\t"
877                 "aesenc %%xmm0, %%xmm3\n\t"
878                 "aesenc %%xmm0, %%xmm4\n\t"
879                 "aesenc %%xmm0, %%xmm8\n\t"
880                 "aesenc %%xmm0, %%xmm9\n\t"
881                 "aesenc %%xmm0, %%xmm10\n\t"
882                 "aesenc %%xmm0, %%xmm11\n\t"
883                 "movdqa 0xb0(%[key]), %%xmm0\n\t"
884                 "aesenc %%xmm0, %%xmm1\n\t"
885                 "aesenc %%xmm0, %%xmm2\n\t"
886                 "aesenc %%xmm0, %%xmm3\n\t"
887                 "aesenc %%xmm0, %%xmm4\n\t"
888                 "aesenc %%xmm0, %%xmm8\n\t"
889                 "aesenc %%xmm0, %%xmm9\n\t"
890                 "aesenc %%xmm0, %%xmm10\n\t"
891                 "aesenc %%xmm0, %%xmm11\n\t"
892                 "movdqa 0xc0(%[key]), %%xmm0\n\t"
893                 "je .Ldeclast%=\n\t"
894                 "aesenc %%xmm0, %%xmm1\n\t"
895                 "aesenc %%xmm0, %%xmm2\n\t"
896                 "aesenc %%xmm0, %%xmm3\n\t"
897                 "aesenc %%xmm0, %%xmm4\n\t"
898                 "aesenc %%xmm0, %%xmm8\n\t"
899                 "aesenc %%xmm0, %%xmm9\n\t"
900                 "aesenc %%xmm0, %%xmm10\n\t"
901                 "aesenc %%xmm0, %%xmm11\n\t"
902                 "movdqa 0xd0(%[key]), %%xmm0\n\t"
903                 "aesenc %%xmm0, %%xmm1\n\t"
904                 "aesenc %%xmm0, %%xmm2\n\t"
905                 "aesenc %%xmm0, %%xmm3\n\t"
906                 "aesenc %%xmm0, %%xmm4\n\t"
907                 "aesenc %%xmm0, %%xmm8\n\t"
908                 "aesenc %%xmm0, %%xmm9\n\t"
909                 "aesenc %%xmm0, %%xmm10\n\t"
910                 "aesenc %%xmm0, %%xmm11\n\t"
911                 "movdqa 0xe0(%[key]), %%xmm0\n"
912 
913                 ".Ldeclast%=:\n\t"
914                 : /* no output */
915                 : [key] "r" (ctx->keyschenc),
916                   [rounds] "r" (ctx->rounds)
917                 : "cc", "memory");
918 }
919 
920 
/* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
 * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.
 *
 * This starts with round key 1: the caller is expected to have XORed round
 * key 0 (keyschdec[0]) into the eight block registers already.  On return
 * xmm0 holds the final round key; the caller performs the concluding
 * aesdeclast itself.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
{
  /* The "cmpl $12" is issued early and its EFLAGS result consumed much later
   * by the jb/je below; this is safe because neither movdqa nor aesdec
   * modifies the flags.  */
  asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
                "cmpl $12, %[rounds]\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x20(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x30(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x40(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x50(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x60(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x70(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x80(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0x90(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xa0(%[key]), %%xmm0\n\t"
                "jb .Ldeclast%=\n\t"            /* rounds < 12: AES-128.  */
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xb0(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xc0(%[key]), %%xmm0\n\t"
                "je .Ldeclast%=\n\t"            /* rounds == 12: AES-192.  */
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xd0(%[key]), %%xmm0\n\t"
                "aesdec %%xmm0, %%xmm1\n\t"
                "aesdec %%xmm0, %%xmm2\n\t"
                "aesdec %%xmm0, %%xmm3\n\t"
                "aesdec %%xmm0, %%xmm4\n\t"
                "aesdec %%xmm0, %%xmm8\n\t"
                "aesdec %%xmm0, %%xmm9\n\t"
                "aesdec %%xmm0, %%xmm10\n\t"
                "aesdec %%xmm0, %%xmm11\n\t"
                "movdqa 0xe0(%[key]), %%xmm0\n"

                ".Ldeclast%=:\n\t"
                : /* no output */
                : [key] "r" (ctx->keyschdec),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");
}
1054 
1055 #endif /* __x86_64__ */
1056 
1057 
/* Perform a CTR encryption round using the counter CTR and the input
   block A.  Write the result to the output block B and update CTR.
   CTR needs to be a 16 byte aligned little-endian value.
   NOTE(review): on entry xmm5 appears to hold the counter and xmm6 the
   byte-swap mask preloaded by the caller (both are read below without
   being set here) -- confirm against the callers.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr (const RIJNDAEL_context *ctx,
              unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
/* Raw opcode bytes so the file assembles without AES-NI assembler support. */
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"

  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5)  */
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */

                "pshufb %%xmm6, %%xmm5\n\t"
                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++ (big endian) */

                /* detect if 64-bit carry handling is needed */
                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
                "jne    .Lno_carry%=\n\t"
                "cmpl   $0xffffffff, 12(%[ctr])\n\t"
                "jne    .Lno_carry%=\n\t"

                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                "psubq   %%xmm1, %%xmm5\n\t"    /* add carry to upper 64bits */

                ".Lno_carry%=:\n\t"

                "pshufb %%xmm6, %%xmm5\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).       */

                "pxor (%[key]), %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"       /* AES-128: last round now.  */
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"       /* AES-192: last round now.  */
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                "movdqu %[src], %%xmm1\n\t"      /* xmm1 := input   */
                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR ^= input  */
                "movdqu %%xmm0, %[dst]"          /* Store EncCTR.    */

                : [dst] "=m" (*b)
                : [src] "m" (*a),
                  [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [rounds] "g" (ctx->rounds)
                : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
1137 
1138 
/* Four blocks at a time variant of do_aesni_ctr.  As there, the counter is
 * expected in xmm5 and the byte-swap mask in xmm6 on entry (see the
 * register-usage table below).  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
  /* Big-endian byte-add constants: adding be(i) with paddb increments the
   * counter by i as long as the low byte does not wrap (fast path).  */
  static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) =
    {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
    };
  const void *bige_addb = bige_addb_const;
/* Raw opcode bytes so the file assembles without AES-NI assembler support. */
#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenc_xmm1_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
#define aesenc_xmm1_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
#define aesenc_xmm1_xmm4      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
#define aesenclast_xmm1_xmm2  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
#define aesenclast_xmm1_xmm3  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
#define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"

  /* Register usage:
      [key] keyschedule
      xmm0  CTR-0
      xmm1  temp / round key
      xmm2  CTR-1
      xmm3  CTR-2
      xmm4  CTR-3
      xmm5  copy of *ctr
      xmm6  endian swapping mask
   */

  /* First asm block: derive the four counter values CTR..CTR+3 and update
   * *ctr to CTR+4, taking the slow path only when the low counter byte
   * wraps (8-bit carry).  */
  asm volatile (/* detect if 8-bit carry handling is needed */
                "addb   $4, 15(%[ctr])\n\t"
                "jc     .Ladd32bit%=\n\t"

                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
                "movdqa 3*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(4) */
                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(4) + CTR (xmm0) */
                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
                "jmp    .Ldone_ctr%=\n\t"

                ".Ladd32bit%=:\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR.  */
                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                "movdqa %%xmm0, %%xmm2\n\t"
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
                "movdqa %%xmm4, %%xmm5\n\t"     /* xmm5 := xmm4     */
                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */

                /* detect if 64-bit carry handling is needed */
                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
                "jne    .Lno_carry%=\n\t"
                "movl   12(%[ctr]), %%esi\n\t"
                "bswapl %%esi\n\t"
                "cmpl   $0xfffffffc, %%esi\n\t"
                "jb     .Lno_carry%=\n\t"       /* no carry */

                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffffc */
                "cmpl   $0xfffffffe, %%esi\n\t"
                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
                /* esi == 0xffffffff */

                /* Fall-through chain: each entry point propagates the carry
                 * into that counter and all later ones.  */
                "psubq   %%xmm1, %%xmm2\n\t"
                ".Lcarry_xmm3%=:\n\t"
                "psubq   %%xmm1, %%xmm3\n\t"
                ".Lcarry_xmm4%=:\n\t"
                "psubq   %%xmm1, %%xmm4\n\t"
                ".Lcarry_xmm5%=:\n\t"
                "psubq   %%xmm1, %%xmm5\n\t"

                ".Lno_carry%=:\n\t"
                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */

                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */

                ".Ldone_ctr%=:\n\t"
                :
                : [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [addb] "r" (bige_addb)
                : "%esi", "cc", "memory");

  /* Second asm block: encrypt the four counters in parallel; round count
   * handling matches the other enc/dec helpers (10/12/14 rounds).  */
  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
                "movdqa 0x10(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x20(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "cmpl $10, %[rounds]\n\t"       /* AES-128: last round now.  */
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "cmpl $12, %[rounds]\n\t"       /* AES-192: last round now.  */
                "jz .Lenclast%=\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                aesenc_xmm1_xmm0
                aesenc_xmm1_xmm2
                aesenc_xmm1_xmm3
                aesenc_xmm1_xmm4
                "movdqa 0xe0(%[key]), %%xmm1\n"

                ".Lenclast%=:\n\t"
                aesenclast_xmm1_xmm0
                aesenclast_xmm1_xmm2
                aesenclast_xmm1_xmm3
                aesenclast_xmm1_xmm4
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds)
                : "cc", "memory");

  /* Third asm block: XOR the four keystream blocks with the input and store
   * the result (unaligned loads/stores for src/dst).  */
  asm volatile ("movdqu (%[src]), %%xmm1\n\t"    /* Get block 1.      */
                "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR-1 ^= input */
                "movdqu %%xmm0, (%[dst])\n\t"    /* Store block 1     */

                "movdqu 16(%[src]), %%xmm1\n\t"  /* Get block 2.      */
                "pxor %%xmm1, %%xmm2\n\t"        /* EncCTR-2 ^= input */
                "movdqu %%xmm2, 16(%[dst])\n\t"  /* Store block 2.    */

                "movdqu 32(%[src]), %%xmm1\n\t"  /* Get block 3.      */
                "pxor %%xmm1, %%xmm3\n\t"        /* EncCTR-3 ^= input */
                "movdqu %%xmm3, 32(%[dst])\n\t"  /* Store block 3.    */

                "movdqu 48(%[src]), %%xmm1\n\t"  /* Get block 4.      */
                "pxor %%xmm1, %%xmm4\n\t"        /* EncCTR-4 ^= input */
                "movdqu %%xmm4, 48(%[dst])"      /* Store block 4.   */
                :
                : [src] "r" (a),
                  [dst] "r" (b)
                : "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
#undef aesenc_xmm1_xmm3
#undef aesenc_xmm1_xmm4
#undef aesenclast_xmm1_xmm0
#undef aesenclast_xmm1_xmm2
#undef aesenclast_xmm1_xmm3
#undef aesenclast_xmm1_xmm4
}
1357 
1358 
1359 #ifdef __x86_64__
1360 
/* Eight blocks at a time variant of do_aesni_ctr.  Reads eight 16-byte
 * plaintext blocks from A, writes the eight CTR-mode ciphertext blocks
 * to B, and advances the big-endian 128-bit counter at CTR by 8.
 *
 * Register protocol (set up by _gcry_aes_aesni_ctr_enc before the
 * loop): on entry xmm5 holds the current counter block and xmm6 the
 * big-endian byte-swap mask; xmm5 again holds the updated counter on
 * exit, ready for the next call.  NOTE(review): CTR is accessed with
 * movdqa, i.e. the counter buffer is assumed 16-byte aligned --
 * confirm at the caller.  */
static ASM_FUNC_ATTR_INLINE void
do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
                unsigned char *ctr, unsigned char *b, const unsigned char *a)
{
  /* Big-endian byte addends: "CTR + be(i)" computed with paddb is
     correct as long as the low counter byte does not wrap; the wrap
     case is detected below and handled on the generic carry path.  */
  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
    {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
    };
  const void *bige_addb = bige_addb_const;

  /* Register usage:
      [key] keyschedule
      xmm0  CTR-0
      xmm1  temp / round key
      xmm2  CTR-1
      xmm3  CTR-2
      xmm4  CTR-3
      xmm5  copy of *ctr
      xmm6  endian swapping mask
      xmm8  CTR-4
      xmm9  CTR-5
      xmm10 CTR-6
      xmm11 CTR-7
      xmm12 temp
      xmm13 temp
      xmm14 temp
      xmm15 temp
   */

  /* Pass 1: derive the eight counter blocks CTR..CTR+7 and run AES
     rounds 0 (AddRoundKey) and 1 on all of them.  The fast path uses
     bytewise adds; note that the "addb" below already updates the low
     counter byte in memory, so the fast path need not store the
     counter back -- xmm5 carries the full CTR+8 for the next call.  */
  asm volatile (/* detect if 8-bit carry handling is needed */
                "addb   $8, 15(%[ctr])\n\t"
                "jc     .Ladd32bit%=\n\t"

                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
                "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */

                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm2\n\t"     /* xmm2 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm3\n\t"     /* xmm3 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm4\n\t"     /* xmm4 := CTR (xmm5) */
                "paddb  0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) + CTR */
                "paddb  1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) + CTR */
                "paddb  2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) + CTR */
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
                "aesenc %%xmm7, %%xmm0\n\t"
                "aesenc %%xmm7, %%xmm2\n\t"
                "aesenc %%xmm7, %%xmm3\n\t"
                "aesenc %%xmm7, %%xmm4\n\t"
                "movdqa %%xmm5, %%xmm8\n\t"     /* xmm8 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm9\n\t"     /* xmm9 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm10\n\t"    /* xmm10 := CTR (xmm5) */
                "movdqa %%xmm5, %%xmm11\n\t"    /* xmm11 := CTR (xmm5) */
                "paddb  3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) + CTR */
                "paddb  4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) + CTR */
                "paddb  5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) + CTR */
                "paddb  6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) + CTR */
                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
                "aesenc %%xmm7, %%xmm8\n\t"
                "aesenc %%xmm7, %%xmm9\n\t"
                "aesenc %%xmm7, %%xmm10\n\t"
                "aesenc %%xmm7, %%xmm11\n\t"

                "paddb  7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) + CTR */

                "jmp    .Ldone_ctr%=\n\t"

                /* Slow path: the low byte wrapped.  Undo the addb,
                   then increment each counter in little-endian 64-bit
                   arithmetic (pshufb to LE, psubq by -1, pshufb back),
                   propagating the carry into the high qword where the
                   low 32 bits overflow.  */
                ".Ladd32bit%=:\n\t"
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR. */
                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                "movdqa %%xmm0, %%xmm2\n\t"
                "pcmpeqd %%xmm1, %%xmm1\n\t"
                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
                "movdqa %%xmm4, %%xmm8\n\t"     /* xmm8 := xmm4     */
                "psubq  %%xmm1, %%xmm8\n\t"     /* xmm8++           */
                "movdqa %%xmm8, %%xmm9\n\t"     /* xmm9 := xmm8     */
                "psubq  %%xmm1, %%xmm9\n\t"     /* xmm9++           */
                "movdqa %%xmm9, %%xmm10\n\t"    /* xmm10 := xmm9    */
                "psubq  %%xmm1, %%xmm10\n\t"    /* xmm10++          */
                "movdqa %%xmm10, %%xmm11\n\t"   /* xmm11 := xmm10   */
                "psubq  %%xmm1, %%xmm11\n\t"    /* xmm11++          */
                "movdqa %%xmm11, %%xmm5\n\t"    /* xmm5 := xmm11    */
                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */

                /* detect if 64-bit carry handling is needed */
                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
                "jne    .Lno_carry%=\n\t"
                "movl   12(%[ctr]), %%esi\n\t"
                "bswapl %%esi\n\t"
                "cmpl   $0xfffffff8, %%esi\n\t"
                "jb     .Lno_carry%=\n\t"       /* no carry */

                /* Dispatch on how many of the eight counters wrap the
                   low 64 bits: fall through the psubq ladder below so
                   exactly the wrapping counters get the high-qword
                   carry (xmm1 == -1 in the high qword after pslldq).  */
                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffff8 */
                "cmpl   $0xfffffffa, %%esi\n\t"
                "jb     .Lcarry_xmm11%=\n\t"     /* esi == 0xfffffff9 */
                "je     .Lcarry_xmm10%=\n\t"     /* esi == 0xfffffffa */
                "cmpl   $0xfffffffc, %%esi\n\t"
                "jb     .Lcarry_xmm9%=\n\t"     /* esi == 0xfffffffb */
                "je     .Lcarry_xmm8%=\n\t"     /* esi == 0xfffffffc */
                "cmpl   $0xfffffffe, %%esi\n\t"
                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
                /* esi == 0xffffffff */

                "psubq   %%xmm1, %%xmm2\n\t"
                ".Lcarry_xmm3%=:\n\t"
                "psubq   %%xmm1, %%xmm3\n\t"
                ".Lcarry_xmm4%=:\n\t"
                "psubq   %%xmm1, %%xmm4\n\t"
                ".Lcarry_xmm8%=:\n\t"
                "psubq   %%xmm1, %%xmm8\n\t"
                ".Lcarry_xmm9%=:\n\t"
                "psubq   %%xmm1, %%xmm9\n\t"
                ".Lcarry_xmm10%=:\n\t"
                "psubq   %%xmm1, %%xmm10\n\t"
                ".Lcarry_xmm11%=:\n\t"
                "psubq   %%xmm1, %%xmm11\n\t"
                ".Lcarry_xmm5%=:\n\t"
                "psubq   %%xmm1, %%xmm5\n\t"

                ".Lno_carry%=:\n\t"
                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
                "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */

                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
                "aesenc %%xmm7, %%xmm0\n\t"
                "aesenc %%xmm7, %%xmm2\n\t"
                "aesenc %%xmm7, %%xmm3\n\t"
                "aesenc %%xmm7, %%xmm4\n\t"
                "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8) */
                "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9) */
                "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
                "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
                "aesenc %%xmm7, %%xmm8\n\t"
                "aesenc %%xmm7, %%xmm9\n\t"
                "aesenc %%xmm7, %%xmm10\n\t"
                "aesenc %%xmm7, %%xmm11\n\t"

                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */

                ".align 16\n\t"
                ".Ldone_ctr%=:\n\t"
                :
                : [ctr] "r" (ctr),
                  [key] "r" (ctx->keyschenc),
                  [addb] "r" (bige_addb)
                : "%esi", "cc", "memory");

  /* Pass 2: AES rounds 2..9 on all eight blocks, with the loads of
     the first five plaintext blocks interleaved to hide memory
     latency.  The flags of "cmpl $12, %[rounds]" steer the jb/je
     below so processing stops after 10, 12 or 14 rounds (AES-128/
     192/256); on exit xmm1 holds the final round key.  */
  asm volatile ("movdqa 0x20(%[key]), %%xmm1\n\t"
                "movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
                "movdqu 4*16(%[src]), %%xmm7\n\t"  /* Get block 5.      */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "cmpl $12, %[rounds]\n\t"
                "movdqa 0x30(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x40(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x50(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x60(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x70(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x80(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0x90(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xa0(%[key]), %%xmm1\n\t"
                "jb .Lenclast%=\n\t"            /* AES-128: 10 rounds */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xb0(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xc0(%[key]), %%xmm1\n\t"
                "je .Lenclast%=\n\t"            /* AES-192: 12 rounds */
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xd0(%[key]), %%xmm1\n\t"
                "aesenc %%xmm1, %%xmm0\n\t"
                "aesenc %%xmm1, %%xmm2\n\t"
                "aesenc %%xmm1, %%xmm3\n\t"
                "aesenc %%xmm1, %%xmm4\n\t"
                "aesenc %%xmm1, %%xmm8\n\t"
                "aesenc %%xmm1, %%xmm9\n\t"
                "aesenc %%xmm1, %%xmm10\n\t"
                "aesenc %%xmm1, %%xmm11\n\t"
                "movdqa 0xe0(%[key]), %%xmm1\n" /* AES-256: 14 rounds */

                ".Lenclast%=:\n\t"
                :
                : [key] "r" (ctx->keyschenc),
                  [rounds] "r" (ctx->rounds),
                  [src] "r" (a)
                : "cc", "memory");

  /* Pass 3: final round.  aesenclast XORs its source operand into the
     result, so feeding it (plaintext ^ last-round-key) applies the
     last AddRoundKey and the CTR keystream XOR in one instruction.  */
  asm volatile ("pxor %%xmm1, %%xmm12\n\t"         /* block1 ^= lastkey */
                "pxor %%xmm1, %%xmm13\n\t"         /* block2 ^= lastkey */
                "pxor %%xmm1, %%xmm14\n\t"         /* block3 ^= lastkey */
                "pxor %%xmm1, %%xmm15\n\t"         /* block4 ^= lastkey */
                "aesenclast %%xmm12, %%xmm0\n\t"
                "aesenclast %%xmm13, %%xmm2\n\t"
                "aesenclast %%xmm14, %%xmm3\n\t"
                "aesenclast %%xmm15, %%xmm4\n\t"
                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1.    */
                "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
                "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
                "pxor %%xmm1, %%xmm7\n\t"          /* block5 ^= lastkey */
                "pxor %%xmm1, %%xmm12\n\t"         /* block6 ^= lastkey */
                "pxor %%xmm1, %%xmm13\n\t"         /* block7 ^= lastkey */
                "pxor %%xmm1, %%xmm14\n\t"         /* block8 ^= lastkey */
                "aesenclast %%xmm7, %%xmm8\n\t"
                "aesenclast %%xmm12, %%xmm9\n\t"
                "aesenclast %%xmm13, %%xmm10\n\t"
                "aesenclast %%xmm14, %%xmm11\n\t"
                "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 5.    */
                "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 6.    */
                "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 7.    */
                "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 8.    */
                :
                : [src] "r" (a),
                  [dst] "r" (b)
                : "memory");
}
1698 
1699 #endif /* __x86_64__ */
1700 
1701 
/* Encrypt the single 16-byte block at SRC into DST with the encryption
   key schedule in CTX.  Always returns 0 (presumably "no extra stack
   burning needed" per the cipher-hook convention -- confirm against
   rijndael.c).  */
unsigned int ASM_FUNC_ATTR
_gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  aesni_prepare ();
  /* Load the plaintext; do_aesni_enc takes its input in xmm0.  */
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_aesni_enc (ctx);
  /* do_aesni_enc leaves the ciphertext in xmm0.  */
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  aesni_cleanup ();
  return 0;
}
1719 
1720 
/* Bulk CFB encryption: for each block, C[i] = E(C[i-1]) ^ P[i] with
   C[-1] = *IV.  Inherently serial (each ciphertext feeds the next
   encryption), hence the plain one-block-at-a-time loop.  *IV is
   updated to the last ciphertext block.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare ();

  /* xmm0 carries the chaining value across loop iterations.  */
  asm volatile ("movdqu %[iv], %%xmm0\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      do_aesni_enc (ctx);

      /* xmm0 = E(chain) ^ plaintext = ciphertext; it is stored and
         also becomes the next iteration's input in xmm0.  */
      asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
}
1755 
1756 
/* Bulk CBC encryption: C[i] = E(P[i] ^ C[i-1]) with C[-1] = *IV.
   Serial by construction.  When CBC_MAC is non-zero, OUTBUF is not
   advanced, so each block overwrites the previous one and only the
   final MAC block remains at OUTBUF.  *IV is updated to the last
   ciphertext block.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks, int cbc_mac)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  /* xmm5 carries the chaining value across loop iterations.  */
  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      /* xmm0 := plaintext ^ chain, the input to do_aesni_enc.  */
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm5, %%xmm0\n\t"
                    : /* No output */
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      /* The ciphertext (xmm0) becomes the next chaining value.  */
      asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      if (!cbc_mac)
        outbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
1801 
1802 
/* Bulk CTR en/decryption (the operation is its own inverse).  The
   big-endian counter at CTR is encrypted, XORed against INBUF and the
   result written to OUTBUF, counting up once per block.  Blocks are
   independent, so the work is done 8-wide (x86-64 only), then 4-wide,
   then one at a time.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  /* pshufb mask reversing byte order, i.e. big-endian <-> little-
     endian conversion of a 128-bit value.  */
  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  /* Register protocol for the do_aesni_ctr* helpers: xmm6 holds the
     byte-swap mask and xmm5 the running counter for the whole loop.
     NOTE(review): both movdqa accesses assume 16-byte alignment of
     be_mask (guaranteed above) and of *ctr.  */
  asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
                "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
                : /* No output */
                : [mask] "m" (*be_mask),
                  [ctr] "m" (*ctr)
                : "memory");

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8 ; nblocks -= 8 )
	{
	  do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
	  outbuf += 8*BLOCKSIZE;
	  inbuf  += 8*BLOCKSIZE;
	}

      aesni_cleanup_8_15();
    }
#endif

  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }
  for ( ;nblocks; nblocks-- )
    {
      do_aesni_ctr (ctx, ctr, outbuf, inbuf);
      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }
  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
1855 
1856 
/* Decrypt the single 16-byte block at SRC into DST using CTX.  Always
   returns 0 (presumably "no extra stack burning needed" per the
   cipher-hook convention -- confirm against rijndael.c).  NOTE(review):
   unlike _gcry_aes_aesni_cbc_dec below, this does not prepare the
   decryption key schedule itself; presumably the generic caller has
   already done so -- verify.  */
unsigned int ASM_FUNC_ATTR
_gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  aesni_prepare ();
  /* Load the ciphertext; do_aesni_dec takes its input in xmm0.  */
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_aesni_dec (ctx);
  /* do_aesni_dec leaves the plaintext in xmm0.  */
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  aesni_cleanup ();
  return 0;
}
1874 
1875 
/* Bulk CFB decryption: P[i] = E(C[i-1]) ^ C[i] with C[-1] = *IV.
   Only the cipher's *encrypt* direction on already-known ciphertext
   blocks is needed, so unlike CFB encryption this parallelizes:
   8-wide (x86-64 only), then 4-wide, then one block at a time.
   *IV is updated to the last ciphertext block.  xmm6 carries the
   chaining ciphertext across the wide loops.  */
void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                         unsigned char *outbuf, const unsigned char *inbuf,
                         size_t nblocks)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7();

  asm volatile ("movdqu %[iv], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  /* CFB decryption can be parallelized */

#ifdef __x86_64__
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8; nblocks -= 8)
	{
	  /* Encryption inputs are IV, C0..C6 (xmm1..xmm4, xmm8..xmm11);
	     xmm12..xmm15 keep raw copies of C0..C3 for the final-round
	     XOR below, and xmm6 is advanced to C7 for the next
	     iteration.  All inputs are pre-whitened with key[0] here,
	     as do_aesni_enc_vec8 expects.  */
	  asm volatile
	    ("movdqa (%[key]), %%xmm0\n\t"

	     "movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
	     "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
	     "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
	     "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
	     "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
	     "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
	     "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
	     "movdqu 6*16(%[inbuf]), %%xmm11\n\t"

	     "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */

	     "movdqa %%xmm2, %%xmm12\n\t"
	     "movdqa %%xmm3, %%xmm13\n\t"
	     "movdqa %%xmm4, %%xmm14\n\t"
	     "movdqa %%xmm8, %%xmm15\n\t"

             "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
             "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
             "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
             "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
             "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
             "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
             "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
             "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
	     : /* No output */
	     : [inbuf] "r" (inbuf),
	       [key] "r" (ctx->keyschenc)
	     : "memory");

	  do_aesni_enc_vec8 (ctx);

	  /* Final round: xmm0 now holds the last round key (left there
	     by do_aesni_enc_vec8); aesenclast with (ciphertext ^
	     lastkey) fuses the last AddRoundKey with the CFB XOR.  */
	  asm volatile
	    (
	     "pxor %%xmm0, %%xmm12\n\t"
	     "pxor %%xmm0, %%xmm13\n\t"
	     "pxor %%xmm0, %%xmm14\n\t"
	     "pxor %%xmm0, %%xmm15\n\t"
	     "aesenclast %%xmm12, %%xmm1\n\t"
	     "aesenclast %%xmm13, %%xmm2\n\t"
	     "aesenclast %%xmm14, %%xmm3\n\t"
	     "aesenclast %%xmm15, %%xmm4\n\t"

	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
	     "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
	     "pxor %%xmm0, %%xmm12\n\t"
	     "pxor %%xmm0, %%xmm13\n\t"
	     "pxor %%xmm0, %%xmm14\n\t"
	     "pxor %%xmm0, %%xmm15\n\t"

	     "aesenclast %%xmm12, %%xmm8\n\t"
	     "aesenclast %%xmm13, %%xmm9\n\t"
	     "aesenclast %%xmm14, %%xmm10\n\t"
	     "aesenclast %%xmm15, %%xmm11\n\t"

	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"

	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"

	     : /* No output */
	     : [inbuf] "r" (inbuf),
	       [outbuf] "r" (outbuf)
	     : "memory");

	  outbuf += 8*BLOCKSIZE;
	  inbuf  += 8*BLOCKSIZE;
	}

      aesni_cleanup_8_15();
    }
#endif

  for ( ;nblocks >= 4; nblocks -= 4)
    {
      /* Encrypt IV, C0, C1, C2 in parallel; C3 becomes the new IV.  */
      asm volatile
        ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
         "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
         "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
         "movdqu 2*16(%[inbuf]), %%xmm4\n\t"

         "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */
         : /* No output */
         : [inbuf] "r" (inbuf)
         : "memory");

      do_aesni_enc_vec4 (ctx);

      /* P[i] = keystream[i] ^ C[i].  */
      asm volatile
        ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm1\n\t"
         "movdqu %%xmm1, 0*16(%[outbuf])\n\t"

         "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm2\n\t"
         "movdqu %%xmm2, 1*16(%[outbuf])\n\t"

         "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm3\n\t"
         "movdqu %%xmm3, 2*16(%[outbuf])\n\t"

         "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
         "pxor %%xmm5, %%xmm4\n\t"
         "movdqu %%xmm4, 3*16(%[outbuf])\n\t"

         : /* No output */
         : [inbuf] "r" (inbuf),
           [outbuf] "r" (outbuf)
         : "memory");

      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }

  /* Move the chaining value into xmm0 for the single-block helper.  */
  asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc");

  for ( ;nblocks; nblocks-- )
    {
      do_aesni_enc (ctx);

      /* xmm6 := old chain encrypted ^ C[i] = P[i]; the raw C[i] stays
         in xmm0 as the next iteration's chaining value.  */
      asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "movdqu %%xmm6, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf  += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
2051 
2052 
2053 void ASM_FUNC_ATTR
_gcry_aes_aesni_cbc_dec(RIJNDAEL_context * ctx,unsigned char * iv,unsigned char * outbuf,const unsigned char * inbuf,size_t nblocks)2054 _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
2055                          unsigned char *outbuf, const unsigned char *inbuf,
2056                          size_t nblocks)
2057 {
2058   aesni_prepare_2_7_variable;
2059 
2060   aesni_prepare ();
2061   aesni_prepare_2_7();
2062 
2063   if ( !ctx->decryption_prepared )
2064     {
2065       do_aesni_prepare_decryption ( ctx );
2066       ctx->decryption_prepared = 1;
2067     }
2068 
2069   asm volatile
2070     ("movdqu %[iv], %%xmm5\n\t"	/* use xmm5 as fast IV storage */
2071      : /* No output */
2072      : [iv] "m" (*iv)
2073      : "memory");
2074 
2075 #ifdef __x86_64__
2076   if (nblocks >= 8)
2077     {
2078       aesni_prepare_8_15_variable;
2079 
2080       aesni_prepare_8_15();
2081 
2082       for ( ;nblocks >= 8 ; nblocks -= 8 )
2083 	{
2084 	  asm volatile
2085 	    ("movdqa (%[key]), %%xmm0\n\t"
2086 
2087 	     "movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
2088 	     "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
2089 	     "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
2090 	     "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
2091 	     "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
2092 	     "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
2093 	     "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
2094 	     "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
2095 
2096 	     "movdqa %%xmm1, %%xmm12\n\t"
2097 	     "movdqa %%xmm2, %%xmm13\n\t"
2098 	     "movdqa %%xmm3, %%xmm14\n\t"
2099 	     "movdqa %%xmm4, %%xmm15\n\t"
2100 
2101 	     "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
2102 	     "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
2103 	     "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
2104 	     "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
2105 	     "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
2106 	     "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
2107 	     "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
2108 	     "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
2109 
2110 	     : /* No output */
2111 	     : [inbuf] "r" (inbuf),
2112 	       [key] "r" (ctx->keyschdec)
2113 	     : "memory");
2114 
2115 	  do_aesni_dec_vec8 (ctx);
2116 
2117 	  asm volatile
2118 	    (
2119 	     "pxor %%xmm0, %%xmm5\n\t"			/* xor IV with key */
2120 	     "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
2121 	     "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
2122 	     "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */
2123 	     "pxor %%xmm0, %%xmm15\n\t"			/* xor IV with key */
2124 
2125 	     "aesdeclast %%xmm5, %%xmm1\n\t"
2126 	     "aesdeclast %%xmm12, %%xmm2\n\t"
2127 	     "aesdeclast %%xmm13, %%xmm3\n\t"
2128 	     "aesdeclast %%xmm14, %%xmm4\n\t"
2129 
2130 	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
2131 	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
2132 	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
2133 	     "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
2134 	     "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
2135 	     "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
2136 	     "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */
2137 
2138 	     "aesdeclast %%xmm15, %%xmm8\n\t"
2139 	     "aesdeclast %%xmm12, %%xmm9\n\t"
2140 	     "aesdeclast %%xmm13, %%xmm10\n\t"
2141 	     "aesdeclast %%xmm14, %%xmm11\n\t"
2142 
2143 	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
2144 	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
2145 	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
2146 	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
2147 	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
2148 	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
2149 	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
2150 	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
2151 
2152 	     : /* No output */
2153 	     : [inbuf] "r" (inbuf),
2154 	       [outbuf] "r" (outbuf)
2155 	     : "memory");
2156 
2157 	  outbuf += 8*BLOCKSIZE;
2158 	  inbuf  += 8*BLOCKSIZE;
2159 	}
2160 
2161       aesni_cleanup_8_15();
2162     }
2163 #endif
2164 
2165   for ( ;nblocks >= 4 ; nblocks -= 4 )
2166     {
2167       asm volatile
2168         ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
2169          "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
2170          "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
2171          "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
2172          : /* No output */
2173          : [inbuf] "r" (inbuf)
2174          : "memory");
2175 
2176       do_aesni_dec_vec4 (ctx);
2177 
2178       asm volatile
2179         ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
2180          "movdqu 0*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
2181          "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
2182 
2183          "pxor %%xmm5, %%xmm2\n\t"		/* xor IV with output */
2184          "movdqu 1*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
2185          "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
2186 
2187          "pxor %%xmm5, %%xmm3\n\t"		/* xor IV with output */
2188          "movdqu 2*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
2189          "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
2190 
2191          "pxor %%xmm5, %%xmm4\n\t"		/* xor IV with output */
2192          "movdqu 3*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
2193          "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
2194 
2195          : /* No output */
2196          : [inbuf] "r" (inbuf),
2197            [outbuf] "r" (outbuf)
2198          : "memory");
2199 
2200       outbuf += 4*BLOCKSIZE;
2201       inbuf  += 4*BLOCKSIZE;
2202     }
2203 
2204   for ( ;nblocks; nblocks-- )
2205     {
2206       asm volatile
2207         ("movdqu %[inbuf], %%xmm0\n\t"
2208          "movdqa %%xmm0, %%xmm2\n\t"    /* use xmm2 as savebuf */
2209          : /* No output */
2210          : [inbuf] "m" (*inbuf)
2211          : "memory");
2212 
2213       /* uses only xmm0 and xmm1 */
2214       do_aesni_dec (ctx);
2215 
2216       asm volatile
2217         ("pxor %%xmm5, %%xmm0\n\t"	/* xor IV with output */
2218          "movdqu %%xmm0, %[outbuf]\n\t"
2219          "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
2220          : [outbuf] "=m" (*outbuf)
2221          :
2222          : "memory");
2223 
2224       outbuf += BLOCKSIZE;
2225       inbuf  += BLOCKSIZE;
2226     }
2227 
2228   asm volatile
2229     ("movdqu %%xmm5, %[iv]\n\t"	/* store IV */
2230      : /* No output */
2231      : [iv] "m" (*iv)
2232      : "memory");
2233 
2234   aesni_cleanup ();
2235   aesni_cleanup_2_7 ();
2236 }
2237 
2238 
/* Fold NBLOCKS 16-byte blocks of PLAINTEXT into the OCB checksum kept in
 * c->u_ctr.ctr (Checksum_i = Checksum_{i-1} xor P_i).  The previous
 * checksum is loaded on entry and the updated value is stored back on
 * exit.  Several independent xor accumulators are kept in registers to
 * break the dependency chain: xmm6/xmm1/xmm2/xmm3 always, plus
 * xmm0/xmm4/xmm5/xmm7 (widened to ymm registers) on the AVX/AVX2 fast
 * paths.  All of those registers are clobbered; callers are expected to
 * wipe them afterwards (aesni_cleanup and friends).  */
static ASM_FUNC_ATTR_INLINE void
aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
		    size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;

  /* Calculate checksum: load current checksum into xmm6 and zero the
   * three extra accumulators.  */
  asm volatile ("movdqu %[checksum], %%xmm6\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                :
                :[checksum] "m" (*c->u_ctr.ctr)
                : "memory" );

  if (0) {}
#if defined(HAVE_GCC_INLINE_ASM_AVX2)
  else if (nblocks >= 16 && ctx->use_avx2)
    {
      /* Use wider 256-bit registers for fast xoring of plaintext.
       * Zero the upper accumulator set; the low 128 bits of
       * ymm6/ymm1/ymm2/ymm3 already hold the running checksum state, so
       * two blocks per register are accumulated per vpxor.  */
      asm volatile ("vzeroupper\n\t"
		    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
		    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
		    "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
		    "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
                    :
                    :
                    : "memory");

      /* 16 blocks per iteration: 8 ymm xors of 32 bytes each.  */
      for (;nblocks >= 16; nblocks -= 16)
	{
	  asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
			"vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
			"vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
			"vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
			:
			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
			: "memory" );
	  asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
			"vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
			"vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
			"vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
			:
			: [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
			: "memory" );
	  plaintext += BLOCKSIZE * 16;
	}

      /* Reduce the eight ymm accumulators down to the four xmm
       * accumulators: pairwise xor, then fold each ymm's high lane into
       * its low lane with vextracti128.  */
      asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
		    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
		    "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
		    "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
		    "vextracti128 $1, %%ymm6, %%xmm0\n\t"
		    "vextracti128 $1, %%ymm1, %%xmm4\n\t"
		    "vextracti128 $1, %%ymm2, %%xmm5\n\t"
		    "vextracti128 $1, %%ymm3, %%xmm7\n\t"
		    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
		    "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
		    "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
		    "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
		    "vzeroupper\n\t"
		    :
		    :
		    : "memory" );
    }
#endif
#if defined(HAVE_GCC_INLINE_ASM_AVX)
  else if (nblocks >= 16 && ctx->use_avx)
    {
      /* Same as AVX2, except using 256-bit floating point instructions
       * (vxorpd/vextractf128 are AVX1; the integer ymm forms require
       * AVX2).  xor is bit-wise, so the FP forms compute the same
       * result.  */
      asm volatile ("vzeroupper\n\t"
		    "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
		    "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
		    "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
		    "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
                    :
                    :
                    : "memory");

      for (;nblocks >= 16; nblocks -= 16)
	{
	  asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
			"vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
			"vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
			"vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
			:
			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
			: "memory" );
	  asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
			"vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
			"vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
			"vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
			:
			: [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
			: "memory" );
	  plaintext += BLOCKSIZE * 16;
	}

      asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
		    "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
		    "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
		    "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
		    "vextractf128 $1, %%ymm6, %%xmm0\n\t"
		    "vextractf128 $1, %%ymm1, %%xmm4\n\t"
		    "vextractf128 $1, %%ymm2, %%xmm5\n\t"
		    "vextractf128 $1, %%ymm3, %%xmm7\n\t"
		    "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
		    "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
		    "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
		    "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
		    "vzeroupper\n\t"
		    :
		    :
		    : "memory" );
    }
#endif

  /* SSE path: four blocks per iteration into the four accumulators.  */
  for (;nblocks >= 4; nblocks -= 4)
    {
      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
		    "movdqu %[ptr1], %%xmm4\n\t"
		    "movdqu %[ptr2], %%xmm5\n\t"
		    "movdqu %[ptr3], %%xmm7\n\t"
		    "pxor %%xmm0, %%xmm6\n\t"
		    "pxor %%xmm4, %%xmm1\n\t"
		    "pxor %%xmm5, %%xmm2\n\t"
		    "pxor %%xmm7, %%xmm3\n\t"
		    :
		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
		      [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
		      [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
		      [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
		    : "memory" );
      plaintext += BLOCKSIZE * 4;
    }

  /* Remaining 1..3 blocks, one at a time.  */
  for (;nblocks >= 1; nblocks -= 1)
    {
      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
		    "pxor %%xmm0, %%xmm6\n\t"
		    :
		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
		    : "memory" );
      plaintext += BLOCKSIZE;
    }

  /* Fold the four accumulators together and store the new checksum.  */
  asm volatile ("pxor %%xmm1, %%xmm6\n\t"
		"pxor %%xmm2, %%xmm6\n\t"
		"pxor %%xmm3, %%xmm6\n\t"
		"movdqu %%xmm6, %[checksum]\n\t"
		: [checksum] "=m" (*c->u_ctr.ctr)
		:
		: "memory" );
}
2405 
2406 
/* AES-NI OCB encryption of NBLOCKS full 16-byte blocks from INBUF_ARG to
 * OUTBUF_ARG.  Register conventions throughout: xmm5 holds the running
 * OCB Offset, xmm7 the running Checksum; both are preloaded from
 * c->u_iv.iv / c->u_ctr.ctr and written back at the end.  Block counter
 * state is kept in c->u_mode.ocb.data_nblocks.  Always returns 0 (no
 * extra stack burn needed; sensitive registers and tmpbuf are cleared
 * here).  */
static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
               const void *inbuf_arg, size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  u64 n = c->u_mode.ocb.data_nblocks;
  const unsigned char *l;
  /* Scratch for three pre-computed offset values; over-allocated by 15
   * bytes so it can be aligned to 16 for movdqa.  */
  byte tmpbuf_store[3 * 16 + 15];
  byte *tmpbuf;
  aesni_prepare_2_7_variable;

  /* Empty asm hides tmpbuf's provenance from the optimizer, then align
   * the pointer up to a 16-byte boundary.  */
  asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
  tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Preload Offset (xmm5) and Checksum (xmm7).  */
  asm volatile ("movdqu %[iv], %%xmm5\n\t"
		"movdqu %[ctr], %%xmm7\n\t"
		: /* No output */
		: [iv] "m" (*c->u_iv.iv),
		  [ctr] "m" (*c->u_ctr.ctr)
		: "memory" );

  /* Process single blocks until the block counter is a multiple of 4, so
   * the bulk loops below can use the fixed L[0]/L[1]/L0L1 pattern.  */
  for ( ;nblocks && n % 4; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
                    "pxor   %%xmm0,   %%xmm7\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

#ifdef __x86_64__
  /* 8-way path, needs xmm8-xmm15 (x86-64 only).  */
  if (nblocks >= 8)
    {
      /* lxf_key caches (last round key xor first round key), 16-byte
       * aligned.  The first and last AES round xors are folded into the
       * per-block offsets: blocks enter the round loop already xored
       * with (offset xor first_key), and aesenclast against
       * (next_offset xor lxf_key) performs the last round, the output
       * whitening with the offset, and cancels the pre-applied first
       * key in one instruction.  */
      unsigned char last_xor_first_key_store[16 + 15];
      unsigned char *lxf_key;
      aesni_prepare_8_15_variable;

      asm volatile (""
                    : "=r" (lxf_key)
		    : "0" (last_xor_first_key_store)
		    : "memory");
      lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);

      aesni_prepare_8_15();

      /* xmm6 = L[0]; xmm5 = Offset xor first_key; lxf_key memory slot =
       * last_key xor first_key.  */
      asm volatile ("movdqu %[l0], %%xmm6\n\t"
		    "movdqa %[last_key], %%xmm0\n\t"
		    "pxor %[first_key], %%xmm5\n\t"
		    "pxor %[first_key], %%xmm0\n\t"
		    "movdqa %%xmm0, %[lxfkey]\n\t"
		    : [lxfkey] "=m" (*lxf_key)
		    : [l0] "m" (*c->u_mode.ocb.L[0]),
		      [last_key] "m" (ctx->keyschenc[ctx->rounds][0][0]),
		      [first_key] "m" (ctx->keyschenc[0][0][0])
		    : "memory" );

      for ( ;nblocks >= 8 ; nblocks -= 8 )
	{
	  n += 4;
	  l = aes_ocb_get_l(c, n);

	  /* L values for blocks 1..4 of this batch: xmm6 = L[0] (set
	   * above), xmm10 = L[0]^L[1], xmm11 = L[1], xmm15 = L_{ntz(n)}.  */
	  asm volatile ("movdqu %[l0l1],   %%xmm10\n\t"
			"movdqu %[l1],     %%xmm11\n\t"
			"movdqu %[l3],     %%xmm15\n\t"
			:
			: [l0l1] "m" (*c->u_mode.ocb.L0L1),
			  [l1] "m" (*c->u_mode.ocb.L[1]),
			  [l3] "m" (*l)
			: "memory" );

	  n += 4;
	  l = aes_ocb_get_l(c, n);

          /* Checksum_i = Checksum_{i-1} xor P_i  */
	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
			"movdqu %[inbuf1], %%xmm2\n\t"
			"movdqu %[inbuf2], %%xmm3\n\t"
			:
			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)),
			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
			: "memory" );
	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
			"movdqu %[inbuf4], %%xmm8\n\t"
			"movdqu %[inbuf5], %%xmm9\n\t"
			:
			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
			: "memory" );
	  /* For blocks 1-4: accumulate checksum, xor block with its
	   * offset(^first_key), and keep offset^lxf_key in
	   * xmm12..xmm15 as the aesenclast operand.  For blocks 5-7 the
	   * aesenclast operands are spilled to tmpbuf[0..2] since we run
	   * out of registers.  */
	  asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
			"movdqa %%xmm6,    %%xmm12\n\t"
			"pxor   %%xmm5,    %%xmm12\n\t"
			"pxor   %%xmm1,    %%xmm7\n\t"
			"pxor   %%xmm12,   %%xmm1\n\t"
			"pxor   %%xmm0,    %%xmm12\n\t"

			"movdqa %%xmm10,   %%xmm13\n\t"
			"pxor   %%xmm5,    %%xmm13\n\t"
			"pxor   %%xmm2,    %%xmm7\n\t"
			"pxor   %%xmm13,   %%xmm2\n\t"
			"pxor   %%xmm0,    %%xmm13\n\t"

			"movdqa %%xmm11,   %%xmm14\n\t"
			"pxor   %%xmm5,    %%xmm14\n\t"
			"pxor   %%xmm3,    %%xmm7\n\t"
			"pxor   %%xmm14,   %%xmm3\n\t"
			"pxor   %%xmm0,    %%xmm14\n\t"

			"pxor   %%xmm11,   %%xmm5\n\t"
			"pxor   %%xmm15,   %%xmm5\n\t"
			"pxor   %%xmm4,    %%xmm7\n\t"
			"pxor   %%xmm5,    %%xmm4\n\t"
			"movdqa %%xmm5,    %%xmm15\n\t"
			"pxor   %%xmm0,    %%xmm15\n\t"

			"movdqa %%xmm5,    %%xmm0\n\t"
			"pxor   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm8,    %%xmm7\n\t"
			"pxor   %%xmm0,    %%xmm8\n\t"
			"pxor   %[lxfkey], %%xmm0\n\t"
			"movdqa %%xmm0,    %[tmpbuf0]\n\t"

			"movdqa %%xmm10,   %%xmm0\n\t"
			"pxor   %%xmm5,    %%xmm0\n\t"
			"pxor   %%xmm9,    %%xmm7\n\t"
			"pxor   %%xmm0,    %%xmm9\n\t"
			"pxor   %[lxfkey], %%xmm0\n"
			"movdqa %%xmm0,    %[tmpbuf1]\n\t"
			: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
			  [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
			: [lxfkey] "m" (*lxf_key)
			: "memory" );
	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
			"movdqa %%xmm11,   %%xmm0\n\t"
			"pxor   %%xmm5,    %%xmm0\n\t"
			"pxor   %%xmm10,   %%xmm7\n\t"
			"pxor   %%xmm0,    %%xmm10\n\t"
			"pxor   %[lxfkey], %%xmm0\n\t"
			"movdqa %%xmm0,    %[tmpbuf2]\n\t"
			: [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
			  [lxfkey] "m" (*lxf_key)
			: "memory" );
	  /* Block 8 advances the running offset in xmm5; its aesenclast
	   * operand is xmm5 itself (still carrying first_key, cancelled
	   * by the final lxf_key xor below).  Round key 1 is preloaded
	   * into xmm0.  */
	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
			"pxor   %%xmm11,   %%xmm5\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			"movdqa 0x10(%[key]), %%xmm0\n\t"
			"movdqu %[inbuf7], %%xmm11\n\t"
			"pxor   %%xmm11,   %%xmm7\n\t"
			"pxor   %%xmm5,    %%xmm11\n\t"
			:
			: [l7] "m" (*l),
			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
			  [key] "r" (ctx->keyschenc)
			: "memory" );

	  /* Rounds 1..9 unconditionally; rounds 10/11 only for AES-192
	   * and up; rounds 12/13 only for AES-256 (cmpl/jb/je dispatch
	   * on ctx->rounds).  NOTE: the ".Ldeclast" label name is a
	   * leftover from the decryption variant; it is only a local
	   * label, behavior is correct.  */
	  asm volatile ("cmpl $12, %[rounds]\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x20(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x30(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x40(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x50(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x60(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x70(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x80(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0x90(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"jb .Ldeclast%=\n\t"
			"movdqa 0xa0(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0xb0(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"je .Ldeclast%=\n\t"
			"movdqa 0xc0(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"
			"movdqa 0xd0(%[key]), %%xmm0\n\t"
			"aesenc %%xmm0, %%xmm1\n\t"
			"aesenc %%xmm0, %%xmm2\n\t"
			"aesenc %%xmm0, %%xmm3\n\t"
			"aesenc %%xmm0, %%xmm4\n\t"
			"aesenc %%xmm0, %%xmm8\n\t"
			"aesenc %%xmm0, %%xmm9\n\t"
			"aesenc %%xmm0, %%xmm10\n\t"
			"aesenc %%xmm0, %%xmm11\n\t"

			".Ldeclast%=:\n\t"
			:
			: [key] "r" (ctx->keyschenc),
			  [rounds] "r" (ctx->rounds)
			: "cc", "memory");

	  /* Last round: the operands (offset ^ last_key) finish the
	   * cipher and apply the output whitening in one step.  */
	  asm volatile ("aesenclast %%xmm12,   %%xmm1\n\t"
			"aesenclast %%xmm13,   %%xmm2\n\t"
			"aesenclast %%xmm14,   %%xmm3\n\t"
			"aesenclast %%xmm15,   %%xmm4\n\t"
			"aesenclast %[tmpbuf0],%%xmm8\n\t"
			"aesenclast %[tmpbuf1],%%xmm9\n\t"
			"aesenclast %[tmpbuf2],%%xmm10\n\t"
			:
			: [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
			  [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
			  [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)),
			  [lxfkey] "m" (*lxf_key)
			: "memory" );
	  asm volatile ("aesenclast %%xmm5,    %%xmm11\n\t"
			"pxor   %[lxfkey], %%xmm11\n\t"
			"movdqu %%xmm1,    %[outbuf0]\n\t"
			"movdqu %%xmm2,    %[outbuf1]\n\t"
			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
			: [lxfkey] "m" (*lxf_key)
			: "memory" );
	  asm volatile ("movdqu %%xmm3,    %[outbuf2]\n\t"
			"movdqu %%xmm4,    %[outbuf3]\n\t"
			"movdqu %%xmm8,    %[outbuf4]\n\t"
			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
			  [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
			:
			: "memory" );
	  asm volatile ("movdqu %%xmm9,    %[outbuf5]\n\t"
			"movdqu %%xmm10,   %[outbuf6]\n\t"
			"movdqu %%xmm11,   %[outbuf7]\n\t"
			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
			  [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
			:
			: "memory" );

	  outbuf += 8*BLOCKSIZE;
	  inbuf  += 8*BLOCKSIZE;
	}

      /* Remove the folded-in first round key from the offset and wipe
       * the key-material scratch slot.  */
      asm volatile ("pxor %[first_key], %%xmm5\n\t"
		    "pxor %%xmm0, %%xmm0\n\t"
		    "movdqu %%xmm0, %[lxfkey]\n\t"
		    : [lxfkey] "=m" (*lxf_key)
		    : [first_key] "m" (ctx->keyschenc[0][0][0])
		    : "memory" );

      aesni_cleanup_8_15();
    }
#endif

  /* 4-way path (works with xmm0-xmm7 only).  */
  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      n += 4;
      l = aes_ocb_get_l(c, n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
		    "movdqu %[inbuf0], %%xmm1\n\t"
		    "movdqu %[l0l1],   %%xmm3\n\t"
		    :
		    : [l0] "m" (*c->u_mode.ocb.L[0]),
		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
		    : "memory" );
      /* Offsets for blocks 1-3 are spilled to tmpbuf[0..2]; block 4's
       * offset stays in xmm5 as the new running offset.  */
      asm volatile ("movdqu %[l1],     %%xmm4\n\t"
		    "movdqu %[l3],     %%xmm6\n\t"
		    "pxor   %%xmm5,    %%xmm0\n\t"
		    "pxor   %%xmm1,    %%xmm7\n\t"
		    "pxor   %%xmm0,    %%xmm1\n\t"
		    "movdqa %%xmm0,    %[tmpbuf0]\n\t"
		    : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
		    : [l1] "m" (*c->u_mode.ocb.L[1]),
		      [l3] "m" (*l)
		    : "memory" );
      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
		    "pxor   %%xmm5,    %%xmm3\n\t"
		    "pxor   %%xmm2,    %%xmm7\n\t"
		    "pxor   %%xmm3,    %%xmm2\n\t"
		    "movdqa %%xmm3,    %[tmpbuf1]\n\t"
		    : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
		    "movdqu %[inbuf2], %%xmm3\n\t"
		    "pxor   %%xmm5,    %%xmm0\n\t"
		    "pxor   %%xmm3,    %%xmm7\n\t"
		    "pxor   %%xmm0,    %%xmm3\n\t"
		    "movdqa %%xmm0,    %[tmpbuf2]\n\t"
		    : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
		    :
		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
		    "pxor   %%xmm4,    %%xmm5\n\t"
		    "movdqu %[inbuf3], %%xmm4\n\t"
		    "pxor   %%xmm4,    %%xmm7\n\t"
		    "pxor   %%xmm5,    %%xmm4\n\t"
		    :
		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
		    : "memory" );

      do_aesni_enc_vec4 (ctx);

      /* Apply the per-block offsets as output whitening and store.  */
      asm volatile ("pxor   %[tmpbuf0],%%xmm1\n\t"
		    "movdqu %%xmm1,    %[outbuf0]\n\t"
		    "pxor   %[tmpbuf1],%%xmm2\n\t"
		    "movdqu %%xmm2,    %[outbuf1]\n\t"
		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
		      [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
		    : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
		      [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("pxor   %[tmpbuf2],%%xmm3\n\t"
		    "movdqu %%xmm3,    %[outbuf2]\n\t"
		    "pxor   %%xmm5,    %%xmm4\n\t"
		    "movdqu %%xmm4,    %[outbuf3]\n\t"
		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
		    : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
		    : "memory" );

      outbuf += 4*BLOCKSIZE;
      inbuf  += 4*BLOCKSIZE;
    }

  /* Remaining 1..3 blocks, one at a time.  */
  for ( ;nblocks; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
		    "pxor   %%xmm0,   %%xmm7\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

  /* Write back block counter, Offset and Checksum state.  */
  c->u_mode.ocb.data_nblocks = n;
  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                "movdqu %%xmm7, %[ctr]\n\t"
		: [iv] "=m" (*c->u_iv.iv),
		  [ctr] "=m" (*c->u_ctr.ctr)
                :
                : "memory" );

  /* Wipe the offset scratch buffer (it held offset values derived from
   * secret key material).  */
  asm volatile ("pxor   %%xmm0, %%xmm0\n\t"
                "movdqa %%xmm0, %[tmpbuf0]\n\t"
                "movdqa %%xmm0, %[tmpbuf1]\n\t"
                "movdqa %%xmm0, %[tmpbuf2]\n\t"
		: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
		  [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
		  [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();

  return 0;
}
2900 
2901 
2902 static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_dec(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks_arg)2903 aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
2904                const void *inbuf_arg, size_t nblocks_arg)
2905 {
2906   RIJNDAEL_context *ctx = (void *)&c->context.c;
2907   unsigned char *outbuf = outbuf_arg;
2908   const unsigned char *inbuf = inbuf_arg;
2909   u64 n = c->u_mode.ocb.data_nblocks;
2910   const unsigned char *l;
2911   size_t nblocks = nblocks_arg;
2912   byte tmpbuf_store[3 * 16 + 15];
2913   byte *tmpbuf;
2914   aesni_prepare_2_7_variable;
2915 
2916   asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
2917   tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);
2918 
2919   aesni_prepare ();
2920   aesni_prepare_2_7 ();
2921 
2922   if ( !ctx->decryption_prepared )
2923     {
2924       do_aesni_prepare_decryption ( ctx );
2925       ctx->decryption_prepared = 1;
2926     }
2927 
2928   /* Preload Offset */
2929   asm volatile ("movdqu %[iv], %%xmm5\n\t"
2930                 : /* No output */
2931                 : [iv] "m" (*c->u_iv.iv)
2932                 : "memory" );
2933 
2934   for ( ;nblocks && n % 4; nblocks-- )
2935     {
2936       l = aes_ocb_get_l(c, ++n);
2937 
2938       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
2939       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
2940       asm volatile ("movdqu %[l],     %%xmm1\n\t"
2941                     "movdqu %[inbuf], %%xmm0\n\t"
2942                     "pxor   %%xmm1,   %%xmm5\n\t"
2943                     "pxor   %%xmm5,   %%xmm0\n\t"
2944                     :
2945                     : [l] "m" (*l),
2946                       [inbuf] "m" (*inbuf)
2947                     : "memory" );
2948 
2949       do_aesni_dec (ctx);
2950 
2951       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
2952                     "movdqu %%xmm0, %[outbuf]\n\t"
2953                     : [outbuf] "=m" (*outbuf)
2954                     :
2955                     : "memory" );
2956 
2957       inbuf += BLOCKSIZE;
2958       outbuf += BLOCKSIZE;
2959     }
2960 
2961 #ifdef __x86_64__
2962   if (nblocks >= 8)
2963     {
2964       unsigned char last_xor_first_key_store[16 + 15];
2965       unsigned char *lxf_key;
2966       aesni_prepare_8_15_variable;
2967 
2968       asm volatile (""
2969                     : "=r" (lxf_key)
2970 		    : "0" (last_xor_first_key_store)
2971 		    : "memory");
2972       lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);
2973 
2974       aesni_prepare_8_15();
2975 
2976       asm volatile ("movdqu %[l0], %%xmm6\n\t"
2977 		    "movdqa %[last_key], %%xmm0\n\t"
2978 		    "pxor %[first_key], %%xmm5\n\t"
2979 		    "pxor %[first_key], %%xmm0\n\t"
2980 		    "movdqa %%xmm0, %[lxfkey]\n\t"
2981 		    : [lxfkey] "=m" (*lxf_key)
2982 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
2983 		      [last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
2984 		      [first_key] "m" (ctx->keyschdec[0][0][0])
2985 		    : "memory" );
2986 
2987       for ( ;nblocks >= 8 ; nblocks -= 8 )
2988 	{
2989 	  n += 4;
2990 	  l = aes_ocb_get_l(c, n);
2991 
2992 	  asm volatile ("movdqu %[l0l1],   %%xmm10\n\t"
2993 			"movdqu %[l1],     %%xmm11\n\t"
2994 			"movdqu %[l3],     %%xmm15\n\t"
2995 			:
2996 			: [l0l1] "m" (*c->u_mode.ocb.L0L1),
2997 			  [l1] "m" (*c->u_mode.ocb.L[1]),
2998 			  [l3] "m" (*l)
2999 			: "memory" );
3000 
3001 	  n += 4;
3002 	  l = aes_ocb_get_l(c, n);
3003 
3004 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3005 	  /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i)  */
3006 	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
3007 			"movdqu %[inbuf1], %%xmm2\n\t"
3008 			"movdqu %[inbuf2], %%xmm3\n\t"
3009 			:
3010 			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
3011 			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
3012 			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
3013 			: "memory" );
3014 	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
3015 			"movdqu %[inbuf4], %%xmm8\n\t"
3016 			"movdqu %[inbuf5], %%xmm9\n\t"
3017 			:
3018 			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
3019 			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
3020 			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
3021 			: "memory" );
3022 	  asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
3023 			"movdqa %%xmm6,    %%xmm12\n\t"
3024 			"pxor   %%xmm5,    %%xmm12\n\t"
3025 			"pxor   %%xmm12,   %%xmm1\n\t"
3026 			"pxor   %%xmm0,    %%xmm12\n\t"
3027 
3028 			"movdqa %%xmm10,   %%xmm13\n\t"
3029 			"pxor   %%xmm5,    %%xmm13\n\t"
3030 			"pxor   %%xmm13,   %%xmm2\n\t"
3031 			"pxor   %%xmm0,    %%xmm13\n\t"
3032 
3033 			"movdqa %%xmm11,   %%xmm14\n\t"
3034 			"pxor   %%xmm5,    %%xmm14\n\t"
3035 			"pxor   %%xmm14,   %%xmm3\n\t"
3036 			"pxor   %%xmm0,    %%xmm14\n\t"
3037 
3038 			"pxor   %%xmm11,   %%xmm5\n\t"
3039 			"pxor   %%xmm15,   %%xmm5\n\t"
3040 			"pxor   %%xmm5,    %%xmm4\n\t"
3041 			"movdqa %%xmm5,    %%xmm15\n\t"
3042 			"pxor   %%xmm0,    %%xmm15\n\t"
3043 
3044 			"movdqa %%xmm5,    %%xmm0\n\t"
3045 			"pxor   %%xmm6,    %%xmm0\n\t"
3046 			"pxor   %%xmm0,    %%xmm8\n\t"
3047 			"pxor   %[lxfkey], %%xmm0\n\t"
3048 			"movdqa %%xmm0,    %[tmpbuf0]\n\t"
3049 
3050 			"movdqa %%xmm10,   %%xmm0\n\t"
3051 			"pxor   %%xmm5,    %%xmm0\n\t"
3052 			"pxor   %%xmm0,    %%xmm9\n\t"
3053 			"pxor   %[lxfkey], %%xmm0\n"
3054 			"movdqa %%xmm0,    %[tmpbuf1]\n\t"
3055 			: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
3056 			  [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
3057 			: [lxfkey] "m" (*lxf_key)
3058 			: "memory" );
3059 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
3060 			"movdqa %%xmm11,   %%xmm0\n\t"
3061 			"pxor   %%xmm5,    %%xmm0\n\t"
3062 			"pxor   %%xmm0,    %%xmm10\n\t"
3063 			"pxor   %[lxfkey], %%xmm0\n\t"
3064 			"movdqa %%xmm0,    %[tmpbuf2]\n\t"
3065 			: [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3066 			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
3067 			  [lxfkey] "m" (*lxf_key)
3068 			: "memory" );
3069 	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
3070 			"pxor   %%xmm11,   %%xmm5\n\t"
3071 			"pxor   %%xmm0,    %%xmm5\n\t"
3072 			"movdqa 0x10(%[key]), %%xmm0\n\t"
3073 			"movdqu %[inbuf7], %%xmm11\n\t"
3074 			"pxor   %%xmm5,    %%xmm11\n\t"
3075 			:
3076 			: [l7] "m" (*l),
3077 			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
3078 			  [key] "r" (ctx->keyschdec)
3079 			: "memory" );
3080 
3081 	  asm volatile ("cmpl $12, %[rounds]\n\t"
3082 			"aesdec %%xmm0, %%xmm1\n\t"
3083 			"aesdec %%xmm0, %%xmm2\n\t"
3084 			"aesdec %%xmm0, %%xmm3\n\t"
3085 			"aesdec %%xmm0, %%xmm4\n\t"
3086 			"aesdec %%xmm0, %%xmm8\n\t"
3087 			"aesdec %%xmm0, %%xmm9\n\t"
3088 			"aesdec %%xmm0, %%xmm10\n\t"
3089 			"aesdec %%xmm0, %%xmm11\n\t"
3090 			"movdqa 0x20(%[key]), %%xmm0\n\t"
3091 			"aesdec %%xmm0, %%xmm1\n\t"
3092 			"aesdec %%xmm0, %%xmm2\n\t"
3093 			"aesdec %%xmm0, %%xmm3\n\t"
3094 			"aesdec %%xmm0, %%xmm4\n\t"
3095 			"aesdec %%xmm0, %%xmm8\n\t"
3096 			"aesdec %%xmm0, %%xmm9\n\t"
3097 			"aesdec %%xmm0, %%xmm10\n\t"
3098 			"aesdec %%xmm0, %%xmm11\n\t"
3099 			"movdqa 0x30(%[key]), %%xmm0\n\t"
3100 			"aesdec %%xmm0, %%xmm1\n\t"
3101 			"aesdec %%xmm0, %%xmm2\n\t"
3102 			"aesdec %%xmm0, %%xmm3\n\t"
3103 			"aesdec %%xmm0, %%xmm4\n\t"
3104 			"aesdec %%xmm0, %%xmm8\n\t"
3105 			"aesdec %%xmm0, %%xmm9\n\t"
3106 			"aesdec %%xmm0, %%xmm10\n\t"
3107 			"aesdec %%xmm0, %%xmm11\n\t"
3108 			"movdqa 0x40(%[key]), %%xmm0\n\t"
3109 			"aesdec %%xmm0, %%xmm1\n\t"
3110 			"aesdec %%xmm0, %%xmm2\n\t"
3111 			"aesdec %%xmm0, %%xmm3\n\t"
3112 			"aesdec %%xmm0, %%xmm4\n\t"
3113 			"aesdec %%xmm0, %%xmm8\n\t"
3114 			"aesdec %%xmm0, %%xmm9\n\t"
3115 			"aesdec %%xmm0, %%xmm10\n\t"
3116 			"aesdec %%xmm0, %%xmm11\n\t"
3117 			"movdqa 0x50(%[key]), %%xmm0\n\t"
3118 			"aesdec %%xmm0, %%xmm1\n\t"
3119 			"aesdec %%xmm0, %%xmm2\n\t"
3120 			"aesdec %%xmm0, %%xmm3\n\t"
3121 			"aesdec %%xmm0, %%xmm4\n\t"
3122 			"aesdec %%xmm0, %%xmm8\n\t"
3123 			"aesdec %%xmm0, %%xmm9\n\t"
3124 			"aesdec %%xmm0, %%xmm10\n\t"
3125 			"aesdec %%xmm0, %%xmm11\n\t"
3126 			"movdqa 0x60(%[key]), %%xmm0\n\t"
3127 			"aesdec %%xmm0, %%xmm1\n\t"
3128 			"aesdec %%xmm0, %%xmm2\n\t"
3129 			"aesdec %%xmm0, %%xmm3\n\t"
3130 			"aesdec %%xmm0, %%xmm4\n\t"
3131 			"aesdec %%xmm0, %%xmm8\n\t"
3132 			"aesdec %%xmm0, %%xmm9\n\t"
3133 			"aesdec %%xmm0, %%xmm10\n\t"
3134 			"aesdec %%xmm0, %%xmm11\n\t"
3135 			"movdqa 0x70(%[key]), %%xmm0\n\t"
3136 			"aesdec %%xmm0, %%xmm1\n\t"
3137 			"aesdec %%xmm0, %%xmm2\n\t"
3138 			"aesdec %%xmm0, %%xmm3\n\t"
3139 			"aesdec %%xmm0, %%xmm4\n\t"
3140 			"aesdec %%xmm0, %%xmm8\n\t"
3141 			"aesdec %%xmm0, %%xmm9\n\t"
3142 			"aesdec %%xmm0, %%xmm10\n\t"
3143 			"aesdec %%xmm0, %%xmm11\n\t"
3144 			"movdqa 0x80(%[key]), %%xmm0\n\t"
3145 			"aesdec %%xmm0, %%xmm1\n\t"
3146 			"aesdec %%xmm0, %%xmm2\n\t"
3147 			"aesdec %%xmm0, %%xmm3\n\t"
3148 			"aesdec %%xmm0, %%xmm4\n\t"
3149 			"aesdec %%xmm0, %%xmm8\n\t"
3150 			"aesdec %%xmm0, %%xmm9\n\t"
3151 			"aesdec %%xmm0, %%xmm10\n\t"
3152 			"aesdec %%xmm0, %%xmm11\n\t"
3153 			"movdqa 0x90(%[key]), %%xmm0\n\t"
3154 			"aesdec %%xmm0, %%xmm1\n\t"
3155 			"aesdec %%xmm0, %%xmm2\n\t"
3156 			"aesdec %%xmm0, %%xmm3\n\t"
3157 			"aesdec %%xmm0, %%xmm4\n\t"
3158 			"aesdec %%xmm0, %%xmm8\n\t"
3159 			"aesdec %%xmm0, %%xmm9\n\t"
3160 			"aesdec %%xmm0, %%xmm10\n\t"
3161 			"aesdec %%xmm0, %%xmm11\n\t"
3162 			"jb .Ldeclast%=\n\t"
3163 			"movdqa 0xa0(%[key]), %%xmm0\n\t"
3164 			"aesdec %%xmm0, %%xmm1\n\t"
3165 			"aesdec %%xmm0, %%xmm2\n\t"
3166 			"aesdec %%xmm0, %%xmm3\n\t"
3167 			"aesdec %%xmm0, %%xmm4\n\t"
3168 			"aesdec %%xmm0, %%xmm8\n\t"
3169 			"aesdec %%xmm0, %%xmm9\n\t"
3170 			"aesdec %%xmm0, %%xmm10\n\t"
3171 			"aesdec %%xmm0, %%xmm11\n\t"
3172 			"movdqa 0xb0(%[key]), %%xmm0\n\t"
3173 			"aesdec %%xmm0, %%xmm1\n\t"
3174 			"aesdec %%xmm0, %%xmm2\n\t"
3175 			"aesdec %%xmm0, %%xmm3\n\t"
3176 			"aesdec %%xmm0, %%xmm4\n\t"
3177 			"aesdec %%xmm0, %%xmm8\n\t"
3178 			"aesdec %%xmm0, %%xmm9\n\t"
3179 			"aesdec %%xmm0, %%xmm10\n\t"
3180 			"aesdec %%xmm0, %%xmm11\n\t"
3181 			"je .Ldeclast%=\n\t"
3182 			"movdqa 0xc0(%[key]), %%xmm0\n\t"
3183 			"aesdec %%xmm0, %%xmm1\n\t"
3184 			"aesdec %%xmm0, %%xmm2\n\t"
3185 			"aesdec %%xmm0, %%xmm3\n\t"
3186 			"aesdec %%xmm0, %%xmm4\n\t"
3187 			"aesdec %%xmm0, %%xmm8\n\t"
3188 			"aesdec %%xmm0, %%xmm9\n\t"
3189 			"aesdec %%xmm0, %%xmm10\n\t"
3190 			"aesdec %%xmm0, %%xmm11\n\t"
3191 			"movdqa 0xd0(%[key]), %%xmm0\n\t"
3192 			"aesdec %%xmm0, %%xmm1\n\t"
3193 			"aesdec %%xmm0, %%xmm2\n\t"
3194 			"aesdec %%xmm0, %%xmm3\n\t"
3195 			"aesdec %%xmm0, %%xmm4\n\t"
3196 			"aesdec %%xmm0, %%xmm8\n\t"
3197 			"aesdec %%xmm0, %%xmm9\n\t"
3198 			"aesdec %%xmm0, %%xmm10\n\t"
3199 			"aesdec %%xmm0, %%xmm11\n\t"
3200 
3201 			".Ldeclast%=:\n\t"
3202 			:
3203 			: [key] "r" (ctx->keyschdec),
3204 			  [rounds] "r" (ctx->rounds)
3205 			: "cc", "memory");
3206 
3207 	  asm volatile ("aesdeclast %%xmm12,   %%xmm1\n\t"
3208 			"aesdeclast %%xmm13,   %%xmm2\n\t"
3209 			"aesdeclast %%xmm14,   %%xmm3\n\t"
3210 			"aesdeclast %%xmm15,   %%xmm4\n\t"
3211 			"aesdeclast %[tmpbuf0],%%xmm8\n\t"
3212 			"aesdeclast %[tmpbuf1],%%xmm9\n\t"
3213 			"aesdeclast %[tmpbuf2],%%xmm10\n\t"
3214 			:
3215 			: [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
3216 			  [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
3217 			  [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
3218 			: "memory" );
3219 	  asm volatile ("aesdeclast %%xmm5,    %%xmm11\n\t"
3220 			"pxor   %[lxfkey], %%xmm11\n\t"
3221 			"movdqu %%xmm1,    %[outbuf0]\n\t"
3222 			"movdqu %%xmm2,    %[outbuf1]\n\t"
3223 			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
3224 			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
3225 			: [lxfkey] "m" (*lxf_key)
3226 			: "memory" );
3227 	  asm volatile ("movdqu %%xmm3,    %[outbuf2]\n\t"
3228 			"movdqu %%xmm4,    %[outbuf3]\n\t"
3229 			"movdqu %%xmm8,    %[outbuf4]\n\t"
3230 			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
3231 			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
3232 			  [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
3233 			:
3234 			: "memory" );
3235 	  asm volatile ("movdqu %%xmm9,    %[outbuf5]\n\t"
3236 			"movdqu %%xmm10,   %[outbuf6]\n\t"
3237 			"movdqu %%xmm11,   %[outbuf7]\n\t"
3238 			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
3239 			  [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
3240 			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
3241 			:
3242 			: "memory" );
3243 
3244 	  outbuf += 8*BLOCKSIZE;
3245 	  inbuf  += 8*BLOCKSIZE;
3246 	}
3247 
3248       asm volatile ("pxor %[first_key], %%xmm5\n\t"
3249 		    "pxor %%xmm0, %%xmm0\n\t"
3250 		    "movdqu %%xmm0, %[lxfkey]\n\t"
3251 		    : [lxfkey] "=m" (*lxf_key)
3252 		    : [first_key] "m" (ctx->keyschdec[0][0][0])
3253 		    : "memory" );
3254 
3255       aesni_cleanup_8_15();
3256     }
3257 #endif
3258 
3259   for ( ;nblocks >= 4 ; nblocks -= 4 )
3260     {
3261       n += 4;
3262       l = aes_ocb_get_l(c, n);
3263 
3264       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3265       /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i)  */
3266       asm volatile ("movdqu %[l0],     %%xmm0\n\t"
3267 		    "movdqu %[inbuf0], %%xmm1\n\t"
3268 		    "movdqu %[l0l1],   %%xmm3\n\t"
3269 		    :
3270 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
3271 		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
3272 		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
3273 		    : "memory" );
3274       asm volatile ("movdqu %[l1],     %%xmm4\n\t"
3275 		    "movdqu %[l3],     %%xmm6\n\t"
3276 		    "pxor   %%xmm5,    %%xmm0\n\t"
3277 		    "pxor   %%xmm0,    %%xmm1\n\t"
3278 		    "movdqa %%xmm0,    %[tmpbuf0]\n\t"
3279 		    : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
3280 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
3281 		      [l3] "m" (*l)
3282 		    : "memory" );
3283       asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
3284 		    "pxor   %%xmm5,    %%xmm3\n\t"
3285 		    "pxor   %%xmm3,    %%xmm2\n\t"
3286 		    "movdqa %%xmm3,    %[tmpbuf1]\n\t"
3287 		    : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
3288 		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
3289 		    : "memory" );
3290       asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
3291 		    "movdqu %[inbuf2], %%xmm3\n\t"
3292 		    "pxor   %%xmm5,    %%xmm0\n\t"
3293 		    "pxor   %%xmm0,    %%xmm3\n\t"
3294 		    "movdqa %%xmm0,    %[tmpbuf2]\n\t"
3295 		    : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3296 		    :
3297 		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
3298 		    : "memory" );
3299       asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
3300 		    "pxor   %%xmm4,    %%xmm5\n\t"
3301 		    "movdqu %[inbuf3], %%xmm4\n\t"
3302 		    "pxor   %%xmm5,    %%xmm4\n\t"
3303 		    :
3304 		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
3305 		    : "memory" );
3306 
3307       do_aesni_dec_vec4 (ctx);
3308 
3309       asm volatile ("pxor   %[tmpbuf0],%%xmm1\n\t"
3310 		    "movdqu %%xmm1,    %[outbuf0]\n\t"
3311 		    "pxor   %[tmpbuf1],%%xmm2\n\t"
3312 		    "movdqu %%xmm2,    %[outbuf1]\n\t"
3313 		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
3314 		      [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
3315 		    : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
3316 		      [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
3317 		    : "memory" );
3318       asm volatile ("pxor   %[tmpbuf2],%%xmm3\n\t"
3319 		    "movdqu %%xmm3,    %[outbuf2]\n\t"
3320 		    "pxor   %%xmm5,    %%xmm4\n\t"
3321 		    "movdqu %%xmm4,    %[outbuf3]\n\t"
3322 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
3323 		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
3324 		    : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
3325 		    : "memory" );
3326 
3327       outbuf += 4*BLOCKSIZE;
3328       inbuf  += 4*BLOCKSIZE;
3329     }
3330 
3331   for ( ;nblocks; nblocks-- )
3332     {
3333       l = aes_ocb_get_l(c, ++n);
3334 
3335       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
3336       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
3337       /* Checksum_i = Checksum_{i-1} xor P_i  */
3338       asm volatile ("movdqu %[l],     %%xmm1\n\t"
3339                     "movdqu %[inbuf], %%xmm0\n\t"
3340                     "pxor   %%xmm1,   %%xmm5\n\t"
3341                     "pxor   %%xmm5,   %%xmm0\n\t"
3342                     :
3343                     : [l] "m" (*l),
3344                       [inbuf] "m" (*inbuf)
3345                     : "memory" );
3346 
3347       do_aesni_dec (ctx);
3348 
3349       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
3350                     "movdqu %%xmm0, %[outbuf]\n\t"
3351                     : [outbuf] "=m" (*outbuf)
3352                     :
3353                     : "memory" );
3354 
3355       inbuf += BLOCKSIZE;
3356       outbuf += BLOCKSIZE;
3357     }
3358 
3359   c->u_mode.ocb.data_nblocks = n;
3360   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
3361                 : [iv] "=m" (*c->u_iv.iv)
3362                 :
3363                 : "memory" );
3364 
3365   asm volatile ("pxor   %%xmm0, %%xmm0\n\t"
3366                 "movdqa %%xmm0, %[tmpbuf0]\n\t"
3367                 "movdqa %%xmm0, %[tmpbuf1]\n\t"
3368                 "movdqa %%xmm0, %[tmpbuf2]\n\t"
3369 		: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
3370 		  [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
3371 		  [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
3372                 :
3373                 : "memory" );
3374 
3375   aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
3376 
3377   aesni_cleanup ();
3378   aesni_cleanup_2_7 ();
3379 
3380   return 0;
3381 }
3382 
3383 
3384 size_t ASM_FUNC_ATTR
_gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks,int encrypt)3385 _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
3386                           const void *inbuf_arg, size_t nblocks, int encrypt)
3387 {
3388   if (encrypt)
3389     return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
3390   else
3391     return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
3392 }
3393 
3394 
/* OCB authentication: absorb NBLOCKS full blocks of associated data from
 * ABUF_ARG.  The running AAD offset (c->u_mode.ocb.aad_offset) is kept in
 * xmm5 and the running sum (c->u_mode.ocb.aad_sum) in xmm6 for the whole
 * function; both are written back before returning.  Returns 0 (no extra
 * stack burning needed; the aesni_cleanup* macros clear the XMM state).  */
size_t ASM_FUNC_ATTR
_gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                          size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  const unsigned char *abuf = abuf_arg;
  u64 n = c->u_mode.ocb.aad_nblocks;
  const unsigned char *l;
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Preload Offset (xmm5) and Sum (xmm6).  */
  asm volatile ("movdqu %[iv], %%xmm5\n\t"
                "movdqu %[ctr], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "m" (*c->u_mode.ocb.aad_sum)
                : "memory" );

  /* Handle blocks one at a time until the block counter N is a multiple
     of 4, so the unrolled loops below can use the precomputed L0/L0^L1/L1
     offset values.  */
  for ( ;nblocks && n % 4; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[abuf],  %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [abuf] "m" (*abuf)
                    : "memory" );

      do_aesni_enc (ctx);

      /* Fold the encrypted block into the running Sum.  */
      asm volatile ("pxor   %%xmm0,   %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += BLOCKSIZE;
    }

#ifdef __x86_64__
  /* Eight-block-wide path; needs xmm8..xmm15, hence x86-64 only.  */
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      /* Keep the frequently used offset deltas resident across the loop:
         xmm7 = L0, xmm12 = L0^L1, xmm13 = L1.  */
      asm volatile ("movdqu %[l0],     %%xmm7\n\t"
		    "movdqu %[l0l1],   %%xmm12\n\t"
		    "movdqu %[l1],     %%xmm13\n\t"
		    :
		    : [l0] "m" (*c->u_mode.ocb.L[0]),
		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
		      [l1] "m" (*c->u_mode.ocb.L[1])
		    : "memory" );

      for ( ;nblocks >= 8 ; nblocks -= 8 )
	{
	  n += 4;
	  l = aes_ocb_get_l(c, n);

	  /* xmm0 = L1 ^ L_{ntz(n)} : combined offset delta for block 3.  */
	  asm volatile ("movdqu %[l3],   %%xmm0\n\t"
			"pxor   %%xmm13, %%xmm0\n\t"
			:
			: [l3] "m" (*l)
			: "memory" );

	  n += 4;
	  l = aes_ocb_get_l(c, n);

	  /* xmm14 = L1 ^ L_{ntz(n)} : combined offset delta for block 7.  */
	  asm volatile ("movdqu %[l7],   %%xmm14\n\t"
			"pxor   %%xmm13, %%xmm14\n\t"
			:
			: [l7] "m" (*l)
			: "memory" );

	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
	  asm volatile ("movdqu %[abuf0],  %%xmm1\n\t"
			"movdqu %[abuf1],  %%xmm2\n\t"
			"movdqu %[abuf2],  %%xmm3\n\t"
			"movdqu %[abuf3],  %%xmm4\n\t"
			:
			: [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)),
			  [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)),
			  [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)),
			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
			: "memory" );
	  asm volatile ("movdqu %[abuf4],  %%xmm8\n\t"
			"movdqu %[abuf5],  %%xmm9\n\t"
			"movdqu %[abuf6],  %%xmm10\n\t"
			"movdqu %[abuf7],  %%xmm11\n\t"
			:
			: [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)),
			  [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)),
			  [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)),
			  [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
			:
			: "memory" );
	  /* XOR each block with its per-block offset and the first round
	     key; the whole-offset xmm5 is advanced at blocks 3 and 7.  */
	  asm volatile ("pxor   %%xmm7,    %%xmm1\n\t"
			"pxor   %%xmm5,    %%xmm1\n\t"

			"pxor   %%xmm12,   %%xmm2\n\t"
			"pxor   %%xmm5,    %%xmm2\n\t"

			"pxor   %%xmm13,   %%xmm3\n\t"
			"pxor   %%xmm5,    %%xmm3\n\t"

			"pxor   %%xmm0,    %%xmm5\n\t"
			"movdqa (%[key]),  %%xmm0\n\t"
			"pxor   %%xmm5,    %%xmm4\n\t"

			"pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
			"pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
			"pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
			"pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */

			"pxor   %%xmm7,    %%xmm8\n\t"
			"pxor   %%xmm5,    %%xmm8\n\t"

			"pxor   %%xmm12,   %%xmm9\n\t"
			"pxor   %%xmm5,    %%xmm9\n\t"

			"pxor   %%xmm13,   %%xmm10\n\t"
			"pxor   %%xmm5,    %%xmm10\n\t"

			"pxor   %%xmm14,   %%xmm5\n\t"
			"pxor   %%xmm5,    %%xmm11\n\t"

			"pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
			"pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
			"pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
			"pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
			:
			: [key] "r" (ctx->keyschenc)
			: "memory" );

	  do_aesni_enc_vec8 (ctx);

	  /* Apply the last round and fold all eight ciphertext blocks
	     into the running Sum (xmm6).  */
	  asm volatile (
			"aesenclast %%xmm0, %%xmm1\n\t"
			"aesenclast %%xmm0, %%xmm2\n\t"
			"aesenclast %%xmm0, %%xmm3\n\t"
			"aesenclast %%xmm0, %%xmm4\n\t"
			"aesenclast %%xmm0, %%xmm8\n\t"
			"aesenclast %%xmm0, %%xmm9\n\t"
			"aesenclast %%xmm0, %%xmm10\n\t"
			"aesenclast %%xmm0, %%xmm11\n\t"
			"pxor   %%xmm2,   %%xmm1\n\t"
			"pxor   %%xmm3,   %%xmm1\n\t"
			"pxor   %%xmm4,   %%xmm1\n\t"
			"pxor   %%xmm8,   %%xmm1\n\t"
			"pxor   %%xmm9,   %%xmm6\n\t"
			"pxor   %%xmm10,  %%xmm6\n\t"
			"pxor   %%xmm11,  %%xmm6\n\t"
			"pxor   %%xmm1,   %%xmm6\n\t"
			:
			:
			: "memory" );

	  abuf += 8*BLOCKSIZE;
	}

      aesni_cleanup_8_15();
    }
#endif

  /* Four-block-wide path (xmm0..xmm7 only).  */
  for ( ;nblocks >= 4 ; nblocks -= 4 )
    {
      n += 4;
      l = aes_ocb_get_l(c, n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
		    "movdqu %[abuf0],  %%xmm1\n\t"
		    "movdqu %[l0l1],   %%xmm3\n\t"
		    :
		    : [l0] "m" (*c->u_mode.ocb.L[0]),
		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
		      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("movdqu %[l1],     %%xmm4\n\t"
		    "movdqu %[l3],     %%xmm7\n\t"
		    "pxor   %%xmm5,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm1\n\t"
		    :
		    : [l1] "m" (*c->u_mode.ocb.L[1]),
		      [l3] "m" (*l)
		    : "memory" );
      asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
		    "pxor   %%xmm5,    %%xmm3\n\t"
		    "pxor   %%xmm3,    %%xmm2\n\t"
		    :
		    : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
		    "movdqu %[abuf2],  %%xmm3\n\t"
		    "pxor   %%xmm5,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm3\n\t"
		    :
		    : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
		    : "memory" );
      asm volatile ("pxor   %%xmm7,    %%xmm5\n\t"
		    "pxor   %%xmm4,    %%xmm5\n\t"
		    "movdqu %[abuf3],  %%xmm4\n\t"
		    "pxor   %%xmm5,    %%xmm4\n\t"
		    :
		    : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
		    : "memory" );

      do_aesni_enc_vec4 (ctx);

      /* Fold the four encrypted blocks into the running Sum.  */
      asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
		    "pxor   %%xmm2,   %%xmm6\n\t"
		    "pxor   %%xmm3,   %%xmm6\n\t"
		    "pxor   %%xmm4,   %%xmm6\n\t"
		    :
		    :
		    : "memory" );

      abuf += 4*BLOCKSIZE;
    }

  /* Remaining tail blocks, one at a time.  */
  for ( ;nblocks; nblocks-- )
    {
      l = aes_ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
      asm volatile ("movdqu %[l],     %%xmm1\n\t"
                    "movdqu %[abuf],  %%xmm0\n\t"
                    "pxor   %%xmm1,   %%xmm5\n\t"
                    "pxor   %%xmm5,   %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [abuf] "m" (*abuf)
                    : "memory" );

      do_aesni_enc (ctx);

      asm volatile ("pxor   %%xmm0,   %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += BLOCKSIZE;
    }

  /* Store the updated block count, Offset and Sum back to the context.  */
  c->u_mode.ocb.aad_nblocks = n;
  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
                "movdqu %%xmm6, %[ctr]\n\t"
                : [iv] "=m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "=m" (*c->u_mode.ocb.aad_sum)
                :
                : "memory" );

  aesni_cleanup ();
  aesni_cleanup_2_7 ();

  return 0;
}
3662 
3663 
/* Constant used by the XTS tweak-update code below.  The SIMD sequence
   doubles the 128-bit tweak (paddq) and, on carry-out of the high bit,
   XORs in the reduction value selected by AND-masking with this vector
   (low qword 0x87, i.e. the GF(2^128) polynomial x^128+x^7+x^2+x+1;
   high qword 0x01 propagates the low-qword carry).  Must stay 16-byte
   aligned because it is loaded with movdqa.  */
static const u64 xts_gfmul_const[2] __attribute__ ((aligned (16))) =
  { 0x87, 0x01 };
3666 
3667 
3668 static void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_enc(RIJNDAEL_context * ctx,unsigned char * tweak,unsigned char * outbuf,const unsigned char * inbuf,size_t nblocks)3669 _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
3670 			 unsigned char *outbuf, const unsigned char *inbuf,
3671 			 size_t nblocks)
3672 {
3673   aesni_prepare_2_7_variable;
3674 
3675   aesni_prepare ();
3676   aesni_prepare_2_7 ();
3677 
3678   /* Preload Tweak */
3679   asm volatile ("movdqu %[tweak], %%xmm5\n\t"
3680 		"movdqa %[gfmul], %%xmm6\n\t"
3681 		:
3682 		: [tweak] "m" (*tweak),
3683 		  [gfmul] "m" (*xts_gfmul_const)
3684 		: "memory" );
3685 
3686 #ifdef __x86_64__
3687   if (nblocks >= 8)
3688     {
3689       aesni_prepare_8_15_variable;
3690 
3691       aesni_prepare_8_15();
3692 
3693       for ( ;nblocks >= 8 ; nblocks -= 8 )
3694 	{
3695 	  asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm11\n\t"
3696 			"movdqu %[inbuf0], %%xmm1\n\t"
3697 			"pxor   %%xmm5,    %%xmm1\n\t"
3698 			"movdqa %%xmm5,    %%xmm7\n\t"
3699 
3700 			"movdqa %%xmm11,   %%xmm0\n\t"
3701 			"paddd  %%xmm11,   %%xmm11\n\t"
3702 			"psrad  $31,       %%xmm0\n\t"
3703 			"paddq  %%xmm5,    %%xmm5\n\t"
3704 			"pand   %%xmm6,    %%xmm0\n\t"
3705 			"pxor   %%xmm0,    %%xmm5\n\t"
3706 			:
3707 			: [inbuf0] "m" (*(inbuf + 0 * 16))
3708 			: "memory" );
3709 
3710 	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
3711 			"pxor   %%xmm5,    %%xmm2\n\t"
3712 			"movdqa %%xmm5,    %%xmm12\n\t"
3713 
3714 			"movdqa %%xmm11,   %%xmm0\n\t"
3715 			"paddd  %%xmm11,   %%xmm11\n\t"
3716 			"psrad  $31,       %%xmm0\n\t"
3717 			"paddq  %%xmm5,    %%xmm5\n\t"
3718 			"pand   %%xmm6,    %%xmm0\n\t"
3719 			"pxor   %%xmm0,    %%xmm5\n\t"
3720 			:
3721 			: [inbuf1] "m" (*(inbuf + 1 * 16))
3722 			: "memory" );
3723 
3724 	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
3725 			"pxor   %%xmm5,    %%xmm3\n\t"
3726 			"movdqa %%xmm5,    %%xmm13\n\t"
3727 
3728 			"movdqa %%xmm11,   %%xmm0\n\t"
3729 			"paddd  %%xmm11,   %%xmm11\n\t"
3730 			"psrad  $31,       %%xmm0\n\t"
3731 			"paddq  %%xmm5,    %%xmm5\n\t"
3732 			"pand   %%xmm6,    %%xmm0\n\t"
3733 			"pxor   %%xmm0,    %%xmm5\n\t"
3734 			:
3735 			: [inbuf2] "m" (*(inbuf + 2 * 16))
3736 			: "memory" );
3737 
3738 	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
3739 			"pxor   %%xmm5,    %%xmm4\n\t"
3740 			"movdqa %%xmm5,    %%xmm14\n\t"
3741 
3742 			"movdqa %%xmm11,   %%xmm0\n\t"
3743 			"paddd  %%xmm11,   %%xmm11\n\t"
3744 			"psrad  $31,       %%xmm0\n\t"
3745 			"paddq  %%xmm5,    %%xmm5\n\t"
3746 			"pand   %%xmm6,    %%xmm0\n\t"
3747 			"pxor   %%xmm0,    %%xmm5\n\t"
3748 			:
3749 			: [inbuf3] "m" (*(inbuf + 3 * 16))
3750 			: "memory" );
3751 
3752 	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
3753 			"pxor   %%xmm5,    %%xmm8\n\t"
3754 			"movdqa %%xmm5,    %%xmm15\n\t"
3755 
3756 			"movdqa %%xmm11,   %%xmm0\n\t"
3757 			"paddd  %%xmm11,   %%xmm11\n\t"
3758 			"psrad  $31,       %%xmm0\n\t"
3759 			"paddq  %%xmm5,    %%xmm5\n\t"
3760 			"pand   %%xmm6,    %%xmm0\n\t"
3761 			"pxor   %%xmm0,    %%xmm5\n\t"
3762 			:
3763 			: [inbuf4] "m" (*(inbuf + 4 * 16))
3764 			: "memory" );
3765 
3766 	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
3767 			"pxor   %%xmm5,    %%xmm9\n\t"
3768 			"movdqu %%xmm5,    %[outbuf5]\n\t"
3769 
3770 			"movdqa %%xmm11,   %%xmm0\n\t"
3771 			"paddd  %%xmm11,   %%xmm11\n\t"
3772 			"psrad  $31,       %%xmm0\n\t"
3773 			"paddq  %%xmm5,    %%xmm5\n\t"
3774 			"pand   %%xmm6,    %%xmm0\n\t"
3775 			"pxor   %%xmm0,    %%xmm5\n\t"
3776 			: [outbuf5] "=m" (*(outbuf + 5 * 16))
3777 			: [inbuf5] "m" (*(inbuf + 5 * 16))
3778 			: "memory" );
3779 
3780 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
3781 			"pxor   %%xmm5,    %%xmm10\n\t"
3782 			"movdqu %%xmm5,    %[outbuf6]\n\t"
3783 
3784 			"movdqa %%xmm11,   %%xmm0\n\t"
3785 			"paddd  %%xmm11,   %%xmm11\n\t"
3786 			"psrad  $31,       %%xmm0\n\t"
3787 			"paddq  %%xmm5,    %%xmm5\n\t"
3788 			"pand   %%xmm6,    %%xmm0\n\t"
3789 			"pxor   %%xmm0,    %%xmm5\n\t"
3790 			: [outbuf6] "=m" (*(outbuf + 6 * 16))
3791 			: [inbuf6] "m" (*(inbuf + 6 * 16))
3792 			: "memory" );
3793 
3794 	  asm volatile ("movdqa %%xmm11,   %%xmm0\n\t"
3795 			"movdqu %[inbuf7], %%xmm11\n\t"
3796 			"pxor   %%xmm5,    %%xmm11\n\t"
3797 			"movdqu %%xmm5,    %[outbuf7]\n\t"
3798 
3799 			"psrad  $31,       %%xmm0\n\t"
3800 			"paddq  %%xmm5,    %%xmm5\n\t"
3801 			"pand   %%xmm6,    %%xmm0\n\t"
3802 			"pxor   %%xmm0,    %%xmm5\n\t"
3803 			: [outbuf7] "=m" (*(outbuf + 7 * 16))
3804 			: [inbuf7] "m" (*(inbuf + 7 * 16))
3805 			: "memory" );
3806 
3807 	  asm volatile ("cmpl $12, %[rounds]\n\t"
3808 			"movdqa (%[key]), %%xmm0\n\t"
3809 			"pxor %%xmm0, %%xmm1\n\t"
3810 			"pxor %%xmm0, %%xmm2\n\t"
3811 			"pxor %%xmm0, %%xmm3\n\t"
3812 			"pxor %%xmm0, %%xmm4\n\t"
3813 			"pxor %%xmm0, %%xmm8\n\t"
3814 			"pxor %%xmm0, %%xmm9\n\t"
3815 			"pxor %%xmm0, %%xmm10\n\t"
3816 			"pxor %%xmm0, %%xmm11\n\t"
3817 			"movdqa 0x10(%[key]), %%xmm0\n\t"
3818 			"aesenc %%xmm0, %%xmm1\n\t"
3819 			"aesenc %%xmm0, %%xmm2\n\t"
3820 			"aesenc %%xmm0, %%xmm3\n\t"
3821 			"aesenc %%xmm0, %%xmm4\n\t"
3822 			"aesenc %%xmm0, %%xmm8\n\t"
3823 			"aesenc %%xmm0, %%xmm9\n\t"
3824 			"aesenc %%xmm0, %%xmm10\n\t"
3825 			"aesenc %%xmm0, %%xmm11\n\t"
3826 			"movdqa 0x20(%[key]), %%xmm0\n\t"
3827 			"aesenc %%xmm0, %%xmm1\n\t"
3828 			"aesenc %%xmm0, %%xmm2\n\t"
3829 			"aesenc %%xmm0, %%xmm3\n\t"
3830 			"aesenc %%xmm0, %%xmm4\n\t"
3831 			"aesenc %%xmm0, %%xmm8\n\t"
3832 			"aesenc %%xmm0, %%xmm9\n\t"
3833 			"aesenc %%xmm0, %%xmm10\n\t"
3834 			"aesenc %%xmm0, %%xmm11\n\t"
3835 			"movdqa 0x30(%[key]), %%xmm0\n\t"
3836 			"aesenc %%xmm0, %%xmm1\n\t"
3837 			"aesenc %%xmm0, %%xmm2\n\t"
3838 			"aesenc %%xmm0, %%xmm3\n\t"
3839 			"aesenc %%xmm0, %%xmm4\n\t"
3840 			"aesenc %%xmm0, %%xmm8\n\t"
3841 			"aesenc %%xmm0, %%xmm9\n\t"
3842 			"aesenc %%xmm0, %%xmm10\n\t"
3843 			"aesenc %%xmm0, %%xmm11\n\t"
3844 			"movdqa 0x40(%[key]), %%xmm0\n\t"
3845 			"aesenc %%xmm0, %%xmm1\n\t"
3846 			"aesenc %%xmm0, %%xmm2\n\t"
3847 			"aesenc %%xmm0, %%xmm3\n\t"
3848 			"aesenc %%xmm0, %%xmm4\n\t"
3849 			"aesenc %%xmm0, %%xmm8\n\t"
3850 			"aesenc %%xmm0, %%xmm9\n\t"
3851 			"aesenc %%xmm0, %%xmm10\n\t"
3852 			"aesenc %%xmm0, %%xmm11\n\t"
3853 			"movdqa 0x50(%[key]), %%xmm0\n\t"
3854 			"aesenc %%xmm0, %%xmm1\n\t"
3855 			"aesenc %%xmm0, %%xmm2\n\t"
3856 			"aesenc %%xmm0, %%xmm3\n\t"
3857 			"aesenc %%xmm0, %%xmm4\n\t"
3858 			"aesenc %%xmm0, %%xmm8\n\t"
3859 			"aesenc %%xmm0, %%xmm9\n\t"
3860 			"aesenc %%xmm0, %%xmm10\n\t"
3861 			"aesenc %%xmm0, %%xmm11\n\t"
3862 			"movdqa 0x60(%[key]), %%xmm0\n\t"
3863 			"aesenc %%xmm0, %%xmm1\n\t"
3864 			"aesenc %%xmm0, %%xmm2\n\t"
3865 			"aesenc %%xmm0, %%xmm3\n\t"
3866 			"aesenc %%xmm0, %%xmm4\n\t"
3867 			"aesenc %%xmm0, %%xmm8\n\t"
3868 			"aesenc %%xmm0, %%xmm9\n\t"
3869 			"aesenc %%xmm0, %%xmm10\n\t"
3870 			"aesenc %%xmm0, %%xmm11\n\t"
3871 			"movdqa 0x70(%[key]), %%xmm0\n\t"
3872 			"aesenc %%xmm0, %%xmm1\n\t"
3873 			"aesenc %%xmm0, %%xmm2\n\t"
3874 			"aesenc %%xmm0, %%xmm3\n\t"
3875 			"aesenc %%xmm0, %%xmm4\n\t"
3876 			"aesenc %%xmm0, %%xmm8\n\t"
3877 			"aesenc %%xmm0, %%xmm9\n\t"
3878 			"aesenc %%xmm0, %%xmm10\n\t"
3879 			"aesenc %%xmm0, %%xmm11\n\t"
3880 			"movdqa 0x80(%[key]), %%xmm0\n\t"
3881 			"aesenc %%xmm0, %%xmm1\n\t"
3882 			"aesenc %%xmm0, %%xmm2\n\t"
3883 			"aesenc %%xmm0, %%xmm3\n\t"
3884 			"aesenc %%xmm0, %%xmm4\n\t"
3885 			"aesenc %%xmm0, %%xmm8\n\t"
3886 			"aesenc %%xmm0, %%xmm9\n\t"
3887 			"aesenc %%xmm0, %%xmm10\n\t"
3888 			"aesenc %%xmm0, %%xmm11\n\t"
3889 			"movdqa 0x90(%[key]), %%xmm0\n\t"
3890 			"aesenc %%xmm0, %%xmm1\n\t"
3891 			"aesenc %%xmm0, %%xmm2\n\t"
3892 			"aesenc %%xmm0, %%xmm3\n\t"
3893 			"aesenc %%xmm0, %%xmm4\n\t"
3894 			"aesenc %%xmm0, %%xmm8\n\t"
3895 			"aesenc %%xmm0, %%xmm9\n\t"
3896 			"aesenc %%xmm0, %%xmm10\n\t"
3897 			"aesenc %%xmm0, %%xmm11\n\t"
3898 			"movdqa 0xa0(%[key]), %%xmm0\n\t"
3899 			"jb .Lenclast%=\n\t"
3900 			"aesenc %%xmm0, %%xmm1\n\t"
3901 			"aesenc %%xmm0, %%xmm2\n\t"
3902 			"aesenc %%xmm0, %%xmm3\n\t"
3903 			"aesenc %%xmm0, %%xmm4\n\t"
3904 			"aesenc %%xmm0, %%xmm8\n\t"
3905 			"aesenc %%xmm0, %%xmm9\n\t"
3906 			"aesenc %%xmm0, %%xmm10\n\t"
3907 			"aesenc %%xmm0, %%xmm11\n\t"
3908 			"movdqa 0xb0(%[key]), %%xmm0\n\t"
3909 			"aesenc %%xmm0, %%xmm1\n\t"
3910 			"aesenc %%xmm0, %%xmm2\n\t"
3911 			"aesenc %%xmm0, %%xmm3\n\t"
3912 			"aesenc %%xmm0, %%xmm4\n\t"
3913 			"aesenc %%xmm0, %%xmm8\n\t"
3914 			"aesenc %%xmm0, %%xmm9\n\t"
3915 			"aesenc %%xmm0, %%xmm10\n\t"
3916 			"aesenc %%xmm0, %%xmm11\n\t"
3917 			"movdqa 0xc0(%[key]), %%xmm0\n\t"
3918 			"je .Lenclast%=\n\t"
3919 			"aesenc %%xmm0, %%xmm1\n\t"
3920 			"aesenc %%xmm0, %%xmm2\n\t"
3921 			"aesenc %%xmm0, %%xmm3\n\t"
3922 			"aesenc %%xmm0, %%xmm4\n\t"
3923 			"aesenc %%xmm0, %%xmm8\n\t"
3924 			"aesenc %%xmm0, %%xmm9\n\t"
3925 			"aesenc %%xmm0, %%xmm10\n\t"
3926 			"aesenc %%xmm0, %%xmm11\n\t"
3927 			"movdqa 0xd0(%[key]), %%xmm0\n\t"
3928 			"aesenc %%xmm0, %%xmm1\n\t"
3929 			"aesenc %%xmm0, %%xmm2\n\t"
3930 			"aesenc %%xmm0, %%xmm3\n\t"
3931 			"aesenc %%xmm0, %%xmm4\n\t"
3932 			"aesenc %%xmm0, %%xmm8\n\t"
3933 			"aesenc %%xmm0, %%xmm9\n\t"
3934 			"aesenc %%xmm0, %%xmm10\n\t"
3935 			"aesenc %%xmm0, %%xmm11\n\t"
3936 			"movdqa 0xe0(%[key]), %%xmm0\n\t"
3937 
3938 			".Lenclast%=:\n\t"
3939 			:
3940 			: [key] "r" (ctx->keyschenc),
3941 			  [rounds] "rm" (ctx->rounds)
3942 			: "cc", "memory");
3943 
3944 	  asm volatile ("pxor %%xmm0, %%xmm7\n\t"
3945 			"pxor %%xmm0, %%xmm12\n\t"
3946 			"pxor %%xmm0, %%xmm13\n\t"
3947 			"pxor %%xmm0, %%xmm14\n\t"
3948 			"aesenclast %%xmm7, %%xmm1\n\t"
3949 			"aesenclast %%xmm12, %%xmm2\n\t"
3950 			"aesenclast %%xmm13, %%xmm3\n\t"
3951 			"aesenclast %%xmm14, %%xmm4\n\t"
3952 			"movdqu 5*16(%[outbuf]), %%xmm12\n\t"
3953 			"movdqu 6*16(%[outbuf]), %%xmm13\n\t"
3954 			"movdqu 7*16(%[outbuf]), %%xmm14\n\t"
3955 			"pxor %%xmm0, %%xmm15\n\t"
3956 			"pxor %%xmm0, %%xmm12\n\t"
3957 			"pxor %%xmm0, %%xmm13\n\t"
3958 			"pxor %%xmm0, %%xmm14\n\t"
3959 			"aesenclast %%xmm15, %%xmm8\n\t"
3960 			"aesenclast %%xmm12, %%xmm9\n\t"
3961 			"aesenclast %%xmm13, %%xmm10\n\t"
3962 			"aesenclast %%xmm14, %%xmm11\n\t"
3963 			"movdqu %%xmm1, 0*16(%[outbuf])\n\t"
3964 			"movdqu %%xmm2, 1*16(%[outbuf])\n\t"
3965 			"movdqu %%xmm3, 2*16(%[outbuf])\n\t"
3966 			"movdqu %%xmm4, 3*16(%[outbuf])\n\t"
3967 			"movdqu %%xmm8, 4*16(%[outbuf])\n\t"
3968 			"movdqu %%xmm9, 5*16(%[outbuf])\n\t"
3969 			"movdqu %%xmm10, 6*16(%[outbuf])\n\t"
3970 			"movdqu %%xmm11, 7*16(%[outbuf])\n\t"
3971 			:
3972 			: [outbuf] "r" (outbuf)
3973 			: "memory" );
3974 
3975 	  outbuf += 8*BLOCKSIZE;
3976 	  inbuf  += 8*BLOCKSIZE;
3977 	}
3978 
3979       aesni_cleanup_8_15();
3980     }
3981 #endif
3982 
3983   for ( ;nblocks >= 4; nblocks -= 4 )
3984     {
3985       asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
3986 		    "movdqu %[inbuf0], %%xmm1\n\t"
3987 		    "pxor   %%xmm5,    %%xmm1\n\t"
3988 		    "movdqu %%xmm5,    %[outbuf0]\n\t"
3989 
3990 		    "movdqa %%xmm4,    %%xmm0\n\t"
3991 		    "paddd  %%xmm4,    %%xmm4\n\t"
3992 		    "psrad  $31,       %%xmm0\n\t"
3993 		    "paddq  %%xmm5,    %%xmm5\n\t"
3994 		    "pand   %%xmm6,    %%xmm0\n\t"
3995 		    "pxor   %%xmm0,    %%xmm5\n\t"
3996 		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
3997 		    : [inbuf0] "m" (*(inbuf + 0 * 16))
3998 		    : "memory" );
3999 
4000       asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
4001 		    "pxor   %%xmm5,    %%xmm2\n\t"
4002 		    "movdqu %%xmm5,    %[outbuf1]\n\t"
4003 
4004 		    "movdqa %%xmm4,    %%xmm0\n\t"
4005 		    "paddd  %%xmm4,    %%xmm4\n\t"
4006 		    "psrad  $31,       %%xmm0\n\t"
4007 		    "paddq  %%xmm5,    %%xmm5\n\t"
4008 		    "pand   %%xmm6,    %%xmm0\n\t"
4009 		    "pxor   %%xmm0,    %%xmm5\n\t"
4010 		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
4011 		    : [inbuf1] "m" (*(inbuf + 1 * 16))
4012 		    : "memory" );
4013 
4014       asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
4015 		    "pxor   %%xmm5,    %%xmm3\n\t"
4016 		    "movdqu %%xmm5,    %[outbuf2]\n\t"
4017 
4018 		    "movdqa %%xmm4,    %%xmm0\n\t"
4019 		    "paddd  %%xmm4,    %%xmm4\n\t"
4020 		    "psrad  $31,       %%xmm0\n\t"
4021 		    "paddq  %%xmm5,    %%xmm5\n\t"
4022 		    "pand   %%xmm6,    %%xmm0\n\t"
4023 		    "pxor   %%xmm0,    %%xmm5\n\t"
4024 		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
4025 		    : [inbuf2] "m" (*(inbuf + 2 * 16))
4026 		    : "memory" );
4027 
4028       asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
4029 		    "movdqu %[inbuf3], %%xmm4\n\t"
4030 		    "pxor   %%xmm5,    %%xmm4\n\t"
4031 		    "movdqu %%xmm5,    %[outbuf3]\n\t"
4032 
4033 		    "psrad  $31,       %%xmm0\n\t"
4034 		    "paddq  %%xmm5,    %%xmm5\n\t"
4035 		    "pand   %%xmm6,    %%xmm0\n\t"
4036 		    "pxor   %%xmm0,    %%xmm5\n\t"
4037 		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
4038 		    : [inbuf3] "m" (*(inbuf + 3 * 16))
4039 		    : "memory" );
4040 
4041       do_aesni_enc_vec4 (ctx);
4042 
4043       asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
4044                     "pxor   %%xmm0,     %%xmm1\n\t"
4045 		    "movdqu %[outbuf1], %%xmm0\n\t"
4046 		    "movdqu %%xmm1,     %[outbuf0]\n\t"
4047 		    "movdqu %[outbuf2], %%xmm1\n\t"
4048                     "pxor   %%xmm0,     %%xmm2\n\t"
4049 		    "movdqu %[outbuf3], %%xmm0\n\t"
4050                     "pxor   %%xmm1,     %%xmm3\n\t"
4051                     "pxor   %%xmm0,     %%xmm4\n\t"
4052 		    "movdqu %%xmm2,     %[outbuf1]\n\t"
4053 		    "movdqu %%xmm3,     %[outbuf2]\n\t"
4054 		    "movdqu %%xmm4,     %[outbuf3]\n\t"
4055 		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
4056 		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
4057 		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
4058 		      [outbuf3] "+m" (*(outbuf + 3 * 16))
4059 		    :
4060 		    : "memory" );
4061 
4062       outbuf += BLOCKSIZE * 4;
4063       inbuf += BLOCKSIZE * 4;
4064     }
4065 
4066   for ( ;nblocks; nblocks-- )
4067     {
4068       asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
4069 		    "pxor   %%xmm5,    %%xmm0\n\t"
4070 		    "movdqa %%xmm5,    %%xmm4\n\t"
4071 
4072 		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
4073 		    "psrad  $31,       %%xmm1\n\t"
4074 		    "paddq  %%xmm5,    %%xmm5\n\t"
4075 		    "pand   %%xmm6,    %%xmm1\n\t"
4076 		    "pxor   %%xmm1,    %%xmm5\n\t"
4077 		    :
4078 		    : [inbuf] "m" (*inbuf)
4079 		    : "memory" );
4080 
4081       do_aesni_enc (ctx);
4082 
4083       asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
4084 		    "movdqu %%xmm0,    %[outbuf]\n\t"
4085 		    : [outbuf] "=m" (*outbuf)
4086 		    :
4087 		    : "memory" );
4088 
4089       outbuf += BLOCKSIZE;
4090       inbuf += BLOCKSIZE;
4091     }
4092 
4093   asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
4094 		: [tweak] "=m" (*tweak)
4095 		:
4096 		: "memory" );
4097 
4098   aesni_cleanup ();
4099   aesni_cleanup_2_7 ();
4100 }
4101 
4102 
/* Decrypt NBLOCKS 16-byte blocks from INBUF to OUTBUF in AES-XTS mode
 * using AES-NI instructions.  TWEAK points to the current 128-bit XTS
 * tweak; the tweak for the block following the last one processed is
 * written back, so a larger request may be split over several calls.
 *
 * Register conventions shared by the asm statements below and the
 * do_aesni_* helpers:
 *   xmm5 - current tweak
 *   xmm6 - XTS GF(2^128) reduction constant (xts_gfmul_const)
 * Each tweak update doubles xmm5 (paddq) and conditionally xors in the
 * reduction constant, i.e. multiplies the tweak by x in GF(2^128); the
 * pshufd/psrad $31 sequence builds the carry mask for that conditional
 * reduction.  */
static void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
			 unsigned char *outbuf, const unsigned char *inbuf,
			 size_t nblocks)
{
  aesni_prepare_2_7_variable;

  aesni_prepare ();
  aesni_prepare_2_7 ();

  /* Decryption needs the inverse key schedule; derive it lazily on
     first use.  */
  if ( !ctx->decryption_prepared )
    {
      do_aesni_prepare_decryption ( ctx );
      ctx->decryption_prepared = 1;
    }

  /* Preload Tweak */
  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
		"movdqa %[gfmul], %%xmm6\n\t"
		:
		: [tweak] "m" (*tweak),
		  [gfmul] "m" (*xts_gfmul_const)
		: "memory" );

#ifdef __x86_64__
  /* Eight-way parallel path; needs xmm8-xmm15 and thus 64-bit mode.  */
  if (nblocks >= 8)
    {
      aesni_prepare_8_15_variable;

      aesni_prepare_8_15();

      for ( ;nblocks >= 8 ; nblocks -= 8 )
	{
	  /* Load the eight ciphertext blocks into xmm1-xmm4/xmm8-xmm11,
	     xoring each with its tweak.  The per-block tweaks are kept
	     for the final xor: blocks 0-4 in xmm7/xmm12-xmm15, blocks
	     5-7 spilled to their (not yet written) OUTBUF slots.  xmm11
	     starts as the tweak's shuffled sign dwords (pshufd $0x13)
	     and is doubled (paddd) in step with the tweak so that
	     psrad $31 of it yields the carry mask for each GF(2^128)
	     doubling without reloading xmm5.  */
	  asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm11\n\t"
			"movdqu %[inbuf0], %%xmm1\n\t"
			"pxor   %%xmm5,    %%xmm1\n\t"
			"movdqa %%xmm5,    %%xmm7\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			:
			: [inbuf0] "m" (*(inbuf + 0 * 16))
			: "memory" );

	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
			"pxor   %%xmm5,    %%xmm2\n\t"
			"movdqa %%xmm5,    %%xmm12\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			:
			: [inbuf1] "m" (*(inbuf + 1 * 16))
			: "memory" );

	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
			"pxor   %%xmm5,    %%xmm3\n\t"
			"movdqa %%xmm5,    %%xmm13\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			:
			: [inbuf2] "m" (*(inbuf + 2 * 16))
			: "memory" );

	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
			"pxor   %%xmm5,    %%xmm4\n\t"
			"movdqa %%xmm5,    %%xmm14\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			:
			: [inbuf3] "m" (*(inbuf + 3 * 16))
			: "memory" );

	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
			"pxor   %%xmm5,    %%xmm8\n\t"
			"movdqa %%xmm5,    %%xmm15\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			:
			: [inbuf4] "m" (*(inbuf + 4 * 16))
			: "memory" );

	  /* From block 5 on there is no free register; park the tweak
	     in the output buffer until the final combine step.  */
	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
			"pxor   %%xmm5,    %%xmm9\n\t"
			"movdqu %%xmm5,    %[outbuf5]\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			: [outbuf5] "=m" (*(outbuf + 5 * 16))
			: [inbuf5] "m" (*(inbuf + 5 * 16))
			: "memory" );

	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
			"pxor   %%xmm5,    %%xmm10\n\t"
			"movdqu %%xmm5,    %[outbuf6]\n\t"

			"movdqa %%xmm11,   %%xmm0\n\t"
			"paddd  %%xmm11,   %%xmm11\n\t"
			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			: [outbuf6] "=m" (*(outbuf + 6 * 16))
			: [inbuf6] "m" (*(inbuf + 6 * 16))
			: "memory" );

	  /* Last block reuses xmm11 as data register; its carry mask is
	     taken out first.  */
	  asm volatile ("movdqa %%xmm11,   %%xmm0\n\t"
			"movdqu %[inbuf7], %%xmm11\n\t"
			"pxor   %%xmm5,    %%xmm11\n\t"
			"movdqu %%xmm5,    %[outbuf7]\n\t"

			"psrad  $31,       %%xmm0\n\t"
			"paddq  %%xmm5,    %%xmm5\n\t"
			"pand   %%xmm6,    %%xmm0\n\t"
			"pxor   %%xmm0,    %%xmm5\n\t"
			: [outbuf7] "=m" (*(outbuf + 7 * 16))
			: [inbuf7] "m" (*(inbuf + 7 * 16))
			: "memory" );

	  /* Run the AES decryption rounds on all eight blocks with the
	     inverse round keys from ctx->keyschdec.  After ten rounds
	     the code branches to .Ldeclast for AES-128 (rounds < 12,
	     `jb`), after twelve rounds for AES-192 (rounds == 12,
	     `je`); AES-256 runs all fourteen.  On exit xmm0 holds the
	     final round key, consumed by the next asm statement.  */
	  asm volatile ("cmpl $12, %[rounds]\n\t"
			"movdqa (%[key]), %%xmm0\n\t"
			"pxor %%xmm0, %%xmm1\n\t"
			"pxor %%xmm0, %%xmm2\n\t"
			"pxor %%xmm0, %%xmm3\n\t"
			"pxor %%xmm0, %%xmm4\n\t"
			"pxor %%xmm0, %%xmm8\n\t"
			"pxor %%xmm0, %%xmm9\n\t"
			"pxor %%xmm0, %%xmm10\n\t"
			"pxor %%xmm0, %%xmm11\n\t"
			"movdqa 0x10(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x20(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x30(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x40(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x50(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x60(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x70(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x80(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0x90(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0xa0(%[key]), %%xmm0\n\t"
			"jb .Ldeclast%=\n\t" /* AES-128 done after 10 rounds.  */
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0xb0(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0xc0(%[key]), %%xmm0\n\t"
			"je .Ldeclast%=\n\t" /* AES-192 done after 12 rounds.  */
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0xd0(%[key]), %%xmm0\n\t"
			"aesdec %%xmm0, %%xmm1\n\t"
			"aesdec %%xmm0, %%xmm2\n\t"
			"aesdec %%xmm0, %%xmm3\n\t"
			"aesdec %%xmm0, %%xmm4\n\t"
			"aesdec %%xmm0, %%xmm8\n\t"
			"aesdec %%xmm0, %%xmm9\n\t"
			"aesdec %%xmm0, %%xmm10\n\t"
			"aesdec %%xmm0, %%xmm11\n\t"
			"movdqa 0xe0(%[key]), %%xmm0\n\t"

			".Ldeclast%=:\n\t"
			:
			: [key] "r" (ctx->keyschdec),
			  [rounds] "rm" (ctx->rounds)
			: "cc", "memory");

	  /* Combine the last AES round with the XTS output whitening:
	     each saved tweak is xored with the final round key (xmm0)
	     and passed to aesdeclast as the round key, so the result
	     already carries the tweak.  Tweaks for blocks 5-7 are
	     reloaded from their OUTBUF slots before being overwritten
	     with the plaintext.  */
	  asm volatile ("pxor %%xmm0, %%xmm7\n\t"
			"pxor %%xmm0, %%xmm12\n\t"
			"pxor %%xmm0, %%xmm13\n\t"
			"pxor %%xmm0, %%xmm14\n\t"
			"aesdeclast %%xmm7, %%xmm1\n\t"
			"aesdeclast %%xmm12, %%xmm2\n\t"
			"aesdeclast %%xmm13, %%xmm3\n\t"
			"aesdeclast %%xmm14, %%xmm4\n\t"
			"movdqu 5*16(%[outbuf]), %%xmm12\n\t"
			"movdqu 6*16(%[outbuf]), %%xmm13\n\t"
			"movdqu 7*16(%[outbuf]), %%xmm14\n\t"
			"pxor %%xmm0, %%xmm15\n\t"
			"pxor %%xmm0, %%xmm12\n\t"
			"pxor %%xmm0, %%xmm13\n\t"
			"pxor %%xmm0, %%xmm14\n\t"
			"aesdeclast %%xmm15, %%xmm8\n\t"
			"aesdeclast %%xmm12, %%xmm9\n\t"
			"aesdeclast %%xmm13, %%xmm10\n\t"
			"aesdeclast %%xmm14, %%xmm11\n\t"
			"movdqu %%xmm1, 0*16(%[outbuf])\n\t"
			"movdqu %%xmm2, 1*16(%[outbuf])\n\t"
			"movdqu %%xmm3, 2*16(%[outbuf])\n\t"
			"movdqu %%xmm4, 3*16(%[outbuf])\n\t"
			"movdqu %%xmm8, 4*16(%[outbuf])\n\t"
			"movdqu %%xmm9, 5*16(%[outbuf])\n\t"
			"movdqu %%xmm10, 6*16(%[outbuf])\n\t"
			"movdqu %%xmm11, 7*16(%[outbuf])\n\t"
			:
			: [outbuf] "r" (outbuf)
			: "memory" );

	  outbuf += 8*BLOCKSIZE;
	  inbuf  += 8*BLOCKSIZE;
	}

      /* Wipe the extra registers used by the 8-way path.  */
      aesni_cleanup_8_15();
    }
#endif

  /* Four-way path: the per-block tweaks are spilled to the OUTBUF
     slots while xmm1-xmm4 hold the blocks, then reloaded and xored in
     after decryption.  xmm4 doubles as the carry-mask tracker (pshufd
     $0x13 of the tweak, doubled with paddd) until block 3 claims it.  */
  for ( ;nblocks >= 4; nblocks -= 4 )
    {
      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
		    "movdqu %[inbuf0], %%xmm1\n\t"
		    "pxor   %%xmm5,    %%xmm1\n\t"
		    "movdqu %%xmm5,    %[outbuf0]\n\t"

		    "movdqa %%xmm4,    %%xmm0\n\t"
		    "paddd  %%xmm4,    %%xmm4\n\t"
		    "psrad  $31,       %%xmm0\n\t"
		    "paddq  %%xmm5,    %%xmm5\n\t"
		    "pand   %%xmm6,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm5\n\t"
		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
		    : [inbuf0] "m" (*(inbuf + 0 * 16))
		    : "memory" );

      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
		    "pxor   %%xmm5,    %%xmm2\n\t"
		    "movdqu %%xmm5,    %[outbuf1]\n\t"

		    "movdqa %%xmm4,    %%xmm0\n\t"
		    "paddd  %%xmm4,    %%xmm4\n\t"
		    "psrad  $31,       %%xmm0\n\t"
		    "paddq  %%xmm5,    %%xmm5\n\t"
		    "pand   %%xmm6,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm5\n\t"
		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
		    : [inbuf1] "m" (*(inbuf + 1 * 16))
		    : "memory" );

      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
		    "pxor   %%xmm5,    %%xmm3\n\t"
		    "movdqu %%xmm5,    %[outbuf2]\n\t"

		    "movdqa %%xmm4,    %%xmm0\n\t"
		    "paddd  %%xmm4,    %%xmm4\n\t"
		    "psrad  $31,       %%xmm0\n\t"
		    "paddq  %%xmm5,    %%xmm5\n\t"
		    "pand   %%xmm6,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm5\n\t"
		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
		    : [inbuf2] "m" (*(inbuf + 2 * 16))
		    : "memory" );

      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
		    "movdqu %[inbuf3], %%xmm4\n\t"
		    "pxor   %%xmm5,    %%xmm4\n\t"
		    "movdqu %%xmm5,    %[outbuf3]\n\t"

		    "psrad  $31,       %%xmm0\n\t"
		    "paddq  %%xmm5,    %%xmm5\n\t"
		    "pand   %%xmm6,    %%xmm0\n\t"
		    "pxor   %%xmm0,    %%xmm5\n\t"
		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
		    : [inbuf3] "m" (*(inbuf + 3 * 16))
		    : "memory" );

      /* Decrypt xmm1-xmm4 in place.  */
      do_aesni_dec_vec4 (ctx);

      /* Reload the spilled tweaks, apply the output whitening and
	 store the plaintext.  */
      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
                    "pxor   %%xmm0,     %%xmm1\n\t"
		    "movdqu %[outbuf1], %%xmm0\n\t"
		    "movdqu %%xmm1,     %[outbuf0]\n\t"
		    "movdqu %[outbuf2], %%xmm1\n\t"
                    "pxor   %%xmm0,     %%xmm2\n\t"
		    "movdqu %[outbuf3], %%xmm0\n\t"
                    "pxor   %%xmm1,     %%xmm3\n\t"
                    "pxor   %%xmm0,     %%xmm4\n\t"
		    "movdqu %%xmm2,     %[outbuf1]\n\t"
		    "movdqu %%xmm3,     %[outbuf2]\n\t"
		    "movdqu %%xmm4,     %[outbuf3]\n\t"
		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
		      [outbuf3] "+m" (*(outbuf + 3 * 16))
		    :
		    : "memory" );

      outbuf += BLOCKSIZE * 4;
      inbuf += BLOCKSIZE * 4;
    }

  /* Process remaining blocks one at a time.  xmm4 keeps a copy of the
     current tweak across do_aesni_dec, which decrypts the block held
     in xmm0; xmm5 is advanced to the next tweak meanwhile.  */
  for ( ;nblocks; nblocks-- )
    {
      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
		    "pxor   %%xmm5,    %%xmm0\n\t"
		    "movdqa %%xmm5,    %%xmm4\n\t"

		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
		    "psrad  $31,       %%xmm1\n\t"
		    "paddq  %%xmm5,    %%xmm5\n\t"
		    "pand   %%xmm6,    %%xmm1\n\t"
		    "pxor   %%xmm1,    %%xmm5\n\t"
		    :
		    : [inbuf] "m" (*inbuf)
		    : "memory" );

      do_aesni_dec (ctx);

      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
		    "movdqu %%xmm0,    %[outbuf]\n\t"
		    : [outbuf] "=m" (*outbuf)
		    :
		    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf += BLOCKSIZE;
    }

  /* Write back the tweak so the caller can continue the operation.  */
  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
                : [tweak] "=m" (*tweak)
                :
                : "memory" );

  /* Clear sensitive material from the XMM registers used above.  */
  aesni_cleanup ();
  aesni_cleanup_2_7 ();
}
4542 
4543 
4544 void ASM_FUNC_ATTR
_gcry_aes_aesni_xts_crypt(RIJNDAEL_context * ctx,unsigned char * tweak,unsigned char * outbuf,const unsigned char * inbuf,size_t nblocks,int encrypt)4545 _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
4546 			   unsigned char *outbuf, const unsigned char *inbuf,
4547 			   size_t nblocks, int encrypt)
4548 {
4549   if (encrypt)
4550     _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
4551   else
4552     _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
4553 }
4554 
4555 #if __clang__
4556 #  pragma clang attribute pop
4557 #endif
4558 
4559 #endif /* USE_AESNI */
4560