/* rijndael-amd64.S  -  AMD64 assembly implementation of AES cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES)

#include "asm-common-amd64.h"

.text

/* table macros */
#define E0	(0)
#define Es0	(1)
#define Esize	4
#define Essize	4

#define D0	(0)
#define Ds0	(4 * 256)
#define Dsize	4
#define Dssize	1
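
/* Layout of the caller-supplied lookup tables implied by the offsets above
 * (an inference from how the macros below index RTAB, not taken from the
 * table definitions themselves): the encryption table is 256 entries of
 * 4 bytes; round lookups read the 32-bit word at RTAB + idx*4 (E0/Esize),
 * while last-round lookups read the single byte at RTAB + idx*4 + 1
 * (Es0/Essize), so the plain S-box value presumably occupies byte 1 of each
 * entry.  The decryption table is likewise 256 32-bit entries (D0/Dsize),
 * followed at offset 4*256 by a separate 256-byte inverse S-box used for
 * the last round (Ds0/Dssize). */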

/* register macros */
#define CTX	%rdi
#define RTAB	%r12

#define RA	%rax
#define RB	%rbx
#define RC	%rcx
#define RD	%rdx

#define RAd	%eax
#define RBd	%ebx
#define RCd	%ecx
#define RDd	%edx

#define RAbl	%al
#define RBbl	%bl
#define RCbl	%cl
#define RDbl	%dl

#define RAbh	%ah
#define RBbh	%bh
#define RCbh	%ch
#define RDbh	%dh

#define RNA	%r8
#define RNB	%r9
#define RNC	%r10
#define RND	%r11

#define RNAd	%r8d
#define RNBd	%r9d
#define RNCd	%r10d
#define RNDd	%r11d

#define RT0	%rbp
#define RT1	%rsi

#define RT0d	%ebp
#define RT1d	%esi

/* helper macros */
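/* do16bit() consumes the two low bytes of `source' (via its %Xl/%Xh
 * subregisters), uses each as an index into RTAB and applies the given
 * `op' to fold the two looked-up table entries into dest1/dest2.  The
 * _shr variants additionally shift `source' right so that a following
 * invocation sees the next two bytes.  The last_* variants fetch single
 * bytes (movzbl) instead of full 32-bit table words, as needed for the
 * final round. */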
#define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	op ## l table1(RTAB,t0,tablemul),	dest1 ## d; \
	op ## l table2(RTAB,t1,tablemul),	dest2 ## d;

#define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	shrl $(shf),				source ## d; \
	op ## l table1(RTAB,t0,tablemul),	dest1 ## d; \
	op ## l table2(RTAB,t1,tablemul),	dest2 ## d;

#define last_do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	movzbl table1(RTAB,t0,tablemul),	t0 ## d; \
	movzbl table2(RTAB,t1,tablemul),	t1 ## d; \
	op ## l t0 ## d,			dest1 ## d; \
	op ## l t1 ## d,			dest2 ## d;

#define last_do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
	movzbl source ## bl,			t0 ## d; \
	movzbl source ## bh,			t1 ## d; \
	shrl $(shf),				source ## d; \
	movzbl table1(RTAB,t0,tablemul),	t0 ## d; \
	movzbl table2(RTAB,t1,tablemul),	t1 ## d; \
	op ## l t0 ## d,			dest1 ## d; \
	op ## l t1 ## d,			dest2 ## d;

/***********************************************************************
 * AMD64 assembly implementation of the AES cipher
 ***********************************************************************/
#define addroundkey(round, ra, rb, rc, rd) \
	xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \
	xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \
	xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \
	xorl (((round) * 16) + 3 * 4)(CTX), rd ## d;

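/* do_encround() performs one inner encryption round in the classic 32-bit
 * T-table style: for each output word it gathers four table words selected
 * by bytes of the previous state (the byte-to-word routing implements
 * ShiftRows, the table contents fold SubBytes and MixColumns together),
 * then loads the corresponding word of the next round key from CTX and
 * XORs it in.  A single 1 KiB table serves all byte positions; the roll $8
 * rotations stand in for the four pre-rotated tables of the textbook
 * implementation. */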
#define do_encround(next_r) \
	do16bit_shr(16, mov, RA, Esize, E0, RNA, E0, RND, RT0, RT1); \
	do16bit(        mov, RA, Esize, E0, RNC, E0, RNB, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNDd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	\
	do16bit_shr(16, xor, RD, Esize, E0, RND, E0, RNC, RT0, RT1); \
	do16bit(        xor, RD, Esize, E0, RNB, E0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RNCd; \
	xorl RNDd, RDd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	roll $8, RDd; \
	\
	do16bit_shr(16, xor, RC, Esize, E0, RNC, E0, RNB, RT0, RT1); \
	do16bit(        xor, RC, Esize, E0, RA,  E0, RD,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNBd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RDd; \
	roll $8, RCd; \
	\
	do16bit_shr(16, xor, RB, Esize, E0, RNB, E0, RA,  RT0, RT1); \
	do16bit(        xor, RB, Esize, E0, RD,  E0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RAd; \
	xorl RNBd, RBd; \
	roll $16, RDd; \
	roll $24, RCd;

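/* do_lastencround() is the same dance for the final round, which has no
 * MixColumns step: only the plain S-box bytes are fetched (Es0/Essize
 * lookups, movzb and last_do16bit*) and merged into the round-key words. */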
#define do_lastencround(next_r) \
	do16bit_shr(16, movzb, RA, Essize, Es0, RNA, Es0, RND, RT0, RT1); \
	do16bit(        movzb, RA, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNDd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	\
	last_do16bit_shr(16, xor, RD, Essize, Es0, RND, Es0, RNC, RT0, RT1); \
	last_do16bit(        xor, RD, Essize, Es0, RNB, Es0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RNCd; \
	xorl RNDd, RDd; \
	roll $8, RNBd; \
	roll $8, RAd; \
	roll $8, RDd; \
	\
	last_do16bit_shr(16, xor, RC, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
	last_do16bit(        xor, RC, Essize, Es0, RA,  Es0, RD,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNBd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RDd; \
	roll $8, RCd; \
	\
	last_do16bit_shr(16, xor, RB, Essize, Es0, RNB, Es0, RA,  RT0, RT1); \
	last_do16bit(        xor, RB, Essize, Es0, RD,  Es0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RAd; \
	xorl RNBd, RBd; \
	roll $16, RDd; \
	roll $24, RCd;

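/* firstencround() XORs in the whitening key (round 0) before computing the
 * first real round; each wrapper passes `round + 1' because the macro body
 * loads the round key that follows the one already folded into the state. */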
#define firstencround(round) \
	addroundkey(round, RA, RB, RC, RD); \
	do_encround((round) + 1);

#define encround(round) \
	do_encround((round) + 1);

#define lastencround(round) \
	do_lastencround((round) + 1);

.align 8
.globl _gcry_aes_amd64_encrypt_block
ELF(.type   _gcry_aes_amd64_encrypt_block,@function;)

_gcry_aes_amd64_encrypt_block:
	/* input:
	 *	%rdi: keysched, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ecx: number of rounds: 10, 12 or 14
	 *	%r8:  encryption tables
	 */
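	/* For reference, a plausible C-level prototype for this entry point
	 * (an assumption about how rijndael.c declares it, not taken from
	 * this file):
	 *
	 *   extern unsigned int _gcry_aes_amd64_encrypt_block(
	 *                  const void *keysched, unsigned char *dst,
	 *                  const unsigned char *src, int rounds,
	 *                  const void *encT);
	 */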
	CFI_STARTPROC();
	ENTER_SYSV_FUNC_PARAMS_5

	subq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(5 * 8);
	movq %rsi, (0 * 8)(%rsp);
	movl %ecx, (1 * 8)(%rsp);
	movq %rbp, (2 * 8)(%rsp);
	movq %rbx, (3 * 8)(%rsp);
	movq %r12, (4 * 8)(%rsp);
	CFI_REL_OFFSET(%rbp, 2 * 8);
	CFI_REL_OFFSET(%rbx, 3 * 8);
	CFI_REL_OFFSET(%r12, 4 * 8);

	leaq (%r8), RTAB;

	/* read input block */
	movl 0 * 4(%rdx), RAd;
	movl 1 * 4(%rdx), RBd;
	movl 2 * 4(%rdx), RCd;
	movl 3 * 4(%rdx), RDd;

	firstencround(0);
	encround(1);
	encround(2);
	encround(3);
	encround(4);
	encround(5);
	encround(6);
	encround(7);
	encround(8);
	cmpl $12, (1 * 8)(%rsp);
	jnb .Lenc_not_128;
	lastencround(9);

.align 4
.Lenc_done:
	/* write output block */
	movq (0 * 8)(%rsp), %rsi;
	movl RAd, 0 * 4(%rsi);
	movl RBd, 1 * 4(%rsi);
	movl RCd, 2 * 4(%rsi);
	movl RDd, 3 * 4(%rsi);

	CFI_REMEMBER_STATE();

	movq (4 * 8)(%rsp), %r12;
	movq (3 * 8)(%rsp), %rbx;
	movq (2 * 8)(%rsp), %rbp;
	CFI_RESTORE(%r12);
	CFI_RESTORE(%rbx);
	CFI_RESTORE(%rbp);
	addq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(-5 * 8);

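	/* Return value: presumably the number of stack bytes for the caller
	 * to burn (the 5 * 8 byte spill area plus the return-address slot);
	 * this is an assumption based on how libgcrypt cipher code typically
	 * reports stack burn depth. */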
	movl $(6 * 8), %eax;

	EXIT_SYSV_FUNC
	ret;

	CFI_RESTORE_STATE();
.align 4
.Lenc_not_128:
	je .Lenc_192;

	encround(9);
	encround(10);
	encround(11);
	encround(12);
	lastencround(13);

	jmp .Lenc_done;

.align 4
.Lenc_192:
	encround(9);
	encround(10);
	lastencround(11);

	jmp .Lenc_done;
	CFI_ENDPROC();
ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)

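/* The decryption rounds mirror the encryption rounds above, but index the
 * D/Ds tables and route the state bytes in the opposite direction (inverse
 * ShiftRows); the per-round-key InvMixColumns transform required by this
 * equivalent-inverse-cipher form is presumably applied when the decryption
 * key schedule is prepared on the C side. */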
#define do_decround(next_r) \
	do16bit_shr(16, mov, RA, Dsize, D0, RNA, D0, RNB, RT0, RT1); \
	do16bit(        mov, RA, Dsize, D0, RNC, D0, RND, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNBd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	\
	do16bit_shr(16, xor, RB, Dsize, D0, RNB, D0, RNC, RT0, RT1); \
	do16bit(        xor, RB, Dsize, D0, RND, D0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RNCd; \
	xorl RNBd, RBd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	roll $8, RBd; \
	\
	do16bit_shr(16, xor, RC, Dsize, D0, RNC, D0, RND, RT0, RT1); \
	do16bit(        xor, RC, Dsize, D0, RA,  D0, RB,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNDd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RBd; \
	roll $8, RCd; \
	\
	do16bit_shr(16, xor, RD, Dsize, D0, RND, D0, RA,  RT0, RT1); \
	do16bit(        xor, RD, Dsize, D0, RB,  D0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RAd; \
	xorl RNDd, RDd; \
	roll $16, RBd; \
	roll $24, RCd;

#define do_lastdecround(next_r) \
	do16bit_shr(16, movzb, RA, Dssize, Ds0, RNA, Ds0, RNB, RT0, RT1); \
	do16bit(        movzb, RA, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
	movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
	roll $8, RNBd; \
	xorl RNAd, RAd; \
	roll $8, RNCd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	\
	last_do16bit_shr(16, xor, RB, Dssize, Ds0, RNB, Ds0, RNC, RT0, RT1); \
	last_do16bit(        xor, RB, Dssize, Ds0, RND, Ds0, RA,  RT0, RT1); \
	movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
	roll $8, RNCd; \
	xorl RNBd, RBd; \
	roll $8, RNDd; \
	roll $8, RAd; \
	roll $8, RBd; \
	\
	last_do16bit_shr(16, xor, RC, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
	last_do16bit(        xor, RC, Dssize, Ds0, RA,  Ds0, RB,  RT0, RT1); \
	movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
	roll $8, RNDd; \
	xorl RNCd, RCd; \
	roll $8, RAd; \
	roll $8, RBd; \
	roll $8, RCd; \
	\
	last_do16bit_shr(16, xor, RD, Dssize, Ds0, RND, Ds0, RA,  RT0, RT1); \
	last_do16bit(        xor, RD, Dssize, Ds0, RB,  Ds0, RC,  RT0, RT1); \
	movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
	roll $8, RAd; \
	xorl RNDd, RDd; \
	roll $16, RBd; \
	roll $24, RCd;

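/* Decryption walks the key schedule backwards: firstdecround(r) folds in
 * round key r + 1 (the last key for the given key length) and the rounds
 * then count down to lastdecround(0), which consumes key 0. */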
#define firstdecround(round) \
	addroundkey((round + 1), RA, RB, RC, RD); \
	do_decround(round);

#define decround(round) \
	do_decround(round);

#define lastdecround(round) \
	do_lastdecround(round);

.align 8
.globl _gcry_aes_amd64_decrypt_block
ELF(.type   _gcry_aes_amd64_decrypt_block,@function;)

_gcry_aes_amd64_decrypt_block:
	/* input:
	 *	%rdi: keysched, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ecx: number of rounds: 10, 12 or 14
	 *	%r8:  decryption tables
	 */
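	/* A plausible matching C-level prototype (again an assumption about
	 * the declaration in rijndael.c, not taken from this file):
	 *
	 *   extern unsigned int _gcry_aes_amd64_decrypt_block(
	 *                  const void *keysched, unsigned char *dst,
	 *                  const unsigned char *src, int rounds,
	 *                  const void *decT);
	 */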
	CFI_STARTPROC();
	ENTER_SYSV_FUNC_PARAMS_5

	subq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(5 * 8);
	movq %rsi, (0 * 8)(%rsp);
	movl %ecx, (1 * 8)(%rsp);
	movq %rbp, (2 * 8)(%rsp);
	movq %rbx, (3 * 8)(%rsp);
	movq %r12, (4 * 8)(%rsp);
	CFI_REL_OFFSET(%rbp, 2 * 8);
	CFI_REL_OFFSET(%rbx, 3 * 8);
	CFI_REL_OFFSET(%r12, 4 * 8);

	leaq (%r8), RTAB;

	/* read input block */
	movl 0 * 4(%rdx), RAd;
	movl 1 * 4(%rdx), RBd;
	movl 2 * 4(%rdx), RCd;
	movl 3 * 4(%rdx), RDd;

	cmpl $12, (1 * 8)(%rsp);
	jnb .Ldec_256;

	firstdecround(9);
.align 4
.Ldec_tail:
	decround(8);
	decround(7);
	decround(6);
	decround(5);
	decround(4);
	decround(3);
	decround(2);
	decround(1);
	lastdecround(0);

	/* write output block */
	movq (0 * 8)(%rsp), %rsi;
	movl RAd, 0 * 4(%rsi);
	movl RBd, 1 * 4(%rsi);
	movl RCd, 2 * 4(%rsi);
	movl RDd, 3 * 4(%rsi);

	CFI_REMEMBER_STATE();

	movq (4 * 8)(%rsp), %r12;
	movq (3 * 8)(%rsp), %rbx;
	movq (2 * 8)(%rsp), %rbp;
	CFI_RESTORE(%r12);
	CFI_RESTORE(%rbx);
	CFI_RESTORE(%rbp);
	addq $(5 * 8), %rsp;
	CFI_ADJUST_CFA_OFFSET(-5 * 8);

	movl $(6 * 8), %eax;

	EXIT_SYSV_FUNC
	ret;

	CFI_RESTORE_STATE();
.align 4
.Ldec_256:
	je .Ldec_192;

	firstdecround(13);
	decround(12);
	decround(11);
	decround(10);
	decround(9);

	jmp .Ldec_tail;

.align 4
.Ldec_192:
	firstdecround(11);
	decround(10);
	decround(9);

	jmp .Ldec_tail;
	CFI_ENDPROC();
ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;)

#endif /*USE_AES*/
#endif /*__x86_64*/