/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
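/*
 * These offsets mirror struct cast6_ctx from the generic implementation:
 * u32 Km[12][4] (12 rounds x 4 masking keys) at offset 0, followed by
 * u8 Kr[12][4] (12 rounds x 4 rotation keys).
 */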

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX  %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


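/*
 * lookup_32bit: the four 8-bit S-box lookups of one scalar f-function.
 * Bytes 1 and 0 of src index s1 and s2; src is then shifted right by 16
 * so bytes 3 and 2 can index s3 and s4.  op1-op3 fold the table values
 * into dst, and interleave_op(il_reg) lets the caller overlap the 16-bit
 * shift of the other half-register with the table loads.
 */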
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	leaq		s1(%rip),      RID2;     \
	movl		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	leaq		s2(%rip),      RID1;     \
	op1		(RID1,RID2,4), dst ## d; \
	shrq $16,	src;                     \
	movzbl		src ## bh,     RID1d;    \
	leaq		s3(%rip),      RID2;     \
	op2		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	leaq		s4(%rip),      RID1;     \
	op3		(RID1,RID2,4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

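/*
 * F_head/F_tail together compute one CAST-256 f-function on four blocks in
 * parallel.  F_head applies the masking key (op0 is vpaddd/vpxor/vpsubd
 * for f1/f2/f3) and rotates each 32-bit lane left by the 5-bit rotation
 * key (shift left by RKRF OR'd with shift right by RKRR = 32 - RKRF),
 * then extracts the two 64-bit halves into GPRs.  F_tail runs the scalar
 * S-box lookups on those halves and reassembles the four 32-bit results
 * into x.
 */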
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;                 \
	vpslld	RKRF,	x,    RTMP;              \
	vpsrld	RKRR,	x,    x;                 \
	vpor	RTMP,	x,    x;                 \
	\
	vmovq		x,    gi1;               \
	vpextrq $1,	x,    gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;

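/*
 * F_2: run the f-function on both 4-block groups and XOR the result into
 * the a1/a2 words, i.e. a ^= f(b) for all eight blocks.
 */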
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

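/*
 * get_round_keys: broadcast masking key Km[nn] into RKM and derive the
 * shift counts for the current rotation key: RKRF = lowest byte of RKR
 * masked to 5 bits, RKRR = 32 - RKRF.  RKR is then shifted down one byte
 * so the next rotation key is in position for the following call.
 */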
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;

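/*
 * Q and QBAR are CAST-256's forward and reverse quad-rounds: each consumes
 * four Km/Kr pairs and applies f1, f2, f3, f1 to the D/C/B/A words, in
 * opposite orders.
 */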
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);        \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3);        \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2);        \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb		mask(%rip),            RKR, RKR;

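/*
 * preload_rkr: load the 16 rotation keys covering four quad-rounds into
 * RKR.  XOR-ing .L16_mask (0x10 in every byte) adds 16 mod 32 to each
 * 5-bit rotation amount; the extra rotate-by-16 puts the S-box index
 * bytes where lookup_32bit expects them.  do_mask(mask) optionally
 * reorders the key bytes to match the order Q/QBAR consume them in.
 */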
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip),          RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);

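/*
 * transpose_4x4: 4x4 32-bit matrix transpose, converting four registers
 * holding one block each into four registers that each hold the same word
 * (A, B, C or D) of four different blocks.
 */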
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

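/*
 * inpack_blocks: byte-swap each 32-bit word (rmask is .Lbswap_mask here)
 * from the cipher's big-endian convention, then transpose so the round
 * macros see one word of four blocks per register; outunpack_blocks
 * reverses both steps.
 */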
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0,	x0; \
	vpshufb rmask, x1,	x1; \
	vpshufb rmask, x2,	x2; \
	vpshufb rmask, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,		x0, x0;       \
	vpshufb rmask,		x1, x1;       \
	vpshufb rmask,		x2, x2;       \
	vpshufb rmask,		x3, x3;

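/*
 * The .Lrkr_* tables below pre-shuffle the 16 rotation-key bytes of a
 * four-quad-round group into the order the Q/QBAR sequences of the
 * enc/dec routines consume them (one byte per round key, lowest byte
 * first; QBAR walks each group of four in reverse).
 */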
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

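/*
 * The __cast6_{enc,dec}_blk8 helpers keep the context pointer in %r15 and
 * use %rbx as RGI4, so both callee-saved registers are saved and restored
 * around the round computation.
 */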
.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

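/*
 * load_8way, store_8way and store_cbc_8way below come from
 * glue_helper-asm-avx.S, included at the top of this file.
 */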
SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)