// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 && gc && !purego
// +build amd64,gc,!purego

#include "textflag.h"

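// c40 and c48 are PSHUFB byte-shuffle masks that rotate each 64-bit lane
// right by 24 and 16 bits, respectively.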
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

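// SHUFFLE rearranges rows 1-3 of the 4x4 matrix of 64-bit state words
// (held two words per register) between the two HALF_ROUNDs of a round,
// the usual BLAKE2b-style diagonalization.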
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

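// SHUFFLE_INV undoes SHUFFLE, restoring the column layout of the state.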
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

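// HALF_ROUND applies the BlaMka G function to two independent 4-word groups
// at once: a += b + 2*lo32(a)*lo32(b) (via PMULULQ and three PADDQs),
// followed by the xor-and-rotate steps. Rotation by 32 bits uses PSHUFD,
// rotations by 24 and 16 bits use the c40/c48 PSHUFB masks, and rotation by
// 63 bits is built from a shift and an add (x+x == x<<1).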
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFD  $0xB1, v6, v6; \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	PSHUFB  c40, v2;       \
	MOVO    v0, t0;        \
	PMULULQ v2, t0;        \
	PADDQ   v2, v0;        \
	PADDQ   t0, v0;        \
	PADDQ   t0, v0;        \
	PXOR    v0, v6;        \
	PSHUFB  c48, v6;       \
	MOVO    v4, t0;        \
	PMULULQ v6, t0;        \
	PADDQ   v6, v4;        \
	PADDQ   t0, v4;        \
	PADDQ   t0, v4;        \
	PXOR    v4, v2;        \
	MOVO    v2, t0;        \
	PADDQ   v2, t0;        \
	PSRLQ   $63, v2;       \
	PXOR    t0, v2;        \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFD  $0xB1, v7, v7; \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	PSHUFB  c40, v3;       \
	MOVO    v1, t0;        \
	PMULULQ v3, t0;        \
	PADDQ   v3, v1;        \
	PADDQ   t0, v1;        \
	PADDQ   t0, v1;        \
	PXOR    v1, v7;        \
	PSHUFB  c48, v7;       \
	MOVO    v5, t0;        \
	PMULULQ v7, t0;        \
	PADDQ   v7, v5;        \
	PADDQ   t0, v5;        \
	PADDQ   t0, v5;        \
	PXOR    v5, v3;        \
	MOVO    v3, t0;        \
	PADDQ   v3, t0;        \
	PSRLQ   $63, v3;       \
	PXOR    t0, v3

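// LOAD_MSG_0 and STORE_MSG_0 move 16 consecutive 64-bit words (one 128-byte
// row of the 1 KiB block) between memory and X0-X7, starting at word offset
// off.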
#define LOAD_MSG_0(block, off) \
	MOVOU 8*(off+0)(block), X0;  \
	MOVOU 8*(off+2)(block), X1;  \
	MOVOU 8*(off+4)(block), X2;  \
	MOVOU 8*(off+6)(block), X3;  \
	MOVOU 8*(off+8)(block), X4;  \
	MOVOU 8*(off+10)(block), X5; \
	MOVOU 8*(off+12)(block), X6; \
	MOVOU 8*(off+14)(block), X7

#define STORE_MSG_0(block, off) \
	MOVOU X0, 8*(off+0)(block);  \
	MOVOU X1, 8*(off+2)(block);  \
	MOVOU X2, 8*(off+4)(block);  \
	MOVOU X3, 8*(off+6)(block);  \
	MOVOU X4, 8*(off+8)(block);  \
	MOVOU X5, 8*(off+10)(block); \
	MOVOU X6, 8*(off+12)(block); \
	MOVOU X7, 8*(off+14)(block)

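// LOAD_MSG_1 and STORE_MSG_1 move one column of the block: 16 words taken
// two at a time with a stride of 16 words (128 bytes), starting at word
// offset off.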
#define LOAD_MSG_1(block, off) \
	MOVOU 8*off+0*8(block), X0;  \
	MOVOU 8*off+16*8(block), X1; \
	MOVOU 8*off+32*8(block), X2; \
	MOVOU 8*off+48*8(block), X3; \
	MOVOU 8*off+64*8(block), X4; \
	MOVOU 8*off+80*8(block), X5; \
	MOVOU 8*off+96*8(block), X6; \
	MOVOU 8*off+112*8(block), X7

#define STORE_MSG_1(block, off) \
	MOVOU X0, 8*off+0*8(block);  \
	MOVOU X1, 8*off+16*8(block); \
	MOVOU X2, 8*off+32*8(block); \
	MOVOU X3, 8*off+48*8(block); \
	MOVOU X4, 8*off+64*8(block); \
	MOVOU X5, 8*off+80*8(block); \
	MOVOU X6, 8*off+96*8(block); \
	MOVOU X7, 8*off+112*8(block)

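// BLAMKA_ROUND_0 applies one permutation round (two HALF_ROUNDs with a
// diagonalizing SHUFFLE in between) to one 128-byte row of the block.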
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
	LOAD_MSG_0(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_0(block, off)

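// BLAMKA_ROUND_1 applies the same permutation round to one column of the
// block.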
#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
	LOAD_MSG_1(block, off);                                   \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
	STORE_MSG_1(block, off)

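// blamkaSSE4 applies the Argon2 permutation to the 1 KiB block b:
// eight row-wise rounds followed by eight column-wise rounds.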
// func blamkaSSE4(b *block)
TEXT ·blamkaSSE4(SB), 4, $0-8
	MOVQ b+0(FP), AX

	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET

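// mixBlocksSSE2 computes out = a XOR b XOR c over a full 1024-byte block,
// 16 bytes per iteration; the counter starts at 128 words and drops by 2.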
// func mixBlocksSSE2(out, a, b, c *block)
TEXT ·mixBlocksSSE2(SB), 4, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, BP

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, BP
	JA    loop
	RET

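// xorBlocksSSE2 computes out ^= a XOR b XOR c over a full 1024-byte block.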
// func xorBlocksSSE2(out, a, b, c *block)
TEXT ·xorBlocksSSE2(SB), 4, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, BP

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, BP
	JA    loop
	RET