1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build amd64,!gccgo,!appengine
6
7#include "textflag.h"
8
// c40 is a 16-byte PSHUFB index mask. Within each 8-byte half, result byte i
// is taken from source byte (i+3) mod 8, i.e. every 64-bit lane is rotated
// right by 24 bits (equivalently, left by 40 bits).
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0A09080F0E0D0C0B
GLOBL ·c40<>(SB), RODATA|NOPTR, $16
12
// c48 is a 16-byte PSHUFB index mask. Within each 8-byte half, result byte i
// is taken from source byte (i+2) mod 8, i.e. every 64-bit lane is rotated
// right by 16 bits (equivalently, left by 48 bits).
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080F0E0D0C0B0A
GLOBL ·c48<>(SB), RODATA|NOPTR, $16
16
// SHUFFLE permutes three rows of four 64-bit lanes. Each row is a register
// pair (lo holds lanes 0-1, hi holds lanes 2-3):
//   (p_lo, p_hi): lane i takes the old lane (i+1) mod 4
//   (q_lo, q_hi): the two registers are exchanged (lane i takes old lane i+2)
//   (r_lo, r_hi): lane i takes the old lane (i+3) mod 4
// keep and pair are scratch registers and are clobbered. Only the high half
// of pair is ever consumed, and it is always written before being read, so
// pair's contents on entry do not matter.
#define SHUFFLE(p_lo, p_hi, q_lo, q_hi, r_lo, r_hi, keep, pair) \
	MOVO       q_lo, keep; \
	MOVO       q_hi, q_lo; \
	MOVO       keep, q_hi; \
	MOVO       r_lo, keep; \
	PUNPCKLQDQ r_lo, pair; \
	PUNPCKHQDQ r_hi, r_lo; \
	PUNPCKHQDQ pair, r_lo; \
	PUNPCKLQDQ r_hi, pair; \
	MOVO       keep, r_hi; \
	MOVO       p_lo, keep; \
	PUNPCKHQDQ pair, r_hi; \
	PUNPCKLQDQ p_hi, pair; \
	PUNPCKHQDQ pair, p_lo; \
	PUNPCKLQDQ keep, pair; \
	PUNPCKHQDQ pair, p_hi
33
// SHUFFLE_INV undoes SHUFFLE. Each row is a register pair (lo holds lanes
// 0-1, hi holds lanes 2-3):
//   (p_lo, p_hi): lane i takes the old lane (i+3) mod 4
//   (q_lo, q_hi): the two registers are exchanged (lane i takes old lane i+2)
//   (r_lo, r_hi): lane i takes the old lane (i+1) mod 4
// keep and pair are scratch registers and are clobbered. Only the high half
// of pair is ever consumed, and it is always written before being read, so
// pair's contents on entry do not matter.
#define SHUFFLE_INV(p_lo, p_hi, q_lo, q_hi, r_lo, r_hi, keep, pair) \
	MOVO       q_lo, keep; \
	MOVO       q_hi, q_lo; \
	MOVO       keep, q_hi; \
	MOVO       p_lo, keep; \
	PUNPCKLQDQ p_lo, pair; \
	PUNPCKHQDQ p_hi, p_lo; \
	PUNPCKHQDQ pair, p_lo; \
	PUNPCKLQDQ p_hi, pair; \
	MOVO       keep, p_hi; \
	MOVO       r_lo, keep; \
	PUNPCKHQDQ pair, p_hi; \
	PUNPCKLQDQ r_hi, pair; \
	PUNPCKHQDQ pair, r_lo; \
	PUNPCKLQDQ keep, pair; \
	PUNPCKHQDQ pair, r_hi
50
// BLAMKA_MULADD updates acc in every 64-bit lane to
//   acc + arg + 2 * lo32(acc) * lo32(arg)
// where lo32 is the low 32 bits of the lane (PMULULQ multiplies only those).
// tmp is clobbered; arg is left unchanged.
#define BLAMKA_MULADD(acc, arg, tmp) \
	MOVO    acc, tmp; \
	PMULULQ arg, tmp; \
	PADDQ   arg, acc; \
	PADDQ   tmp, acc; \
	PADDQ   tmp, acc

// BLAMKA_G runs one mixing step on the four registers (wa, wb, wc, wd), two
// independent 64-bit lanes at a time. It alternates BLAMKA_MULADD with an XOR
// followed by a per-lane rotation:
//   wd: swap of the 32-bit halves (PSHUFD $0xB1), then PSHUFB with c48
//   wb: PSHUFB with c40, then a rotate left by one bit (x+x combined with x>>63)
// c40 and c48 are registers holding PSHUFB masks; tmp is clobbered.
#define BLAMKA_G(wa, wb, wc, wd, tmp, c40, c48) \
	BLAMKA_MULADD(wa, wb, tmp); \
	PXOR    wa, wd;             \
	PSHUFD  $0xB1, wd, wd;      \
	BLAMKA_MULADD(wc, wd, tmp); \
	PXOR    wc, wb;             \
	PSHUFB  c40, wb;            \
	BLAMKA_MULADD(wa, wb, tmp); \
	PXOR    wa, wd;             \
	PSHUFB  c48, wd;            \
	BLAMKA_MULADD(wc, wd, tmp); \
	PXOR    wc, wb;             \
	MOVO    wb, tmp;            \
	PADDQ   wb, tmp;            \
	PSRLQ   $63, wb;            \
	PXOR    tmp, wb

// HALF_ROUND applies BLAMKA_G first to (v0, v2, v4, v6) and then to
// (v1, v3, v5, v7). t0 is scratch and is clobbered; c40 and c48 are only read.
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
	BLAMKA_G(v0, v2, v4, v6, t0, c40, c48); \
	BLAMKA_G(v1, v3, v5, v7, t0, c40, c48)
114
// LOAD_MSG_0 fills X0..X7 with the 128 contiguous bytes (sixteen 64-bit words)
// that begin at 64-bit word index idx from base. Loads are unaligned (MOVOU).
#define LOAD_MSG_0(base, idx) \
	MOVOU 8*idx+16*0(base), X0; \
	MOVOU 8*idx+16*1(base), X1; \
	MOVOU 8*idx+16*2(base), X2; \
	MOVOU 8*idx+16*3(base), X3; \
	MOVOU 8*idx+16*4(base), X4; \
	MOVOU 8*idx+16*5(base), X5; \
	MOVOU 8*idx+16*6(base), X6; \
	MOVOU 8*idx+16*7(base), X7
124
// STORE_MSG_0 writes X0..X7 to the 128 contiguous bytes (sixteen 64-bit words)
// that begin at 64-bit word index idx from base. Stores are unaligned (MOVOU).
#define STORE_MSG_0(base, idx) \
	MOVOU X0, 8*idx+16*0(base); \
	MOVOU X1, 8*idx+16*1(base); \
	MOVOU X2, 8*idx+16*2(base); \
	MOVOU X3, 8*idx+16*3(base); \
	MOVOU X4, 8*idx+16*4(base); \
	MOVOU X5, 8*idx+16*5(base); \
	MOVOU X6, 8*idx+16*6(base); \
	MOVOU X7, 8*idx+16*7(base)
134
// LOAD_MSG_1 fills X0..X7 with eight 16-byte pairs of 64-bit words. The first
// pair starts at word index idx from base, and each following pair lies
// 128 bytes (sixteen words) further on. Loads are unaligned (MOVOU).
#define LOAD_MSG_1(base, idx) \
	MOVOU 8*idx+128*0(base), X0; \
	MOVOU 8*idx+128*1(base), X1; \
	MOVOU 8*idx+128*2(base), X2; \
	MOVOU 8*idx+128*3(base), X3; \
	MOVOU 8*idx+128*4(base), X4; \
	MOVOU 8*idx+128*5(base), X5; \
	MOVOU 8*idx+128*6(base), X6; \
	MOVOU 8*idx+128*7(base), X7
144
// STORE_MSG_1 writes X0..X7 back as eight 16-byte pairs of 64-bit words. The
// first pair starts at word index idx from base, and each following pair lies
// 128 bytes (sixteen words) further on. Stores are unaligned (MOVOU).
#define STORE_MSG_1(base, idx) \
	MOVOU X0, 8*idx+128*0(base); \
	MOVOU X1, 8*idx+128*1(base); \
	MOVOU X2, 8*idx+128*2(base); \
	MOVOU X3, 8*idx+128*3(base); \
	MOVOU X4, 8*idx+128*4(base); \
	MOVOU X5, 8*idx+128*5(base); \
	MOVOU X6, 8*idx+128*6(base); \
	MOVOU X7, 8*idx+128*7(base)
154
// BLAMKA_ROUND_0 transforms, in place, the sixteen contiguous 64-bit words at
// word index idx of base: load into X0..X7, HALF_ROUND, SHUFFLE, HALF_ROUND,
// SHUFFLE_INV, store back. X0..X7 and the scratch registers tmpA and tmpB are
// clobbered; maskA and maskB (the PSHUFB masks handed to HALF_ROUND) are only
// read.
#define BLAMKA_ROUND_0(base, idx, tmpA, tmpB, maskA, maskB) \
	LOAD_MSG_0(base, idx);                                         \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, tmpA, maskA, maskB); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, tmpA, tmpB);                    \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, tmpA, maskA, maskB); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, tmpA, tmpB);                \
	STORE_MSG_0(base, idx)
162
// BLAMKA_ROUND_1 transforms, in place, the eight 16-byte word pairs selected
// by LOAD_MSG_1 / STORE_MSG_1 for word index idx of base: load into X0..X7,
// HALF_ROUND, SHUFFLE, HALF_ROUND, SHUFFLE_INV, store back. X0..X7 and the
// scratch registers tmpA and tmpB are clobbered; maskA and maskB (the PSHUFB
// masks handed to HALF_ROUND) are only read.
#define BLAMKA_ROUND_1(base, idx, tmpA, tmpB, maskA, maskB) \
	LOAD_MSG_1(base, idx);                                         \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, tmpA, maskA, maskB); \
	SHUFFLE(X2, X3, X4, X5, X6, X7, tmpA, tmpB);                    \
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, tmpA, maskA, maskB); \
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, tmpA, tmpB);                \
	STORE_MSG_1(base, idx)
170
// func blamkaSSE4(b *block)
//
// Permutes the 128 64-bit words at b in place. The first pass runs
// BLAMKA_ROUND_0 on eight groups of sixteen contiguous words (word offsets
// 0, 16, ..., 112). The second pass runs BLAMKA_ROUND_1 on eight groups of
// word pairs taken at a sixteen-word stride (starting word offsets
// 0, 2, ..., 14).
//
// Registers: AX = b, X0-X7 = working state, X8/X9 = scratch,
// X10/X11 = the c40/c48 PSHUFB masks.
TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
	// The rotation masks stay resident in X10/X11 for every round.
	MOVOU ·c40<>(SB), X10
	MOVOU ·c48<>(SB), X11

	MOVQ b+0(FP), AX

	// Pass 1: contiguous groups of sixteen words.
	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)

	// Pass 2: strided groups of word pairs.
	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
	RET
196
// func mixBlocksSSE2(out, a, b, c *block)
//
// Stores a XOR b XOR c into out. The loop covers 128 64-bit words
// (1024 bytes) from each pointer, 16 bytes per iteration, using unaligned
// loads and stores.
//
// Registers: DX = out, AX = a, BX = b, CX = c, DI = 64-bit words remaining.
// BP is deliberately left alone: with a zero-size frame the assembler does not
// save it, so using it as scratch would corrupt the caller's frame pointer.
TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	PXOR  X1, X0
	PXOR  X2, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI // two 64-bit words consumed per iteration
	JA    loop
	RET
219
// func xorBlocksSSE2(out, a, b, c *block)
//
// XORs a XOR b XOR c into the existing contents of out
// (out = out XOR a XOR b XOR c). The loop covers 128 64-bit words
// (1024 bytes) from each pointer, 16 bytes per iteration, using unaligned
// loads and stores.
//
// Registers: DX = out, AX = a, BX = b, CX = c, DI = 64-bit words remaining.
// BP is deliberately left alone: with a zero-size frame the assembler does not
// save it, so using it as scratch would corrupt the caller's frame pointer.
TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
	MOVQ out+0(FP), DX
	MOVQ a+8(FP), AX
	MOVQ b+16(FP), BX
	MOVQ c+24(FP), CX
	MOVQ $128, DI

loop:
	MOVOU 0(AX), X0
	MOVOU 0(BX), X1
	MOVOU 0(CX), X2
	MOVOU 0(DX), X3
	PXOR  X1, X0
	PXOR  X2, X0
	PXOR  X3, X0
	MOVOU X0, 0(DX)
	ADDQ  $16, AX
	ADDQ  $16, BX
	ADDQ  $16, CX
	ADDQ  $16, DX
	SUBQ  $2, DI // two 64-bit words consumed per iteration
	JA    loop
	RET
244