// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

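// AVX2 and AVX implementations of the BLAKE2b block compression
// (·hashBlocksAVX2 and ·hashBlocksAVX below). ·AVX2_iv0 and ·AVX2_iv1 hold
// the eight 64-bit words of the BLAKE2b initialization vector (the SHA-512
// IV) as two 256-bit constants for the AVX2 code path.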
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

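// ·AVX2_c40 and ·AVX2_c48 are VPSHUFB masks that rotate every 64-bit lane
// right by 24 and 16 bits respectively.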
DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

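// 128-bit versions of the same IV and rotation constants, used by the AVX
// code path.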
DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

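// Hand-assembled encodings of VPERMQ $0x39/$0x4E/$0x93 with the named Y
// register as both source and destination (quadword lane rotations used to
// diagonalize and undiagonalize the state).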
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

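// ROUND_AVX2 performs one full BLAKE2b round on the state held in Y0..Y3
// (one row of the 4x4 state per register): the column step of G, a VPERMQ
// diagonalization, the diagonal step and the inverse permutation. The
// rotations by 32, 24, 16 and 63 bits are implemented with VPSHUFD $-79,
// VPSHUFB c40, VPSHUFB c48 and the add/shift/xor sequence respectively.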
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ  m0, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m1, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y1_Y1;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y3_Y3;    \
	VPADDQ  m2, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPSHUFB c40, Y1, Y1;  \
	VPADDQ  m3, Y0, Y0;   \
	VPADDQ  Y1, Y0, Y0;   \
	VPXOR   Y0, Y3, Y3;   \
	VPSHUFB c48, Y3, Y3;  \
	VPADDQ  Y3, Y2, Y2;   \
	VPXOR   Y2, Y1, Y1;   \
	VPADDQ  Y1, Y1, t;    \
	VPSRLQ  $63, Y1, Y1;  \
	VPXOR   t, Y1, Y1;    \
	VPERMQ_0x39_Y3_Y3;    \
	VPERMQ_0x4E_Y2_Y2;    \
	VPERMQ_0x93_Y1_Y1

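// Hand-assembled VMOVQ and VPINSRQ encodings for gathering 64-bit message
// words from the block at (SI): VMOVQ_SI_Xk(n) is VMOVQ n(SI), Xk and
// VPINSRQ_1_SI_Xk(n) is VPINSRQ $1, n(SI), Xk, Xk, with n a one-byte
// displacement; the _0 variants use offset 0 with a shorter,
// displacement-free encoding.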
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X12(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X13(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X14(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8);           \
	VMOVQ_SI_X11(i2*8);           \
	VPINSRQ_1_SI_X15(i1*8);       \
	VPINSRQ_1_SI_X11(i3*8);       \
	VINSERTI128 $1, X11, Y15, Y15

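// The LOAD_MSG_AVX2_<schedule>() macros below gather the sixteen message
// words into Y12..Y15 in the order given by one row of the BLAKE2b message
// schedule. Word 0 needs the _0 load variants, and some adjacent or swapped
// pairs are fetched with a single VMOVDQU or VPSHUFD $0x4E instead of two
// scalar loads.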
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0;                   \
	VMOVQ_SI_X11(4*8);                \
	VPINSRQ_1_SI_X12(2*8);            \
	VPINSRQ_1_SI_X11(6*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8);              \
	VPSHUFD     $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8);           \
	VINSERTI128 $1, X11, Y14, Y14;   \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8);              \
	VMOVDQU     11*8(SI), X12;      \
	VPINSRQ_1_SI_X11(15*8);         \
	VINSERTI128 $1, X11, Y12, Y12;  \
	VMOVQ_SI_X13(8*8);              \
	VMOVQ_SI_X11(2*8);              \
	VPINSRQ_1_SI_X13_0;             \
	VPINSRQ_1_SI_X11(13*8);         \
	VINSERTI128 $1, X11, Y13, Y13;  \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
	VMOVQ_SI_X15(6*8);               \
	VMOVQ_SI_X11_0;                  \
	VPINSRQ_1_SI_X15(10*8);          \
	VPINSRQ_1_SI_X11(8*8);           \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
	VMOVQ_SI_X13_0;                  \
	VMOVQ_SI_X11(4*8);               \
	VPINSRQ_1_SI_X13(7*8);           \
	VPINSRQ_1_SI_X11(15*8);          \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8);                \
	VMOVQ_SI_X11_0;                   \
	VPINSRQ_1_SI_X12(6*8);            \
	VPINSRQ_1_SI_X11(8*8);            \
	VINSERTI128 $1, X11, Y12, Y12;    \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0;                   \
	VPSHUFD     $0x4E, 8*8(SI), X11;  \
	VPINSRQ_1_SI_X14(6*8);            \
	VINSERTI128 $1, X11, Y14, Y14;    \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
	VMOVQ_SI_X15_0;                  \
	VMOVQ_SI_X11(6*8);               \
	VPINSRQ_1_SI_X15(4*8);           \
	VPINSRQ_1_SI_X11(10*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8);              \
	VMOVQ_SI_X11(11*8);             \
	VPINSRQ_1_SI_X12(14*8);         \
	VPINSRQ_1_SI_X11_0;             \
	VINSERTI128 $1, X11, Y12, Y12;  \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8);              \
	VMOVDQU     12*8(SI), X14;      \
	VPINSRQ_1_SI_X11(10*8);         \
	VINSERTI128 $1, X11, Y14, Y14;  \
	VMOVQ_SI_X15(2*8);              \
	VMOVDQU     4*8(SI), X11;       \
	VPINSRQ_1_SI_X15(7*8);          \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
	VMOVQ_SI_X13(2*8);               \
	VPSHUFD     $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8);           \
	VINSERTI128 $1, X11, Y13, Y13;   \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8);              \
	VMOVQ_SI_X11(12*8);              \
	VPINSRQ_1_SI_X15(14*8);          \
	VPINSRQ_1_SI_X11_0;              \
	VINSERTI128 $1, X11, Y15, Y15

// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

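	// Save the original stack pointer in DX and round SP up to a 32-byte
	// boundary so that the VMOVDQA spills below are aligned.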
	MOVQ SP, DX
	MOVQ SP, R9
	ADDQ $31, R9
	ANDQ $~31, R9
	MOVQ R9, SP

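	// 0(SP)..31(SP) holds {counter low, counter high, flag, 0}; it is XORed
	// with ·AVX2_iv1 at the start of every block. Only the flag words are
	// set here; the counter words are stored inside the loop.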
	MOVQ CX, 16(SP)
	XORQ CX, CX
	MOVQ CX, 24(SP)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9
	MOVQ R9, 8(SP)

loop:
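	// Advance the 128-bit byte counter by one block (128 bytes) and
	// propagate a carry into the high word if the low word wrapped around.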
	ADDQ $128, R8
	MOVQ R8, 0(SP)
	CMPQ R8, $128
	JGE  noinc
	INCQ R9
	MOVQ R9, 8(SP)

noinc:
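	// Initialize the working state: Y0,Y1 = h[0..7], Y2 = IV[0..3],
	// Y3 = IV[4..7] ^ {counter low, counter high, flag, 0}.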
	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR   0(SP), Y7, Y3

	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	VMOVDQA Y12, 32(SP)
	VMOVDQA Y13, 64(SP)
	VMOVDQA Y14, 96(SP)
	VMOVDQA Y15, 128(SP)
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	VMOVDQA Y12, 160(SP)
	VMOVDQA Y13, 192(SP)
	VMOVDQA Y14, 224(SP)
	VMOVDQA Y15, 256(SP)

	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

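	// The last two of the twelve rounds reuse the message schedules of the
	// first two rounds, which were spilled to the stack above.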
	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)

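	// Feed-forward: h[0..7] ^= v[0..7] ^ v[8..15].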
	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	MOVQ DX, SP
	RET

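// Hand-assembled VPUNPCKLQDQ and VPUNPCKHQDQ encodings used by SHUFFLE_AVX
// and SHUFFLE_AVX_INV below; the macro names spell out the operands in Go
// assembly order (sources first, destination last).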
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

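// SHUFFLE_AVX rotates the last three rows of the 4x4 state (held in X2..X7,
// two 64-bit words per register) into diagonal form, and SHUFFLE_AVX_INV
// rotates them back, so HALF_ROUND_AVX can be reused for both the column and
// the diagonal steps of a round.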
#define SHUFFLE_AVX() \
	VMOVDQA X6, X13;         \
	VMOVDQA X2, X14;         \
	VMOVDQA X4, X6;          \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4;          \
	VMOVDQA X6, X5;          \
	VPUNPCKHQDQ_X15_X7_X6;   \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X13_X7;  \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VPUNPCKHQDQ_X15_X2_X2;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3;   \

#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13;         \
	VMOVDQA X4, X14;         \
	VPUNPCKLQDQ_X2_X2_X15;   \
	VMOVDQA X5, X4;          \
	VPUNPCKHQDQ_X15_X3_X2;   \
	VMOVDQA X14, X5;         \
	VPUNPCKLQDQ_X3_X3_X15;   \
	VMOVDQA X6, X14;         \
	VPUNPCKHQDQ_X15_X13_X3;  \
	VPUNPCKLQDQ_X7_X7_X15;   \
	VPUNPCKHQDQ_X15_X6_X6;   \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7;   \

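// HALF_ROUND_AVX applies the G mixing function to all four columns (or,
// between SHUFFLE_AVX and SHUFFLE_AVX_INV, all four diagonals) of the state
// in X0..X7. The rotations by 32, 24, 16 and 63 bits use VPSHUFD $-79,
// VPSHUFB c40, VPSHUFB c48 and add/shift/xor, as in the AVX2 path.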
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ  m0, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m1, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPSHUFB c40, v2, v2;  \
	VPSHUFB c40, v3, v3;  \
	VPADDQ  m2, v0, v0;   \
	VPADDQ  v2, v0, v0;   \
	VPADDQ  m3, v1, v1;   \
	VPADDQ  v3, v1, v1;   \
	VPXOR   v0, v6, v6;   \
	VPXOR   v1, v7, v7;   \
	VPSHUFB c48, v6, v6;  \
	VPSHUFB c48, v7, v7;  \
	VPADDQ  v6, v4, v4;   \
	VPADDQ  v7, v5, v5;   \
	VPXOR   v4, v2, v2;   \
	VPXOR   v5, v3, v3;   \
	VPADDQ  v2, v2, t0;   \
	VPSRLQ  $63, v2, v2;  \
	VPXOR   t0, v2, v2;   \
	VPADDQ  v3, v3, t0;   \
	VPSRLQ  $63, v3, v3;  \
	VPXOR   t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8);     \
	VMOVQ_SI_X13(i2*8);     \
	VMOVQ_SI_X14(i4*8);     \
	VMOVQ_SI_X15(i6*8);     \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0;        \
	VMOVQ_SI_X13(4*8);     \
	VMOVQ_SI_X14(1*8);     \
	VMOVQ_SI_X15(5*8);     \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8);          \
	VMOVQ_SI_X14(12*8);          \
	VMOVQ_SI_X15(7*8);           \
	VPINSRQ_1_SI_X13(5*8);       \
	VPINSRQ_1_SI_X14(2*8);       \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12;  \
	VMOVQ_SI_X13(5*8);      \
	VMOVQ_SI_X14(8*8);      \
	VMOVQ_SI_X15(2*8);      \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0;     \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13(4*8);      \
	VMOVQ_SI_X14(6*8);      \
	VMOVQ_SI_X15_0;         \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8);      \
	VMOVQ_SI_X13(2*8);      \
	VMOVQ_SI_X14_0;         \
	VMOVQ_SI_X15(4*8);      \
	VPINSRQ_1_SI_X12(5*8);  \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8);      \
	VMOVQ_SI_X13_0;         \
	VMOVQ_SI_X14(12*8);     \
	VMOVQ_SI_X15(11*8);     \
	VPINSRQ_1_SI_X12(6*8);  \
	VPINSRQ_1_SI_X13(8*8);  \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ    0*8(SI), X12;        \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ    7*8(SI), X14;        \
	MOVQ    2*8(SI), X15;        \
	VPINSRQ_1_SI_X12(6*8);       \
	VPINSRQ_1_SI_X14(3*8);       \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12;      \
	MOVQ 11*8(SI), X13;     \
	MOVQ 15*8(SI), X14;     \
	MOVQ 3*8(SI), X15;      \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0;     \
	VPINSRQ_1_SI_X14(9*8);  \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12;      \
	MOVQ 8*8(SI), X13;      \
	MOVQ 0*8(SI), X14;      \
	MOVQ 6*8(SI), X15;      \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8);  \
	VPINSRQ_1_SI_X14(4*8);  \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12;  \
	MOVQ    1*8(SI), X13;   \
	MOVQ    2*8(SI), X14;   \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8);  \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12;     \
	MOVQ 3*8(SI), X13;      \
	MOVQ 11*8(SI), X14;     \
	MOVQ 12*8(SI), X15;     \
	VPINSRQ_1_SI_X12(9*8);  \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

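// hashBlocksAVX is the 128-bit variant of hashBlocksAVX2: the same twelve
// rounds, with each row of the state split across a pair of X registers and
// HALF_ROUND_AVX plus SHUFFLE_AVX/SHUFFLE_AVX_INV doing the work of
// ROUND_AVX2.
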
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

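	// Save the original stack pointer in BP and round SP up to a 16-byte
	// boundary for the aligned spills below.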
	MOVQ SP, BP
	MOVQ SP, R9
	ADDQ $15, R9
	ANDQ $~15, R9
	MOVQ R9, SP

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(SP)
	XORQ    CX, 0(SP)          // 0(SP) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
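	// X15 = {counter low, counter high}. Working state: X0..X3 = h[0..7],
	// X4,X5 = IV[0..3], X6 = IV[4..5] ^ counter, X7 = IV[6..7] ^ {flag, 0}
	// (prepared at 0(SP) above).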
	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR   X15, X6, X6
	VMOVDQA 0(SP), X7

	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	VMOVDQA X12, 16(SP)
	VMOVDQA X13, 32(SP)
	VMOVDQA X14, 48(SP)
	VMOVDQA X15, 64(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	VMOVDQA X12, 80(SP)
	VMOVDQA X13, 96(SP)
	VMOVDQA X14, 112(SP)
	VMOVDQA X15, 128(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	VMOVDQA X12, 144(SP)
	VMOVDQA X13, 160(SP)
	VMOVDQA X14, 176(SP)
	VMOVDQA X15, 192(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	VMOVDQA X12, 208(SP)
	VMOVDQA X13, 224(SP)
	VMOVDQA X14, 240(SP)
	VMOVDQA X15, 256(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

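	// The last two rounds reuse the message schedules of the first two
	// rounds, spilled to the stack above.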
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

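	// Feed-forward: h ^= v[0..7] ^ v[8..15]. h[0..3] stay in X10 and X11
	// across blocks; h[4..7] are reloaded from and written back to memory
	// every iteration.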
	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR   X0, X10, X10
	VPXOR   X1, X11, X11
	VPXOR   X2, X14, X14
	VPXOR   X3, X15, X15
	VPXOR   X4, X10, X10
	VPXOR   X5, X11, X11
	VPXOR   X6, X14, X2
	VPXOR   X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	VZEROUPPER

	MOVQ BP, SP
	RET