// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build 386 && !gccgo && !appengine
// +build 386,!gccgo,!appengine

#include "textflag.h"

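// iv0 and iv1 hold the eight 32-bit BLAKE2s IV words (shared with
// SHA-256): the first 32 bits of the fractional parts of the square
// roots of the first eight primes.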
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16

DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16

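// rol16 and rol8 are PSHUFB shuffle masks that rotate each 32-bit lane
// of a vector right by 16 and 8 bits, respectively.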
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16

DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16

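// counter holds the constant 64, the number of message bytes consumed
// per block; it is added to the 64-bit counter t with PADDQ.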
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16

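// ROTL_SSE2 rotates each 32-bit lane of v left by n bits, using t as a
// scratch register: v = v<<n | v>>(32-n).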
#define ROTL_SSE2(n, t, v) \
	MOVO  v, t;       \
	PSLLL $n, t;      \
	PSRLL $(32-n), v; \
	PXOR  t, v

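// ROTL_SSSE3 rotates each 32-bit lane of v by a multiple of 8 bits with
// a single byte shuffle, using one of the PSHUFB masks defined above.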
#define ROTL_SSSE3(c, v) \
	PSHUFB c, v

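// ROUND_SSE2 performs one BLAKE2s round on the state rows v0..v3, with
// m0..m3 addressing the four pre-permuted message quarters for the
// round (see PRECOMPUTE). Left rotations by 16, 20, 24 and 25 implement
// the BLAKE2s right rotations by 16, 12, 8 and 7; the PSHUFL shuffles
// move the state between column and diagonal form.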
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
	PADDL  m0, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

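// ROUND_SSSE3 is ROUND_SSE2 with the rotations by 16 and 8 bits done as
// single PSHUFB byte shuffles using the masks c16 and c8.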
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
	PADDL  m0, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c16, v3);  \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c8, v3);   \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c16, v3);  \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c8, v3);   \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

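// PRECOMPUTE expands the sixteen 32-bit message words at src into ten
// 64-byte blocks at dst+off, one per round: each word is stored at
// every position where the round's sigma permutation (reordered for the
// vectorized quarter-rounds) consumes it, so the ROUND macros can use
// aligned 16-byte memory operands directly.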
#define PRECOMPUTE(dst, off, src, t) \
	MOVL 0*4(src), t;          \
	MOVL t, 0*4+off+0(dst);    \
	MOVL t, 9*4+off+64(dst);   \
	MOVL t, 5*4+off+128(dst);  \
	MOVL t, 14*4+off+192(dst); \
	MOVL t, 4*4+off+256(dst);  \
	MOVL t, 2*4+off+320(dst);  \
	MOVL t, 8*4+off+384(dst);  \
	MOVL t, 12*4+off+448(dst); \
	MOVL t, 3*4+off+512(dst);  \
	MOVL t, 15*4+off+576(dst); \
	MOVL 1*4(src), t;          \
	MOVL t, 4*4+off+0(dst);    \
	MOVL t, 8*4+off+64(dst);   \
	MOVL t, 14*4+off+128(dst); \
	MOVL t, 5*4+off+192(dst);  \
	MOVL t, 12*4+off+256(dst); \
	MOVL t, 11*4+off+320(dst); \
	MOVL t, 1*4+off+384(dst);  \
	MOVL t, 6*4+off+448(dst);  \
	MOVL t, 10*4+off+512(dst); \
	MOVL t, 3*4+off+576(dst);  \
	MOVL 2*4(src), t;          \
	MOVL t, 1*4+off+0(dst);    \
	MOVL t, 13*4+off+64(dst);  \
	MOVL t, 6*4+off+128(dst);  \
	MOVL t, 8*4+off+192(dst);  \
	MOVL t, 2*4+off+256(dst);  \
	MOVL t, 0*4+off+320(dst);  \
	MOVL t, 14*4+off+384(dst); \
	MOVL t, 11*4+off+448(dst); \
	MOVL t, 12*4+off+512(dst); \
	MOVL t, 4*4+off+576(dst);  \
	MOVL 3*4(src), t;          \
	MOVL t, 5*4+off+0(dst);    \
	MOVL t, 15*4+off+64(dst);  \
	MOVL t, 9*4+off+128(dst);  \
	MOVL t, 1*4+off+192(dst);  \
	MOVL t, 11*4+off+256(dst); \
	MOVL t, 7*4+off+320(dst);  \
	MOVL t, 13*4+off+384(dst); \
	MOVL t, 3*4+off+448(dst);  \
	MOVL t, 6*4+off+512(dst);  \
	MOVL t, 10*4+off+576(dst); \
	MOVL 4*4(src), t;          \
	MOVL t, 2*4+off+0(dst);    \
	MOVL t, 1*4+off+64(dst);   \
	MOVL t, 15*4+off+128(dst); \
	MOVL t, 10*4+off+192(dst); \
	MOVL t, 6*4+off+256(dst);  \
	MOVL t, 8*4+off+320(dst);  \
	MOVL t, 3*4+off+384(dst);  \
	MOVL t, 13*4+off+448(dst); \
	MOVL t, 14*4+off+512(dst); \
	MOVL t, 5*4+off+576(dst);  \
	MOVL 5*4(src), t;          \
	MOVL t, 6*4+off+0(dst);    \
	MOVL t, 11*4+off+64(dst);  \
	MOVL t, 2*4+off+128(dst);  \
	MOVL t, 9*4+off+192(dst);  \
	MOVL t, 1*4+off+256(dst);  \
	MOVL t, 13*4+off+320(dst); \
	MOVL t, 4*4+off+384(dst);  \
	MOVL t, 8*4+off+448(dst);  \
	MOVL t, 15*4+off+512(dst); \
	MOVL t, 7*4+off+576(dst);  \
	MOVL 6*4(src), t;          \
	MOVL t, 3*4+off+0(dst);    \
	MOVL t, 7*4+off+64(dst);   \
	MOVL t, 13*4+off+128(dst); \
	MOVL t, 12*4+off+192(dst); \
	MOVL t, 10*4+off+256(dst); \
	MOVL t, 1*4+off+320(dst);  \
	MOVL t, 9*4+off+384(dst);  \
	MOVL t, 14*4+off+448(dst); \
	MOVL t, 0*4+off+512(dst);  \
	MOVL t, 6*4+off+576(dst);  \
	MOVL 7*4(src), t;          \
	MOVL t, 7*4+off+0(dst);    \
	MOVL t, 14*4+off+64(dst);  \
	MOVL t, 10*4+off+128(dst); \
	MOVL t, 0*4+off+192(dst);  \
	MOVL t, 5*4+off+256(dst);  \
	MOVL t, 9*4+off+320(dst);  \
	MOVL t, 12*4+off+384(dst); \
	MOVL t, 1*4+off+448(dst);  \
	MOVL t, 13*4+off+512(dst); \
	MOVL t, 2*4+off+576(dst);  \
	MOVL 8*4(src), t;          \
	MOVL t, 8*4+off+0(dst);    \
	MOVL t, 5*4+off+64(dst);   \
	MOVL t, 4*4+off+128(dst);  \
	MOVL t, 15*4+off+192(dst); \
	MOVL t, 14*4+off+256(dst); \
	MOVL t, 3*4+off+320(dst);  \
	MOVL t, 11*4+off+384(dst); \
	MOVL t, 10*4+off+448(dst); \
	MOVL t, 7*4+off+512(dst);  \
	MOVL t, 1*4+off+576(dst);  \
	MOVL 9*4(src), t;          \
	MOVL t, 12*4+off+0(dst);   \
	MOVL t, 2*4+off+64(dst);   \
	MOVL t, 11*4+off+128(dst); \
	MOVL t, 4*4+off+192(dst);  \
	MOVL t, 0*4+off+256(dst);  \
	MOVL t, 15*4+off+320(dst); \
	MOVL t, 10*4+off+384(dst); \
	MOVL t, 7*4+off+448(dst);  \
	MOVL t, 5*4+off+512(dst);  \
	MOVL t, 9*4+off+576(dst);  \
	MOVL 10*4(src), t;         \
	MOVL t, 9*4+off+0(dst);    \
	MOVL t, 4*4+off+64(dst);   \
	MOVL t, 8*4+off+128(dst);  \
	MOVL t, 13*4+off+192(dst); \
	MOVL t, 3*4+off+256(dst);  \
	MOVL t, 5*4+off+320(dst);  \
	MOVL t, 7*4+off+384(dst);  \
	MOVL t, 15*4+off+448(dst); \
	MOVL t, 11*4+off+512(dst); \
	MOVL t, 0*4+off+576(dst);  \
	MOVL 11*4(src), t;         \
	MOVL t, 13*4+off+0(dst);   \
	MOVL t, 10*4+off+64(dst);  \
	MOVL t, 0*4+off+128(dst);  \
	MOVL t, 3*4+off+192(dst);  \
	MOVL t, 9*4+off+256(dst);  \
	MOVL t, 6*4+off+320(dst);  \
	MOVL t, 15*4+off+384(dst); \
	MOVL t, 4*4+off+448(dst);  \
	MOVL t, 2*4+off+512(dst);  \
	MOVL t, 12*4+off+576(dst); \
	MOVL 12*4(src), t;         \
	MOVL t, 10*4+off+0(dst);   \
	MOVL t, 12*4+off+64(dst);  \
	MOVL t, 1*4+off+128(dst);  \
	MOVL t, 6*4+off+192(dst);  \
	MOVL t, 13*4+off+256(dst); \
	MOVL t, 4*4+off+320(dst);  \
	MOVL t, 0*4+off+384(dst);  \
	MOVL t, 2*4+off+448(dst);  \
	MOVL t, 8*4+off+512(dst);  \
	MOVL t, 14*4+off+576(dst); \
	MOVL 13*4(src), t;         \
	MOVL t, 14*4+off+0(dst);   \
	MOVL t, 3*4+off+64(dst);   \
	MOVL t, 7*4+off+128(dst);  \
	MOVL t, 2*4+off+192(dst);  \
	MOVL t, 15*4+off+256(dst); \
	MOVL t, 12*4+off+320(dst); \
	MOVL t, 6*4+off+384(dst);  \
	MOVL t, 0*4+off+448(dst);  \
	MOVL t, 9*4+off+512(dst);  \
	MOVL t, 11*4+off+576(dst); \
	MOVL 14*4(src), t;         \
	MOVL t, 11*4+off+0(dst);   \
	MOVL t, 0*4+off+64(dst);   \
	MOVL t, 12*4+off+128(dst); \
	MOVL t, 7*4+off+192(dst);  \
	MOVL t, 8*4+off+256(dst);  \
	MOVL t, 14*4+off+320(dst); \
	MOVL t, 2*4+off+384(dst);  \
	MOVL t, 5*4+off+448(dst);  \
	MOVL t, 1*4+off+512(dst);  \
	MOVL t, 13*4+off+576(dst); \
	MOVL 15*4(src), t;         \
	MOVL t, 15*4+off+0(dst);   \
	MOVL t, 6*4+off+64(dst);   \
	MOVL t, 3*4+off+128(dst);  \
	MOVL t, 11*4+off+192(dst); \
	MOVL t, 7*4+off+256(dst);  \
	MOVL t, 10*4+off+320(dst); \
	MOVL t, 5*4+off+384(dst);  \
	MOVL t, 9*4+off+448(dst);  \
	MOVL t, 4*4+off+512(dst);  \
	MOVL t, 8*4+off+576(dst)

// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

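	// Save SP and round it up to a 16-byte boundary so that the
	// aligned 128-bit loads and stores below are valid.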
	MOVL SP, BP
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI
	MOVL DI, SP

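	// The first 16 bytes of the frame hold the row t0, t1, f0, f1:
	// the counter at 0(SP) and 4(SP), the flag at 8(SP), and zero at
	// 12(SP).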
	MOVL CX, 8(SP)
	MOVL 0(BX), CX
	MOVL CX, 0(SP)
	MOVL 4(BX), CX
	MOVL CX, 4(SP)
	XORL CX, CX
	MOVL CX, 12(SP)

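	// Load the chaining value h0..h7 and the counter increment.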
	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

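	// Advance the 64-bit counter by 64 and mix counter and flags into
	// the second IV half: v12..v15 = (t0, t1, f0, f1) ^ (iv4..iv7).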
	MOVO  0(SP), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(SP)

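	// Precompute the ten permuted message blocks, then run the ten
	// BLAKE2s rounds.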
	PRECOMPUTE(SP, 16, SI, CX)
	ROUND_SSE2(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3)

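	// Feed-forward: h ^= v0..v7 ^ v8..v15.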
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

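	// Write back the updated counter and chaining value, then restore
	// the original stack pointer.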
	MOVL 0(SP), CX
	MOVL CX, 0(BX)
	MOVL 4(SP), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	MOVL BP, SP
	RET

// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	MOVL SP, BP
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI
	MOVL DI, SP

	MOVL CX, 8(SP)
	MOVL 0(BX), CX
	MOVL CX, 0(SP)
	MOVL 4(BX), CX
	MOVL CX, 4(SP)
	XORL CX, CX
	MOVL CX, 12(SP)

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
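	// X0 and X1 are needed for the PSHUFB rotation masks during the
	// rounds, so spill the chaining value to the top of the frame.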
	MOVO  X0, 656(SP)
	MOVO  X1, 672(SP)
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	MOVO  0(SP), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(SP)

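	// Keep the rotation masks in X0 and X1 for the whole block.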
	MOVOU rol16<>(SB), X0
	MOVOU rol8<>(SB), X1

	PRECOMPUTE(SP, 16, SI, CX)
	ROUND_SSSE3(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3, X0, X1)

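	// Reload the chaining value and apply the feed-forward.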
	MOVO 656(SP), X0
	MOVO 672(SP), X1
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	MOVL 0(SP), CX
	MOVL CX, 0(BX)
	MOVL 4(SP), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	MOVL BP, SP
	RET