/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
*/
/*
 * Conversion to GAS assembly and integration to libgcrypt
 *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Note: original implementation was named as SHA256-SSE4. However, only SSSE3
 *       is required.
 */
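/*
 * The only instructions used below that go beyond SSE2 are pshufb and
 * palignr; everything else (movdqa/movdqu, paddd, pslld/psrld/psrlq,
 * pshufd, por, pxor) is SSE2, which is why SSSE3 is the actual requirement
 * despite the original "SSE4" name.
 */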

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)

#include "asm-common-amd64.h"

.intel_syntax noprefix

#define	MOVDQ movdqu /* assume buffers not aligned */

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/

/* addm [mem], reg
 * Add reg to mem using reg-mem add and store */
#define addm(p1, p2) \
	add	p2, p1; \
	mov	p1, p2;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 * Load xmm with mem and byte swap each dword */
#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
	MOVDQ p1, p2; \
	pshufb p1, p3;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

#define X0 xmm4
#define X1 xmm5
#define X2 xmm6
#define X3 xmm7

#define XTMP0 xmm0
#define XTMP1 xmm1
#define XTMP2 xmm2
#define XTMP3 xmm3
#define XTMP4 xmm8
#define XFER xmm9

#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
#define BYTE_FLIP_MASK xmm12

#define NUM_BLKS rdx	/* 3rd arg */
#define CTX rsi	/* 2nd arg */
#define INP rdi	/* 1st arg */

#define SRND rdi	/* clobbers INP */
#define c ecx
#define d r8d
#define e edx

#define TBL rbp
#define a eax
#define b ebx

#define f r9d
#define g r10d
#define h r11d

#define y0 r13d
#define y1 r14d
#define y2 r15d



#define _INP_END_SIZE	8
#define _INP_SIZE	8
#define _XFER_SIZE	8
#define _XMM_SAVE_SIZE	0
/* STACK_SIZE plus pushes must be an odd multiple of 8 */
#define _ALIGN_SIZE	8

#define _INP_END	0
#define _INP		(_INP_END  + _INP_END_SIZE)
#define _XFER		(_INP      + _INP_SIZE)
#define _XMM_SAVE	(_XFER     + _XFER_SIZE + _ALIGN_SIZE)
#define STACK_SIZE	(_XMM_SAVE + _XMM_SAVE_SIZE)
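
/*
 * Resulting stack frame layout (relative to rsp after the 'sub rsp,
 * STACK_SIZE' in the prologue below):
 *
 *   [rsp + _INP_END]  (+0)   pointer to end of the input data
 *   [rsp + _INP]      (+8)   saved input pointer for the current block
 *   [rsp + _XFER]     (+16)  K[t] + W[t] for the next four rounds; together
 *                            with _ALIGN_SIZE this gives a full 16-byte,
 *                            16-byte-aligned slot for the movdqa stores
 *
 * STACK_SIZE is 32; added to the five pushes this is an odd multiple of 8,
 * which keeps rsp (and hence _XFER) 16-byte aligned inside the function.
 */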


#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
		/* compute s0 four at a time and s1 two at a time */; \
		/* compute W[-16] + W[-7] 4 at a time */; \
		movdqa	XTMP0, X3; \
	mov	y0, e		/* y0 = e */; \
	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
	mov	y1, a		/* y1 = a */; \
		palignr	XTMP0, X2, 4	/* XTMP0 = W[-7] */; \
	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
	mov	y2, f		/* y2 = f */; \
	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
		movdqa	XTMP1, X1; \
	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
	xor	y2, g		/* y2 = f^g */; \
		paddd	XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */; \
	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
	and	y2, e		/* y2 = (f^g)&e */; \
	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
		/* compute s0 */; \
		palignr	XTMP1, X0, 4	/* XTMP1 = W[-15] */; \
	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
		movdqa	XTMP2, XTMP1	/* XTMP2 = W[-15] */; \
	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
	add	y2, y0		/* y2 = S1 + CH */; \
	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */; \
		movdqa	XTMP3, XTMP1	/* XTMP3 = W[-15] */; \
	mov	y0, a		/* y0 = a */; \
	add	h, y2		/* h = h + S1 + CH + k + w */; \
	mov	y2, a		/* y2 = a */; \
		pslld	XTMP1, (32-7); \
	or	y0, c		/* y0 = a|c */; \
	add	d, h		/* d = d + h + S1 + CH + k + w */; \
	and	y2, c		/* y2 = a&c */; \
		psrld	XTMP2, 7; \
	and	y0, b		/* y0 = (a|c)&b */; \
	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
		por	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 */; \
	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
		movdqa	XTMP2, XTMP3	/* XTMP2 = W[-15] */; \
	mov	y0, e		/* y0 = e */; \
	mov	y1, a		/* y1 = a */; \
		movdqa	XTMP4, XTMP3	/* XTMP4 = W[-15] */; \
	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
	mov	y2, f		/* y2 = f */; \
	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
		pslld	XTMP3, (32-18); \
	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
	xor	y2, g		/* y2 = f^g */; \
		psrld	XTMP2, 18; \
	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
	and	y2, e		/* y2 = (f^g)&e */; \
	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
		pxor	XTMP1, XTMP3; \
	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
		psrld	XTMP4, 3	/* XTMP4 = W[-15] >> 3 */; \
	add	y2, y0		/* y2 = S1 + CH */; \
	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */; \
	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
		pxor	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
	mov	y0, a		/* y0 = a */; \
	add	h, y2		/* h = h + S1 + CH + k + w */; \
	mov	y2, a		/* y2 = a */; \
		pxor	XTMP1, XTMP4	/* XTMP1 = s0 */; \
	or	y0, c		/* y0 = a|c */; \
	add	d, h		/* d = d + h + S1 + CH + k + w */; \
	and	y2, c		/* y2 = a&c */; \
		/* compute low s1 */; \
		pshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */; \
	and	y0, b		/* y0 = (a|c)&b */; \
	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
		paddd	XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */; \
	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {BBAA} */; \
	mov	y0, e		/* y0 = e */; \
	mov	y1, a		/* y1 = a */; \
	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
		movdqa	XTMP4, XTMP2	/* XTMP4 = W[-2] {BBAA} */; \
	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
	mov	y2, f		/* y2 = f */; \
	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */; \
	xor	y2, g		/* y2 = f^g */; \
		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */; \
	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
	and	y2, e		/* y2 = (f^g)&e */; \
		psrld	XTMP4, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */; \
	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
		pxor	XTMP2, XTMP3; \
	add	y2, y0		/* y2 = S1 + CH */; \
	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */; \
		pxor	XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */; \
	mov	y0, a		/* y0 = a */; \
	add	h, y2		/* h = h + S1 + CH + k + w */; \
	mov	y2, a		/* y2 = a */; \
		pshufb	XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */; \
	or	y0, c		/* y0 = a|c */; \
	add	d, h		/* d = d + h + S1 + CH + k + w */; \
	and	y2, c		/* y2 = a&c */; \
		paddd	XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */; \
	and	y0, b		/* y0 = (a|c)&b */; \
	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
		/* compute high s1 */; \
		pshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {DDCC} */; \
	mov	y0, e		/* y0 = e */; \
	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
	mov	y1, a		/* y1 = a */; \
		movdqa	X0,    XTMP2	/* X0    = W[-2] {DDCC} */; \
	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
	mov	y2, f		/* y2 = f */; \
	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */; \
	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
	xor	y2, g		/* y2 = f^g */; \
		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */; \
	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
	and	y2, e		/* y2 = (f^g)&e */; \
	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
		psrld	X0,    10	/* X0 = W[-2] >> 10 {DDCC} */; \
	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
		pxor	XTMP2, XTMP3; \
	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
	add	y2, y0		/* y2 = S1 + CH */; \
	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */; \
		pxor	X0, XTMP2	/* X0 = s1 {xDxC} */; \
	mov	y0, a		/* y0 = a */; \
	add	h, y2		/* h = h + S1 + CH + k + w */; \
	mov	y2, a		/* y2 = a */; \
		pshufb	X0, SHUF_DC00	/* X0 = s1 {DC00} */; \
	or	y0, c		/* y0 = a|c */; \
	add	d, h		/* d = d + h + S1 + CH + k + w */; \
	and	y2, c		/* y2 = a&c */; \
		paddd	X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */; \
	and	y0, b		/* y0 = (a|c)&b */; \
	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
	FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
	FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
	FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
	FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
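
/*
 * Each FOUR_ROUNDS_AND_SCHED invocation performs four rounds and, interleaved
 * with them, computes the next four message-schedule words per the SHA-256
 * recurrence (FIPS 180-4):
 *
 *   s0   = (W[-15] ror  7) ^ (W[-15] ror 18) ^ (W[-15] >>  3)
 *   s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
 *   W[0] = W[-16] + s0 + W[-7] + s1
 *
 * The four sub-macros receive the working variables a..h rotated by one
 * position each, which implements the per-round renaming of the state
 * without any register moves.
 */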

/* input is [rsp + _XFER + i1 * 4] */
#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
	mov	y0, e		/* y0 = e */; \
	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
	mov	y1, a		/* y1 = a */; \
	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
	mov	y2, f		/* y2 = f */; \
	xor	y1, a		/* y1 = a ^ (a >> (22-13)) */; \
	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
	xor	y2, g		/* y2 = f^g */; \
	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
	and	y2, e		/* y2 = (f^g)&e */; \
	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
	add	y2, y0		/* y2 = S1 + CH */; \
	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
	add	y2, [rsp + _XFER + i1 * 4]	/* y2 = k + w + S1 + CH */; \
	mov	y0, a		/* y0 = a */; \
	add	h, y2		/* h = h + S1 + CH + k + w */; \
	mov	y2, a		/* y2 = a */; \
	or	y0, c		/* y0 = a|c */; \
	add	d, h		/* d = d + h + S1 + CH + k + w */; \
	and	y2, c		/* y2 = a&c */; \
	and	y0, b		/* y0 = (a|c)&b */; \
	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
	or	y0, y2		/* y0 = MAJ = ((a|c)&b)|(a&c) */; \
	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
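
/*
 * For reference, DO_ROUND (and the scalar half of the FOUR_ROUNDS_AND_SCHED_*
 * macros) computes one SHA-256 round. A rough C equivalent, assuming a
 * rotr32() rotate-right helper that is not part of this file:
 *
 *   u32 S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
 *   u32 ch  = ((f ^ g) & e) ^ g;
 *   u32 S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
 *   u32 maj = ((a | c) & b) | (a & c);
 *   u32 t1  = h + S1 + ch + kw;   (kw = K[t] + W[t], preloaded at _XFER)
 *   d += t1;
 *   h  = t1 + S0 + maj;
 */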

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
*/
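/*
 * In libgcrypt the routine is exported as _gcry_sha256_transform_amd64_ssse3;
 * the C side is assumed to declare it along the lines of
 *
 *   unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data,
 *                                                   u32 digest[8],
 *                                                   size_t num_blks);
 *
 * The routine clears its _XFER scratch slot and returns 0 in eax (see
 * .Ldone_hash below).
 */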
.text
.globl _gcry_sha256_transform_amd64_ssse3
ELF(.type  _gcry_sha256_transform_amd64_ssse3,@function;)
.align 16
_gcry_sha256_transform_amd64_ssse3:
	CFI_STARTPROC()
	push	rbx
	CFI_PUSH(rbx)
	push	rbp
	CFI_PUSH(rbp)
	push	r13
	CFI_PUSH(r13)
	push	r14
	CFI_PUSH(r14)
	push	r15
	CFI_PUSH(r15)

	sub	rsp, STACK_SIZE
	CFI_ADJUST_CFA_OFFSET(STACK_SIZE);

	shl	NUM_BLKS, 6	/* convert to bytes */
	jz	.Ldone_hash
	add	NUM_BLKS, INP	/* pointer to end of data */
	mov	[rsp + _INP_END], NUM_BLKS

	/* load initial digest */
	mov	a,[4*0 + CTX]
	mov	b,[4*1 + CTX]
	mov	c,[4*2 + CTX]
	mov	d,[4*3 + CTX]
	mov	e,[4*4 + CTX]
	mov	f,[4*5 + CTX]
	mov	g,[4*6 + CTX]
	mov	h,[4*7 + CTX]

	movdqa	BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
	movdqa	SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
	movdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]

.Loop0:
	lea	TBL, [.LK256 ADD_RIP]

	/* byte swap first 16 dwords */
	COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
	COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
	COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
	COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

	mov	[rsp + _INP], INP

	/* schedule 48 input dwords, by doing 3 rounds of 16 each */
	mov	SRND, 3
.align 16
.Loop1:
	movdqa	XFER, [TBL + 0*16]
	paddd	XFER, X0
	movdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

	movdqa	XFER, [TBL + 1*16]
	paddd	XFER, X1
	movdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

	movdqa	XFER, [TBL + 2*16]
	paddd	XFER, X2
	movdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

	movdqa	XFER, [TBL + 3*16]
	paddd	XFER, X3
	movdqa	[rsp + _XFER], XFER
	add	TBL, 4*16
	FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

	sub	SRND, 1
	jne	.Loop1

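
	/* The last 16 rounds use the W values already sitting in X0..X3;
	 * no further message scheduling is needed, so plain DO_ROUND is
	 * used, 8 rounds per .Loop2 iteration. */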
	mov	SRND, 2
.Loop2:
	paddd	X0, [TBL + 0*16]
	movdqa	[rsp + _XFER], X0
	DO_ROUND(0, a, b, c, d, e, f, g, h)
	DO_ROUND(1, h, a, b, c, d, e, f, g)
	DO_ROUND(2, g, h, a, b, c, d, e, f)
	DO_ROUND(3, f, g, h, a, b, c, d, e)
	paddd	X1, [TBL + 1*16]
	movdqa	[rsp + _XFER], X1
	add	TBL, 2*16
	DO_ROUND(0, e, f, g, h, a, b, c, d)
	DO_ROUND(1, d, e, f, g, h, a, b, c)
	DO_ROUND(2, c, d, e, f, g, h, a, b)
	DO_ROUND(3, b, c, d, e, f, g, h, a)

	movdqa	X0, X2
	movdqa	X1, X3

	sub	SRND, 1
	jne	.Loop2

	addm([4*0 + CTX],a)
	addm([4*1 + CTX],b)
	addm([4*2 + CTX],c)
	addm([4*3 + CTX],d)
	addm([4*4 + CTX],e)
	addm([4*5 + CTX],f)
	addm([4*6 + CTX],g)
	addm([4*7 + CTX],h)

	mov	INP, [rsp + _INP]
	add	INP, 64
	cmp	INP, [rsp + _INP_END]
	jne	.Loop0

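
	/* Clear intermediate values from the xmm registers before returning,
	 * so that message-derived data does not linger in registers. */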
	pxor	xmm0, xmm0
	pxor	xmm1, xmm1
	pxor	xmm2, xmm2
	pxor	xmm3, xmm3
	pxor	xmm4, xmm4
	pxor	xmm5, xmm5
	pxor	xmm6, xmm6
	pxor	xmm7, xmm7
	pxor	xmm8, xmm8
	pxor	xmm9, xmm9
	pxor	xmm10, xmm10
	pxor	xmm11, xmm11
	pxor	xmm12, xmm12

.Ldone_hash:
	pxor	XFER, XFER
	movdqa	[rsp + _XFER], XFER
	xor     eax, eax

	add	rsp, STACK_SIZE
	CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);

	pop	r15
	CFI_POP(r15)
	pop	r14
	CFI_POP(r14)
	pop	r13
	CFI_POP(r13)
	pop	rbp
	CFI_POP(rbp)
	pop	rbx
	CFI_POP(rbx)

	ret
	CFI_ENDPROC()


.align 16
.LK256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203

/* shuffle xBxA -> 00BA */
.L_SHUF_00BA:              .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

/* shuffle xDxC -> DC00 */
.L_SHUF_DC00:              .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif
#endif