// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The AVX2 version is described in an Intel white paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, go to http://www.intel.com/p/en_US/embedded
// and search for that title.
// The AVX2 version is by Intel and uses the same algorithm as the code in the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

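// For orientation, a rough Go sketch of the block loop described above (illustrative
// only; the maintained pure-Go version lives in sha256block.go, and the helper names
// sigma0/sigma1/bigSigma0/bigSigma1/ch/maj/_K used below are hypothetical):
//
//	func blockSketch(h *[8]uint32, p []byte) {
//		var w [64]uint32
//		for len(p) >= 64 {
//			for t := 0; t < 16; t++ {
//				w[t] = binary.BigEndian.Uint32(p[t*4:])
//			}
//			for t := 16; t < 64; t++ {
//				w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//			}
//			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
//			for t := 0; t < 64; t++ {
//				t1 := hh + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//				t2 := bigSigma0(a) + maj(a, b, c)
//				hh, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//			}
//			h[0] += a; h[1] += b; h[2] += c; h[3] += d
//			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
//			p = p[64:]
//		}
//	}
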
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL	(index*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL	((index-2)*4)(BP), AX; \
	MOVL	AX, CX; \
	RORL	$17, AX; \
	MOVL	CX, DX; \
	RORL	$19, CX; \
	SHRL	$10, DX; \
	MOVL	((index-15)*4)(BP), BX; \
	XORL	CX, AX; \
	MOVL	BX, CX; \
	XORL	DX, AX; \
	RORL	$7, BX; \
	MOVL	CX, DX; \
	SHRL	$3, DX; \
	RORL	$18, CX; \
	ADDL	((index-7)*4)(BP), AX; \
	XORL	CX, BX; \
	XORL	DX, BX; \
	ADDL	((index-16)*4)(BP), BX; \
	ADDL	BX, AX; \
	MOVL	AX, ((index)*4)(BP)

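// The same schedule step as a small Go sketch (illustrative; the function names are
// hypothetical, not the ones used by the package):
//
//	func sigma0(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3) }
//	func sigma1(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10) }
//
//	// w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
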
// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL	AX, h; \
	MOVL	e, AX; \
	ADDL	$const, h; \
	MOVL	e, CX; \
	RORL	$6, AX; \
	MOVL	e, DX; \
	RORL	$11, CX; \
	XORL	CX, AX; \
	MOVL	e, CX; \
	RORL	$25, DX; \
	ANDL	f, CX; \
	XORL	AX, DX; \
	MOVL	e, AX; \
	NOTL	AX; \
	ADDL	DX, h; \
	ANDL	g, AX; \
	XORL	CX, AX; \
	ADDL	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL	a, DI; \
	MOVL	c, BX; \
	RORL	$2, DI; \
	MOVL	a, DX; \
	ANDL	b, BX; \
	RORL	$13, DX; \
	MOVL	a, CX; \
	ANDL	c, CX; \
	XORL	DX, DI; \
	XORL	CX, BX; \
	MOVL	a, DX; \
	MOVL	b, CX; \
	RORL	$22, DX; \
	ANDL	a, CX; \
	XORL	CX, BX; \
	XORL	DX, DI; \
	ADDL	DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL	BX, h; \
	ADDL	AX, d; \
	ADDL	AX, h

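// One scalar round in Go form (a sketch under the same naming assumptions as above;
// bigSigma0/bigSigma1/ch/maj are hypothetical helpers):
//
//	t1 := h + bigSigma1(e) + ch(e, f, g) + k + w
//	t2 := bigSigma0(a) + maj(a, b, c)
//	d += t1     // becomes the new e after the register rotation
//	h = t1 + t2 // becomes the new a after the register rotation
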
#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)


// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem and copy the sum back into reg:
//   mem = mem + reg; reg = mem
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2
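// In Go terms, addm(i*4(CTX), reg) behaves like (sketch, hypothetical names):
//
//	dig[i] += reg
//	reg = dig[i]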

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER  Y9

#define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP	DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE

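// Stack layout used by the AVX2 path (derived from the sizes above):
// _XFER is a 2*64*4 = 512-byte area holding the constant-added, scheduled message
// words for the two interleaved blocks, written 32 bytes (one Y register) per group
// of four rounds; _INP_END (8 bytes) holds the pointer to the last block and
// _INP (8 bytes) the current input pointer, for a total of 528 bytes.
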
#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                     \ // #############################  RND N + 0 ############################//
	MOVL     a, y3;                       \ // y3 = a					// MAJA
	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
	;                                     \
	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
	ORL      c, y3;                       \ // y3 = a|c				// MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL     f, y2;                       \ // y2 = f				// CH
	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
	;                                     \
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
	XORL     g, y2;                       \ // y2 = f^g                              	// CH
	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	;                                     \
	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
	;                                     \
	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	;                                     \
	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
	VPSRLD   $7, XTMP1, XTMP2;            \ // XTMP2 = W[-15] >> 7
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL     a, T1;                       \ // T1 = a								// MAJB
	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
	;                                     \
	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
	VPSLLD   $(32-7), XTMP1, XTMP3;       \ // XTMP3 = W[-15] << 25
	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
	;                                     \
	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
	;                                     \
	VPSRLD   $18, XTMP1, XTMP2;           \ // XTMP2 = W[-15] >> 18
	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --

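// AVX2 has no vector rotate instruction, so the schedule macros synthesize ROTR from
// shifts: the VPSRLD $7 / VPSLLD $(32-7) / VPOR trio above computes W[-15] ror 7.
// In Go terms (sketch):
//
//	ror7 := (x >> 7) | (x << (32 - 7)) // == bits.RotateLeft32(x, -7)
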
#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 1 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a                       // MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
	ORL     c, y3;                       \ // y3 = a|c						// MAJA
	;                                    \
	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
	MOVL    f, y2;                       \ // y2 = f						// CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
	XORL    g, y2;                       \ // y2 = f^g						// CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
	ADDL    h, d;                        \ // d = k + w + h + d				// --
	;                                    \
	VPSLLD  $(32-18), XTMP1, XTMP1;      \ // XTMP1 = W[-15] << 14
	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	;                                    \
	VPXOR   XTMP1, XTMP3, XTMP3;         \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
	;                                    \
	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL    a, T1;                       \ // T1 = a						// MAJB
	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
	;                                    \
	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	;                                    \
	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
	;                                    \
	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 2 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a							// MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
	MOVL    f, y2;                       \ // y2 = f                           // CH
	XORL    g, y2;                       \ // y2 = f^g                         // CH
	;                                    \
	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	ADDL    h, d;                        \ // d = k + w + h + d				// --
	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
	;                                    \
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	;                                    \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL    a, T1;                       \ // T1 = a                                // MAJB
	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
	;                                    \
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)             // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	;                                    \
	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 3 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a						// MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
	;                                    \
	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL    f, y2;                       \ // y2 = f						// CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
	XORL    g, y2;                       \ // y2 = f^g						// CH
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
	ADDL    h, d;                        \ // d = k + w + h + d			// --
	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
	;                                    \
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	;                                    \
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
	;                                    \
	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	;                                    \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	;                                    \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
	;                                    \
	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL    a, T1;                       \ // T1 = a							// MAJB
	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
	;                                    \
	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --

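// Taken together, the four ROUND_AND_SCHED_N_* macros advance four message words while
// executing four rounds. The per-word arithmetic they implement is (Go sketch):
//
//	s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
//	s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
//	w[t] = w[t-16] + s0 + w[t-7] + s1
//
// done for four consecutive t at once, on both interleaved blocks (one per 128-bit lane).
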
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 0 ###########################
	MOVL  f, y2;                       \ // y2 = f					// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
	XORL  g, y2;                       \ // y2 = f^g					// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
	MOVL  a, y3;                       \ // y3 = a							// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL   c, y3;                       \ // y3 = a|c							// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL  a, T1;                       \ // T1 = a							// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 1 ###########################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
	XORL  g, y2;                       \ // y2 = f^g                             // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a                               // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a                               // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                    // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
	;                                  \
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 2 ##############################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL  f, y2;                       \ // y2 = f								// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
	XORL  g, y2;                       \ // y2 = f^g								// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
	ORL   c, y3;                       \ // y3 = a|c								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a								// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	;                                  \
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 3 ###########################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL  f, y2;                       \ // y2 = f								// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
	XORL  g, y2;                       \ // y2 = f^g								// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
	ORL   c, y3;                       \ // y3 = a|c								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a								// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	;                                  \
	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
	;                                  \
	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	;                                  \
	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --

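// Note on the DO_ROUND_N_* macros: within a group of four rounds the final two
// additions into h are deferred. DO_ROUND_N_1/2/3 complete the previous round's h
// (passed in as old_h) at the start of their own work, and DO_ROUND_N_3 finishes its
// own h at the end of the group, which shortens the dependency chain between rounds.
//
// The assembly below is the body of the block function declared on the Go side,
// roughly (sketch; see the package's declaration file for the actual signature):
//
//	//go:noescape
//	func block(dig *digest, p []byte)
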
TEXT ·block(SB), 0, $536-32
	CMPB ·useAVX2(SB), $1
	JE   avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX // DX = len rounded down to a multiple of the 64-byte block size

	LEAQ (SI)(DX*1), DI // DI = end of the data to be processed
	MOVQ DI, 256(SP)    // stash the end pointer just past the 256-byte W scratch area
	CMPQ SI, DI
	JEQ  end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8  // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9  // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB   loop

end:
	RET

avx2:
	MOVQ dig+0(FP), CTX          // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE   avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // each iteration loads two 64-byte blocks and schedules their messages together

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
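	// After the transpose, each XDWORD register holds four words of the current block
	// in its low 128-bit lane and the corresponding four words of the next block in its
	// high lane, so the message schedule below is computed for two blocks at once. The
	// scalar rounds consume the low-lane results; the high-lane results are stored to
	// the XFER area and replayed later by avx2_loop3 (the "+ 16" displacements).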

	MOVQ $K256<>(SB), TBL // load the address of the table of round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

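	// Round structure from here: avx2_loop1 runs three times (rounds 0-47), doing four
	// rounds per scheduled XDWORD and scheduling the next words as it goes; avx2_loop2
	// runs twice (rounds 48-63) with no further scheduling. SRND advances through the
	// XFER area and the K256 table, 32 bytes per group of four rounds.
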
avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB   avx2_loop1

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0 // rotate the remaining scheduled words into place
	VMOVDQU XDWORD3, XDWORD1 // for the second pass through avx2_loop2

	CMPQ SRND, $4*4*32
	JB   avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB   done_hash

	XORQ SRND, SRND

avx2_loop3: // Do the second block using the previously scheduled results (kept in the upper halves of the XFER slots, hence the "+ 16" displacements)
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB   avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA   avx2_loop0
	JB   done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32
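// Worked example: the first 8 mask bytes, stored little-endian, are 03 02 01 00 07 06
// 05 04, so VPSHUFB maps dest[0] = src[3], dest[1] = src[2], dest[2] = src[1],
// dest[3] = src[0], and so on: a byte swap of each 32-bit word, converting the
// big-endian message words into the register byte order.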

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants. Each group of four is stored twice so that a single
// 32-byte load places the same constants in both 128-bit lanes, matching the
// two-blocks-per-register layout used by the message schedule.
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512