########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
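#
# Concretely, each 256-bit ymm register used for the message schedule
# carries four dwords from each of the two blocks being processed, one
# 128-bit half per block (see the vperm2i128 transpose in loop0 below),
# so a single vector instruction advances the schedule for both blocks.
########################################################################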

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
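# For example, "addm (4*0)(CTX), a" below adds working variable a into
# the first digest word in memory and stores the sum back to (4*0)(CTX).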

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
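
# Frame layout implied by the definitions above (offsets from the
# 32-byte-aligned %rsp; _XMM_SAVE is empty on x86_64):
#   _XFER      =   0   (512 bytes of pre-added K+W round data)
#   _INP_END   = 512   (pointer to the last input block)
#   _INP       = 520   (current input pointer)
#   _CTX       = 528   (saved digest/state pointer)
#   STACK_SIZE = 536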

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

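########################################################################
# For reference, the scalar SHA-256 operations (FIPS 180-4) that the S0,
# S1, CH and MAJ annotations in the macros below stand for, written as
# C-style pseudocode on 32-bit words:
#
#   CH(e,f,g)  = ((f ^ g) & e) ^ g
#   MAJ(a,b,c) = ((a | c) & b) | (a & c)
#   S0(a)      = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
#   S1(e)      = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
#   s0(x)      = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
#   s1(x)      = ror(x,17) ^ ror(x,19) ^ (x >> 10)
#
#   W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]      # schedule
#   T1   = h + S1(e) + CH(e,f,g) + K[t] + W[t]              # per round
#   T2   = S0(a) + MAJ(a,b,c)
#   h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
########################################################################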
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
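##
## A C caller (e.g. the SHA-256 glue code) is expected to declare this
## routine along the lines of:
##
##	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
##					      const u8 *data, int blocks);
##
########################################################################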
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1
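	# SRND advances by 4*32 = 128 bytes of stacked K+W data per pass, so
	# loop1 runs three times (SRND = 0, 128, 256), covering rounds 0-47.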

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2
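	# Two passes of 8 rounds each (SRND = 384, 448) finish rounds 48-63
	# for the first block of the interleaved pair.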

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3
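	# The extra +16 byte displacement selects the upper 128-bit half of
	# each saved XFER entry, i.e. the K+W words belonging to the second
	# block of the interleaved pair.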

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
768