########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
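########################################################################
# Reference equations (FIPS 180-4) that the code below implements:
#
# message schedule, computed four words at a time in FOUR_ROUNDS_AND_SCHED:
#     W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
#     s0(x) = (x ror  7) ^ (x ror 18) ^ (x >>  3)
#     s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# per-round update of the working variables a..h:
#     S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#     S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#     CH  = (e & f) ^ (~e & g)        (computed as ((f^g)&e)^g)
#     MAJ = (a & b) ^ (a & c) ^ (b & c)   (computed as ((a|c)&b)|(a&c))
#     h += S1 + CH + K[t] + W[t];  d += h;  h += S0 + MAJ
#     then the names a..h rotate by one position (see ROTATE_ARGS)
########################################################################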

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define    VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm
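# e.g. "addm (4*0)(CTX), a" adds the 32-bit word at CTX[0] into a and writes
# the sum back to CTX[0]; this is how the working variables are folded back
# into the digest after each block (see the addm sequence after .Lloop2).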


.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm
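# shld of a register with itself by (32-n) rotates the value left by (32-n)
# bits, which for a 32-bit operand is the same as rotating right by n, so
# "MY_ROR n, reg" behaves like "ror $n, reg".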

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
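# The input buffer may be unaligned, so the load uses VMOVDQ (vmovdqu); the
# vpshufb with BYTE_FLIP_MASK converts the four big-endian message dwords
# to the CPU's little-endian order.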

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
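
# Resulting frame layout, relative to the aligned %rsp:
#     _INP_END (offset  0,  8 bytes): pointer just past the last input block
#     _INP     (offset  8,  8 bytes): saved pointer to the current block
#     _XFER    (offset 16, 16 bytes): K[t]+W[t] values for the next 4 rounds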

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
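# Neither macro emits any instructions: the "rotation" happens at assembly
# time by rebinding the symbolic names, so each round body can be written as
# if it always operates on a..h (and each schedule step on X0..X3).  Eight
# ROTATE_ARGS (or four rotate_Xs) bring the names back to their original
# register assignments.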

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov     e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpsrld  $7, XTMP1, XTMP2
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	vpsrld  $18, XTMP1, XTMP2       #
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld  $(32-18), XTMP1, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3     #
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor   XTMP3, XTMP2, XTMP2     #
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     e, y2                   # y2 = (f^g)&e
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        add     y0, y2                  # y2 = S1 + CH
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        offset = \round * 4 + _XFER     #
        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
        mov     a, y0			# y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm
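# DO_ROUND is the plain compression round used for the final 16 rounds
# (rounds 48-63), where the message schedule is already complete and the
# pre-added K[t]+W[t] value is simply read back from _XFER on the stack.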

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
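##
## Note: the routine clobbers XMM registers, so the C glue code is expected
## to call it between kernel_fpu_begin() and kernel_fpu_end().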
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq    $STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer
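	# %rsp is realigned to 16 bytes so that the vmovdqa spills to
	# _XFER(%rsp) below are aligned stores; the original stack pointer
	# is recovered through %rbp in the epilogue.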

	shl     $6, NUM_BLKS		# convert to bytes
	jz      .Ldone_hash
	add     INP, NUM_BLKS		# pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
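	# 64 rounds total: .Lloop1 executes 3 times, doing 16 rounds per pass
	# (4 x FOUR_ROUNDS_AND_SCHED) while scheduling W[16..63]; .Lloop2 then
	# executes twice, consuming the remaining 16 scheduled words held in
	# X0..X3 with plain DO_ROUNDs.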
.align 16
.Lloop1:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     .Lloop1

	mov     $2, SRND
.Lloop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub     $1, SRND
	jne     .Lloop2

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     .Lloop0

.Ldone_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq	%r12
	popq    %rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF