########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add, then store the sum back to mem.
# (Both the memory location and the register end up holding mem+reg.)
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

# MY_ROR amt, reg
# Rotate a 32-bit register right by \p1 bits.
# Implemented as a double-precision shift left by (32 - amt) of the
# register with itself, which is equivalent to a right-rotate.
.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
Chen.macro COPY_XMM_AND_BSWAP p1 p2 p3 75ec2b4c85STim Chen VMOVDQ \p2, \p1 76ec2b4c85STim Chen vpshufb \p3, \p1, \p1 77ec2b4c85STim Chen.endm 78ec2b4c85STim Chen 79ec2b4c85STim Chen################################ 80ec2b4c85STim Chen 81ec2b4c85STim ChenX0 = %xmm4 82ec2b4c85STim ChenX1 = %xmm5 83ec2b4c85STim ChenX2 = %xmm6 84ec2b4c85STim ChenX3 = %xmm7 85ec2b4c85STim Chen 86ec2b4c85STim ChenXTMP0 = %xmm0 87ec2b4c85STim ChenXTMP1 = %xmm1 88ec2b4c85STim ChenXTMP2 = %xmm2 89ec2b4c85STim ChenXTMP3 = %xmm3 90ec2b4c85STim ChenXTMP4 = %xmm8 91ec2b4c85STim ChenXFER = %xmm9 92ec2b4c85STim ChenXTMP5 = %xmm11 93ec2b4c85STim Chen 94ec2b4c85STim ChenSHUF_00BA = %xmm10 # shuffle xBxA -> 00BA 95ec2b4c85STim ChenSHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 96ec2b4c85STim ChenBYTE_FLIP_MASK = %xmm13 97ec2b4c85STim Chen 98ec2b4c85STim ChenNUM_BLKS = %rdx # 3rd arg 991631030aSArd BiesheuvelINP = %rsi # 2nd arg 1001631030aSArd BiesheuvelCTX = %rdi # 1st arg 101ec2b4c85STim Chen 1021631030aSArd BiesheuvelSRND = %rsi # clobbers INP 103ec2b4c85STim Chenc = %ecx 104ec2b4c85STim Chend = %r8d 105ec2b4c85STim Chene = %edx 106673ac6fbSJosh PoimboeufTBL = %r12 107ec2b4c85STim Chena = %eax 108ec2b4c85STim Chenb = %ebx 109ec2b4c85STim Chen 110ec2b4c85STim Chenf = %r9d 111ec2b4c85STim Cheng = %r10d 112ec2b4c85STim Chenh = %r11d 113ec2b4c85STim Chen 114ec2b4c85STim Cheny0 = %r13d 115ec2b4c85STim Cheny1 = %r14d 116ec2b4c85STim Cheny2 = %r15d 117ec2b4c85STim Chen 118ec2b4c85STim Chen 119ec2b4c85STim Chen_INP_END_SIZE = 8 120ec2b4c85STim Chen_INP_SIZE = 8 121de614e56SJussi Kivilinna_XFER_SIZE = 16 122ec2b4c85STim Chen_XMM_SAVE_SIZE = 0 123ec2b4c85STim Chen 124ec2b4c85STim Chen_INP_END = 0 125ec2b4c85STim Chen_INP = _INP_END + _INP_END_SIZE 126ec2b4c85STim Chen_XFER = _INP + _INP_SIZE 127ec2b4c85STim Chen_XMM_SAVE = _XFER + _XFER_SIZE 128ec2b4c85STim ChenSTACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE 129ec2b4c85STim Chen 130ec2b4c85STim Chen# rotate_Xs 131ec2b4c85STim Chen# Rotate values of symbols X0...X3 
# Cyclically rename the schedule registers so X0 always refers to the
# oldest four W[] words (assembler-symbol renaming only; no code emitted).
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h, the SHA-256 working variables, so each
# round can be written as if it always updates "h" (symbol renaming only).
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# Perform four SHA-256 rounds while simultaneously computing the next
# four message-schedule words (X0 after rotate_Xs).  Scalar round logic
# is interleaved with the vector schedule computation to hide latency.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	#
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	#
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	#
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

# One plain SHA-256 round (no message scheduling).
## input is [rsp + _XFER + %1 * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER	#
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
	# Save all callee-saved GPRs used as working variables / TBL.
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp		# %rbp restores %rsp on exit (rsp is realigned below)

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer (needed for vmovdqa to _XFER)

	shl	$6, NUM_BLKS		# convert to bytes (64 bytes per block)
	jz	.Ldone_hash		# nothing to do for 0 blocks
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# spill INP: SRND aliases the same register

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER		# XFER = W[t..t+3] + K[t..t+3]
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## final 16 rounds: no more scheduling needed, consume X0..X3
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## add this block's working variables into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP		# reload INP (register was used as SRND)
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	mov	%rbp, %rsp		# undo alignment + STACK_SIZE in one step
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)

# SHA-256 round constants K[0..63] (FIPS 180-4).
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# vpshufb mask: reverse byte order within each dword (big-endian load).
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF