;;
;; Copyright (c) 2012-2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Stack must be aligned to 16 bytes before call
;; Windows clobbers:  rax rbx rdx r8 r9 r10 r11 r12
;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
;;
;; Linux clobbers:    rax rbx rsi r8 r9 r10 r11 r12
;; Linux preserves:   rcx rdx rdi rbp r13 r14 r15
;;
;; clobbers xmm0-15

%include "include/os.asm"
%include "mb_mgr_datastruct.asm"
%include "include/clear_regs.asm"

extern K256_4

%ifdef LINUX
 %define arg1    rdi
 %define arg2    rsi
%else
 ; Windows definitions
 %define arg1    rcx
 %define arg2    rdx
%endif

; Common definitions
%define STATE    arg1
%define INP_SIZE arg2

%define IDX     rax
%define ROUND   rbx
%define TBL     r12

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1  xmm14
%define TMP xmm15

%define SZ4     4*SHA256_DIGEST_WORD_SIZE       ; Size of one vector register
%define ROUNDS  64*SZ4

; Define stack usage
struc STACK
_DATA:          resb    SZ4 * 16
_DIGEST:        resb    SZ4 * NUM_SHA256_DIGEST_WORDS
                resb    8       ; for alignment, must be odd multiple of 8
endstruc

%define VMOVPS  vmovups

; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0, t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b3 b2 a3 a2}

        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {d1 c1 b1 a1}

        vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {d3 c3 b3 a3}

        vshufps %%r0, %%r0, %%r2, 0x88  ; r0 = {d2 c2 b2 a2}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d0 c0 b0 a0}
%endmacro
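;; Reference sketch of the vshufps selector encoding used above:
;; vshufps dst, s1, s2, imm takes its two low dwords from s1 (chosen by
;; imm[1:0] and imm[3:2]) and its two high dwords from s2 (chosen by
;; imm[5:4] and imm[7:6]). Hence 0x44 -> {s2[1] s2[0] s1[1] s1[0]},
;; 0xEE -> {s2[3] s2[2] s1[3] s1[2]}, 0x88 -> {s2[2] s2[0] s1[2] s1[0]},
;; and 0xDD -> {s2[3] s2[1] s1[3] s1[1]}; the eight shuffles in the macro
;; combine these picks into a 4x4 dword transpose.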

%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpslld  %%tmp, %%reg, (32-(%%imm))
        vpsrld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        ;vmovdqa        %%tmp, %%reg
        vpslld  %%tmp, %%src, (32-(%%imm))
        vpsrld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
        PRORD   %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
        PRORD_nd        %1, %3, TMP, %2
%endmacro

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
        PRORD_nd a0, e, (11-6)  ; sig1: a0 = (e >> 5)

        vpxor   a2, f, g        ; ch: a2 = f^g
        vpand   a2, a2, e       ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g       ; a2 = ch

        PRORD_nd a1, e, 25      ; sig1: a1 = (e >> 25)
        vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
        vpaddd  %%T1, %%T1, [TBL + ROUND]       ; T1 = W + K
        vpxor   a0, a0, e       ; sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6           ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  h, h, a2        ; h = h + ch
        PRORD_nd a2, a, (13-2)  ; sig0: a2 = (a >> 11)
        vpaddd  h, h, %%T1      ; h = h + ch + W + K
        vpxor   a0, a0, a1      ; a0 = sigma1
        PRORD_nd a1, a, 22      ; sig0: a1 = (a >> 22)
        vpxor   %%T1, a, c      ; maj: T1 = a^c
        add     ROUND, SZ4      ; ROUND++
        vpand   %%T1, %%T1, b   ; maj: T1 = (a^c)&b
        vpaddd  h, h, a0

        vpaddd  d, d, h

        vpxor   a2, a2, a       ; sig0: a2 = a ^ (a >> 11)
        PRORD   a2, 2           ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a2, a2, a1      ; a2 = sig0
        vpand   a1, a, c        ; maj: a1 = a&c
        vpor    a1, a1, %%T1    ; a1 = maj
        vpaddd  h, h, a1        ; h = h + ch + W + K + maj
        vpaddd  h, h, a2        ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
        vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
        vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
        vmovdqa a0, %%T1
        PRORD   %%T1, 18-7
        vmovdqa a2, a1
        PRORD   a1, 19-17
        vpxor   %%T1, %%T1, a0
        PRORD   %%T1, 7
        vpxor   a1, a1, a2
        PRORD   a1, 17
        vpsrld  a0, a0, 3
        vpxor   %%T1, %%T1, a0
        vpsrld  a2, a2, 10
        vpxor   a1, a1, a2
        vpaddd  %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
        vpaddd  a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
        vpaddd  %%T1, %%T1, a1

        ROUND_00_15 %%T1, %%i
%endm

section .data
default rel
align 16
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
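;; For reference: SHA-256 message words are big-endian, while the input
;; bytes are loaded little-endian. vpshufb sets destination byte i to
;; source byte mask[i], and the mask above is
;; {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}, so each 32-bit word is
;; byte-reversed in place; the four words themselves are not reordered.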

section .text

;; SHA256_ARGS:
;;   UINT128 digest[8];  // transposed digests
;;   UINT8  *data_ptr[4];
;;

;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks);
;; arg 1 : STATE    : pointer to args
;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
;;
MKGLOBAL(sha_256_mult_avx,function,internal)
align 16
sha_256_mult_avx:
        ; general registers preserved in outer calling routine
        ; outer calling routine saves all the XMM registers
        sub     rsp, STACK_size

        ;; Load the pre-transposed incoming digest.
        vmovdqa a, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
        vmovdqa b, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
        vmovdqa c, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
        vmovdqa d, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
        vmovdqa e, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
        vmovdqa f, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
        vmovdqa g, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
        vmovdqa h, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

        lea     TBL, [rel K256_4]

        ;; load the address of each of the 4 message lanes
        ;; getting ready to transpose input onto stack
        mov     inp0, [STATE + _data_ptr_sha256 + 0*PTR_SZ]
        mov     inp1, [STATE + _data_ptr_sha256 + 1*PTR_SZ]
        mov     inp2, [STATE + _data_ptr_sha256 + 2*PTR_SZ]
        mov     inp3, [STATE + _data_ptr_sha256 + 3*PTR_SZ]

        xor     IDX, IDX
lloop:
        xor     ROUND, ROUND

        ;; save old digest
        vmovdqa [rsp + _DIGEST + 0*SZ4], a
        vmovdqa [rsp + _DIGEST + 1*SZ4], b
        vmovdqa [rsp + _DIGEST + 2*SZ4], c
        vmovdqa [rsp + _DIGEST + 3*SZ4], d
        vmovdqa [rsp + _DIGEST + 4*SZ4], e
        vmovdqa [rsp + _DIGEST + 5*SZ4], f
        vmovdqa [rsp + _DIGEST + 6*SZ4], g
        vmovdqa [rsp + _DIGEST + 7*SZ4], h

%assign i 0
%rep 4
        vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
        VMOVPS  TT2, [inp0+IDX+i*16]
        VMOVPS  TT1, [inp1+IDX+i*16]
        VMOVPS  TT4, [inp2+IDX+i*16]
        VMOVPS  TT3, [inp3+IDX+i*16]
        TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
        vpshufb TT0, TT0, TMP
        vpshufb TT1, TT1, TMP
        vpshufb TT2, TT2, TMP
        vpshufb TT3, TT3, TMP
        ROUND_00_15 TT0, (i*4+0)
        ROUND_00_15 TT1, (i*4+1)
        ROUND_00_15 TT2, (i*4+2)
        ROUND_00_15 TT3, (i*4+3)
%assign i (i+1)
%endrep
        add     IDX, 4*4*4

%assign i (i*4)

        jmp     Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
        ROUND_16_XX T1, i
%assign i (i+1)
%endrep

        cmp     ROUND, ROUNDS
        jb      Lrounds_16_xx

        ;; add old digest
        vpaddd  a, a, [rsp + _DIGEST + 0*SZ4]
        vpaddd  b, b, [rsp + _DIGEST + 1*SZ4]
        vpaddd  c, c, [rsp + _DIGEST + 2*SZ4]
        vpaddd  d, d, [rsp + _DIGEST + 3*SZ4]
        vpaddd  e, e, [rsp + _DIGEST + 4*SZ4]
        vpaddd  f, f, [rsp + _DIGEST + 5*SZ4]
        vpaddd  g, g, [rsp + _DIGEST + 6*SZ4]
        vpaddd  h, h, [rsp + _DIGEST + 7*SZ4]

        sub     INP_SIZE, 1     ;; unit is blocks
        jne     lloop

        ; write back to memory (state object) the transposed digest
        vmovdqa [STATE + 0*SHA256_DIGEST_ROW_SIZE], a
        vmovdqa [STATE + 1*SHA256_DIGEST_ROW_SIZE], b
        vmovdqa [STATE + 2*SHA256_DIGEST_ROW_SIZE], c
        vmovdqa [STATE + 3*SHA256_DIGEST_ROW_SIZE], d
        vmovdqa [STATE + 4*SHA256_DIGEST_ROW_SIZE], e
        vmovdqa [STATE + 5*SHA256_DIGEST_ROW_SIZE], f
        vmovdqa [STATE + 6*SHA256_DIGEST_ROW_SIZE], g
        vmovdqa [STATE + 7*SHA256_DIGEST_ROW_SIZE], h

        ; update input pointers
        add     inp0, IDX
        mov     [STATE + _data_ptr_sha256 + 0*8], inp0
        add     inp1, IDX
        mov     [STATE + _data_ptr_sha256 + 1*8], inp1
        add     inp2, IDX
        mov     [STATE + _data_ptr_sha256 + 2*8], inp2
        add     inp3, IDX
        mov     [STATE + _data_ptr_sha256 + 3*8], inp3
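        ;; Note: IDX was advanced by 64 bytes per block iteration, so at
        ;; this point IDX = 64 * num_blocks and each stored lane pointer
        ;; now sits just past the data consumed, ready for a further call.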

        ;;;;;;;;;;;;;;;;
        ;; Postamble

%ifdef SAFE_DATA
        ;; Clear stack frame ((16 + 8)*16 bytes)
        clear_all_xmms_avx_asm  ; zeroes xmm0-xmm15, so xmm0 is all zeros below
%assign i 0
%rep (16+NUM_SHA256_DIGEST_WORDS)
        vmovdqa [rsp + i*SZ4], xmm0     ; overwrite saved message schedule and digest
%assign i (i+1)
%endrep
%endif

        add     rsp, STACK_size
        ; outer calling routine restores XMM and other GP registers
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
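;; Usage sketch (illustrative; assumes the caller follows the multi-buffer
;; manager conventions rather than any interface defined in this file):
;;   1. transpose four lanes' initial H0..H7 words into args->digest
;;   2. point args->data_ptr[0..3] at four buffers, each holding at least
;;      num_blocks whole 64-byte blocks (padding is the caller's job)
;;   3. call sha_256_mult_avx(args, num_blocks); the updated transposed
;;      digests and advanced data pointers are written back into args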