1;; 2;; Copyright (c) 2012-2020, Intel Corporation 3;; 4;; Redistribution and use in source and binary forms, with or without 5;; modification, are permitted provided that the following conditions are met: 6;; 7;; * Redistributions of source code must retain the above copyright notice, 8;; this list of conditions and the following disclaimer. 9;; * Redistributions in binary form must reproduce the above copyright 10;; notice, this list of conditions and the following disclaimer in the 11;; documentation and/or other materials provided with the distribution. 12;; * Neither the name of Intel Corporation nor the names of its contributors 13;; may be used to endorse or promote products derived from this software 14;; without specific prior written permission. 15;; 16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;
;; Stack must be aligned to 32 bytes before call
;;
;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
;; -----------------------------------------------------------
;; Windows clobbers: RAX RDX R8 R9 R10 R11 R12 R13 R14 R15
;; Windows preserves: RBX RCX RBP RSI RDI
;; -----------------------------------------------------------
;; Linux clobbers: RAX RDX RSI R9 R10 R11 R12 R13 R14 R15
;; Linux preserves: RBX RCX RBP RDI R8
;; -----------------------------------------------------------
;; Clobbers ZMM0-31

%include "include/os.asm"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"
%include "mb_mgr_datastruct.asm"
%include "include/transpose_avx512.asm"
%include "include/reg_sizes.asm"
%include "include/clear_regs.asm"

section .data
default rel
align 64

;; SHA-1 round constants Kt (FIPS 180-4), each replicated into all 16
;; dword lanes of a ZMM register so one vpaddd adds Kt across all lanes.
K00_19:                 ;ddq 0x5A8279995A8279995A8279995A827999
                        ;ddq 0x5A8279995A8279995A8279995A827999
                        ;ddq 0x5A8279995A8279995A8279995A827999
                        ;ddq 0x5A8279995A8279995A8279995A827999
        dq      0x5A8279995A827999, 0x5A8279995A827999
        dq      0x5A8279995A827999, 0x5A8279995A827999
        dq      0x5A8279995A827999, 0x5A8279995A827999
        dq      0x5A8279995A827999, 0x5A8279995A827999
K20_39:                 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
                        ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
                        ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
                        ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq      0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
        dq      0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
        dq      0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
        dq      0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                 ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
                        ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
                        ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
                        ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq      0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
        dq      0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
        dq      0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
        dq      0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
                        ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
                        ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
                        ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq      0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
        dq      0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
        dq      0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
        dq      0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6

;; Shuffle mask reversing byte order within each 32-bit dword
;; (SHA-1 processes the message as big-endian words).
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
                         ;ddq 0x0c0d0e0f08090a0b0405060700010203
                         ;ddq 0x0c0d0e0f08090a0b0405060700010203
                         ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq      0x0405060700010203, 0x0c0d0e0f08090a0b
        dq      0x0405060700010203, 0x0c0d0e0f08090a0b
        dq      0x0405060700010203, 0x0c0d0e0f08090a0b
        dq      0x0405060700010203, 0x0c0d0e0f08090a0b

section .text

%define APPEND(a,b) a %+ b

;; Argument register mapping for the two supported ABIs.
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%define arg4    rcx
%else
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%define arg4    r9
%endif

%define state   arg1            ; pointer to SHA1 args structure
%define SIZE    arg2            ; number of 64-byte blocks to process
%define IDX     arg3            ; byte offset into each lane's data

;; SHA-1 working state, one dword lane per buffer (16 lanes).
%define A       zmm0
%define B       zmm1
%define C       zmm2
%define D       zmm3
%define E       zmm4
%define KT      zmm5            ; current round constant Kt
%define AA      zmm6            ; saved digest for end-of-block addition
%define BB      zmm7
%define CC      zmm8
%define DD      zmm9
%define EE      zmm10
%define TMP0    zmm11
%define TMP1    zmm12
%define TMP2    zmm13
%define TMP3    zmm14
%define TMP4    zmm15

;; 16-entry message schedule window Wt, one register per round slot.
%define W0      zmm16
%define W1      zmm17
%define W2      zmm18
%define W3      zmm19
%define W4      zmm20
%define W5      zmm21
%define W6      zmm22
%define W7      zmm23
%define W8      zmm24
%define W9      zmm25
%define W10     zmm26
%define W11     zmm27
%define W12     zmm28
%define W13     zmm29
%define W14     zmm30
%define W15     zmm31

;; GP scratch registers used as per-lane input data pointers.
%define inp0    r9
%define inp1    r10
%define inp2    r11
%define inp3    r12
%define inp4    r13
%define inp5    r14
%define inp6    r15
%define inp7    rax

;; Rotate the A..E register name aliases one step, implementing the
;; end-of-round state rotation (E=D, D=C, C=B, B=A, A=T) at assembly
;; time so no data is actually moved.
%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

;; One SHA-1 round across all 16 lanes.
;;   %1 [in] ZMM with message word Wt for this round
;;   %2 [in] vpternlogd immediate selecting Ft:
;;           0xCA = Ch(B,C,D), 0x96 = Parity(B,C,D), 0xE8 = Maj(B,C,D)
;; Clobbers TMP0, TMP1; rotates the A..E aliases on exit.
%macro PROCESS_LOOP 2
%define %%WT            %1
%define %%F_IMMED       %2

        ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
        ; E=D, D=C, C=ROTL_30(B), B=A, A=T

        ; Ft
        ;  0-19       Ch(B,C,D)     = (B&C) ^ (~B&D)
        ; 20-39,60-79 Parity(B,C,D) = B ^ C ^ D
        ; 40-59       Maj(B,C,D)    = (B&C) ^ (B&D) ^ (C&D)

        vmovdqa32       TMP1, B                 ; Copy B
        vpaddd          E, E, %%WT              ; E = E + Wt
        vpternlogd      TMP1, C, D, %%F_IMMED   ; TMP1 = Ft(B,C,D)
        vpaddd          E, E, KT                ; E = E + Wt + Kt
        vprold          TMP0, A, 5              ; TMP0 = ROTL_5(A)
        vpaddd          E, E, TMP1              ; E = Ft(B,C,D) + E + Kt + Wt
        vprold          B, B, 30                ; B = ROTL_30(B)
        vpaddd          E, E, TMP0              ; E = T

        ROTATE_ARGS
%endmacro

;; Message schedule expansion for rounds 16-79 (computed in place over
;; the 16-register window):
;;   %1 [in/out] Wt      -> becomes Wt+16
;;   %2 [in]     Wt+2
;;   %3 [in]     Wt+8
;;   %4 [in]     Wt+13
%macro MSG_SCHED_ROUND_16_79 4
%define %%WT    %1
%define %%WTp2  %2
%define %%WTp8  %3
%define %%WTp13 %4
        ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
        ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
        vpternlogd      %%WT, %%WTp2, %%WTp8, 0x96   ; 3-way XOR
        vpxord          %%WT, %%WT, %%WTp13
        vprold          %%WT, %%WT, 1
%endmacro


; Note this is reading in two blocks of data from each lane,
; in preparation for the upcoming needed transpose to build msg schedule.
; Each register will contain 32 bytes from one lane plus 32 bytes
; from another lane.
; The first 8 registers will contain the first 32 bytes of all lanes,
; where register X (0 <= X <= 7) will contain bytes 0-31 from lane X in the first half
; and 0-31 bytes from lane X+8 in the second half.
; The last 8 registers will contain the last 32 bytes of all lanes,
; where register Y (8 <= Y <= 15) will contain bytes 32-63 from lane Y-8 in the first half
; and 32-63 bytes from lane Y in the second half.
; This method helps reducing the number of shuffles required to transpose the data.
;; Load the next 64-byte block for one lane into %%Wt, using the
;; interleaved lane-pairing layout described above (lane i paired with
;; lane i+8) so the subsequent 16x16 transpose needs fewer shuffles.
%macro MSG_SCHED_ROUND_00_15 6
%define %%Wt         %1 ; [out] zmm register to load the next block
%define %%LANE_IDX   %2 ; [in] lane index (0-15)
%define %%BASE_PTR   %3 ; [in] base address of the input data
%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
%define %%TMP1       %5 ; [clobbered] temporary gp register
%define %%TMP2       %6 ; [clobbered] temporary gp register
%if (%%LANE_IDX < 8)
        ; low half = bytes 0-31 of lane X, high half = bytes 0-31 of lane X+8
        mov             %%TMP1, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
        mov             %%TMP2, [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ]
        vmovups         YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR]
        vinserti64x4    %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR], 0x01
%else
        ; low half = bytes 32-63 of lane Y-8, high half = bytes 32-63 of lane Y
        mov             %%TMP1, [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ]
        mov             %%TMP2, [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
        vmovups         YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32]
        vinserti64x4    %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR+32], 0x01
%endif
%endmacro

align 64
;-----------------------------------------------------------------------
; void sha1_x16_avx512(SHA1_ARGS *state, UINT64 size)
; Processes `size` 64-byte blocks on each of 16 independent lanes.
; arg 1 : pointer to SHA1 args structure (transposed digests + data ptrs)
; arg 2 : size (in blocks) ;; assumed to be >= 1
; On return the digests in `state` are updated and each lane's data
; pointer is advanced by size*64 bytes.
;-----------------------------------------------------------------------
MKGLOBAL(sha1_x16_avx512,function,internal)
sha1_x16_avx512:
        ;; Initialize digests (row-transposed: row i holds word i of all 16 lanes)
        vmovdqu32       A, [state + 0*SHA1_DIGEST_ROW_SIZE]
        vmovdqu32       B, [state + 1*SHA1_DIGEST_ROW_SIZE]
        vmovdqu32       C, [state + 2*SHA1_DIGEST_ROW_SIZE]
        vmovdqu32       D, [state + 3*SHA1_DIGEST_ROW_SIZE]
        vmovdqu32       E, [state + 4*SHA1_DIGEST_ROW_SIZE]
        DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed digest", A, B, C, D, E
        DBGPRINTL64 "SIZE", SIZE

        xor     IDX, IDX

        ;; Load first blocks of data into ZMM registers before
        ;; performing a 16x16 32-bit transpose.
        ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
        ;; interleaving data between lane X and lane X+8.
        ;; This way, final shuffles between top half and bottom half
        ;; of the matrix are avoided.
        mov     inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
        mov     inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
        mov     inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
        mov     inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
        mov     inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
        mov     inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
        mov     inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
        mov     inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]

        TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2,  W3,  W4,  W5,  W6,  W7, \
                                    W8, W9, W10, W11, W12, W13, W14, W15, \
                                    inp0, inp1, inp2, inp3, inp4, inp5, \
                                    inp6, inp7, IDX

        mov     inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
        mov     inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
        mov     inp2, [state + _data_ptr_sha1 +10*PTR_SZ]
        mov     inp3, [state + _data_ptr_sha1 +11*PTR_SZ]
        mov     inp4, [state + _data_ptr_sha1 +12*PTR_SZ]
        mov     inp5, [state + _data_ptr_sha1 +13*PTR_SZ]
        mov     inp6, [state + _data_ptr_sha1 +14*PTR_SZ]
        mov     inp7, [state + _data_ptr_sha1 +15*PTR_SZ]

        TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2,  W3,  W4,  W5,  W6,  W7, \
                                   W8, W9, W10, W11, W12, W13, W14, W15, \
                                   inp0, inp1, inp2, inp3, inp4, inp5, \
                                   inp6, inp7, IDX
lloop:
        vmovdqa32       TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]

        add     IDX, 64         ; advance to the next block in each lane

        ;; Finish the 16x16 dword transpose: Wi = word i of all 16 lanes
        TRANSPOSE16_U32_PRELOADED W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, \
                                  W11, W12, W13, W14, W15, TMP0, TMP1, TMP3, TMP4
        ;; NOTE: W5 was missing from this debug dump in the original code
        DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed input", W0, W1, W2, W3, W4, \
                        W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15

        ;; Byte-swap every dword to big-endian message words
%assign I 0
%rep 16
        vpshufb APPEND(W,I), APPEND(W,I), TMP2
%assign I (I+1)
%endrep

        ; Save digests for later addition
        vmovdqa32       AA, A
        vmovdqa32       BB, B
        vmovdqa32       CC, C
        vmovdqa32       DD, D
        vmovdqa32       EE, E

        ;; Rounds 0-63; KT / F_IMMED are switched at rounds 20, 40 and 60.
        ;; J/K/L/M index the 16-register schedule window (Wt, Wt+2, Wt+8, Wt+13).
        vmovdqa32       KT, [rel K00_19]
%assign I 0xCA
%assign J 0
%assign K 2
%assign L 8
%assign M 13
%assign N 0
%rep 64
        PROCESS_LOOP  APPEND(W,J),  I
        MSG_SCHED_ROUND_16_79  APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
        %if N = 19
                vmovdqa32       KT, [rel K20_39]
                %assign I 0x96
        %elif N = 39
                vmovdqa32       KT, [rel K40_59]
                %assign I 0xE8
        %elif N = 59
                vmovdqa32       KT, [rel K60_79]
                %assign I 0x96
        %endif
%assign J ((J+1)% 16)
%assign K ((K+1)% 16)
%assign L ((L+1)% 16)
%assign M ((M+1)% 16)
%assign N (N+1)
%endrep

        ; Check if this is the last block
        sub     SIZE, 1
        je      lastLoop

        ;; Rounds 64-79, overlapped with loading the next block into W0-W15
%assign I 0x96
%assign J 0
%rep 16
        PROCESS_LOOP  APPEND(W,J),  I
        MSG_SCHED_ROUND_00_15  APPEND(W,J), J, state + _data_ptr_sha1, IDX, inp0, inp1
%assign J (J+1)
%endrep

        ; Add old digest
        vpaddd          A,A,AA
        vpaddd          B,B,BB
        vpaddd          C,C,CC
        vpaddd          D,D,DD
        vpaddd          E,E,EE

        jmp     lloop

lastLoop:
; Need to reset argument rotation values to Round 64 values
%xdefine TMP_ A
%xdefine A B
%xdefine B C
%xdefine C D
%xdefine D E
%xdefine E TMP_

        ; Process last 16 rounds
%assign I 0x96
%assign J 0
%rep 16
        PROCESS_LOOP  APPEND(W,J), I
%assign J (J+1)
%endrep

        ; Add old digest
        vpaddd          A,A,AA
        vpaddd          B,B,BB
        vpaddd          C,C,CC
        vpaddd          D,D,DD
        vpaddd          E,E,EE

        ; Write out digest
        ; Do we need to untranspose digests???
        vmovdqu32       [state + 0*SHA1_DIGEST_ROW_SIZE], A
        vmovdqu32       [state + 1*SHA1_DIGEST_ROW_SIZE], B
        vmovdqu32       [state + 2*SHA1_DIGEST_ROW_SIZE], C
        vmovdqu32       [state + 3*SHA1_DIGEST_ROW_SIZE], D
        vmovdqu32       [state + 4*SHA1_DIGEST_ROW_SIZE], E
        DBGPRINTL_ZMM "Sha1-AVX512 outgoing transposed digest", A, B, C, D, E

        ;; update input pointers (advance each lane by the bytes consumed)
        mov     inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
        mov     inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
        mov     inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
        mov     inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
        mov     inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
        mov     inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
        mov     inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
        mov     inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
        add     inp0, IDX
        add     inp1, IDX
        add     inp2, IDX
        add     inp3, IDX
        add     inp4, IDX
        add     inp5, IDX
        add     inp6, IDX
        add     inp7, IDX
        mov     [state + _data_ptr_sha1 + 0*PTR_SZ], inp0
        mov     [state + _data_ptr_sha1 + 1*PTR_SZ], inp1
        mov     [state + _data_ptr_sha1 + 2*PTR_SZ], inp2
        mov     [state + _data_ptr_sha1 + 3*PTR_SZ], inp3
        mov     [state + _data_ptr_sha1 + 4*PTR_SZ], inp4
        mov     [state + _data_ptr_sha1 + 5*PTR_SZ], inp5
        mov     [state + _data_ptr_sha1 + 6*PTR_SZ], inp6
        mov     [state + _data_ptr_sha1 + 7*PTR_SZ], inp7

        mov     inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
        mov     inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
        mov     inp2, [state + _data_ptr_sha1 + 10*PTR_SZ]
        mov     inp3, [state + _data_ptr_sha1 + 11*PTR_SZ]
        mov     inp4, [state + _data_ptr_sha1 + 12*PTR_SZ]
        mov     inp5, [state + _data_ptr_sha1 + 13*PTR_SZ]
        mov     inp6, [state + _data_ptr_sha1 + 14*PTR_SZ]
        mov     inp7, [state + _data_ptr_sha1 + 15*PTR_SZ]
        add     inp0, IDX
        add     inp1, IDX
        add     inp2, IDX
        add     inp3, IDX
        add     inp4, IDX
        add     inp5, IDX
        add     inp6, IDX
        add     inp7, IDX
        mov     [state + _data_ptr_sha1 + 8*PTR_SZ], inp0
        mov     [state + _data_ptr_sha1 + 9*PTR_SZ], inp1
        mov     [state + _data_ptr_sha1 + 10*PTR_SZ], inp2
        mov     [state + _data_ptr_sha1 + 11*PTR_SZ], inp3
        mov     [state + _data_ptr_sha1 + 12*PTR_SZ], inp4
        mov     [state + _data_ptr_sha1 + 13*PTR_SZ], inp5
        mov     [state + _data_ptr_sha1 + 14*PTR_SZ], inp6
        mov     [state + _data_ptr_sha1 + 15*PTR_SZ], inp7

%ifdef SAFE_DATA
        ;; wipe message/key material left in vector registers
        clear_all_zmms_asm
%endif ;; SAFE_DATA

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif