/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
*/
/*
 * Conversion to GAS assembly and integration to libgcrypt
 * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Note: the original implementation was named SHA256-SSE4; however, only
 * SSSE3 is required.
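 *
 * The only instructions used here beyond SSE2 are 'pshufb' and 'palignr',
 * both of which were introduced with SSSE3; the rest of the vector code
 * is plain SSE2.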
 */

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)

#include "asm-common-amd64.h"

.intel_syntax noprefix

#define MOVDQ movdqu /* assume buffers not aligned */

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/

/* addm [mem], reg
 * Add reg to mem using reg-mem add and store */
#define addm(p1, p2) \
        add     p2, p1; \
        mov     p1, p2;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 * Load xmm with mem and byte swap each dword */
#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
        MOVDQ   p1, p2; \
        pshufb  p1, p3;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

#define X0 xmm4
#define X1 xmm5
#define X2 xmm6
#define X3 xmm7

#define XTMP0 xmm0
#define XTMP1 xmm1
#define XTMP2 xmm2
#define XTMP3 xmm3
#define XTMP4 xmm8
#define XFER xmm9

#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
#define BYTE_FLIP_MASK xmm12

#define NUM_BLKS rdx /* 3rd arg */
#define CTX rsi      /* 2nd arg */
#define INP rdi      /* 1st arg */

#define SRND rdi     /* clobbers INP */
#define c ecx
#define d r8d
#define e edx

#define TBL rbp
#define a eax
#define b ebx

#define f r9d
#define g r10d
#define h r11d

#define y0 r13d
#define y1 r14d
#define y2 r15d


#define _INP_END_SIZE 8
#define _INP_SIZE 8
#define _XFER_SIZE 8
#define _XMM_SAVE_SIZE 0
/* STACK_SIZE plus pushes must be an odd multiple of 8 */
#define _ALIGN_SIZE 8

#define _INP_END   0
#define _INP       (_INP_END + _INP_END_SIZE)
#define _XFER      (_INP + _INP_SIZE)
#define _XMM_SAVE  (_XFER + _XFER_SIZE + _ALIGN_SIZE)
#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)


#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
                /* compute s0 four at a time and s1 two at a time */; \
                /* compute W[-16] + W[-7] 4 at a time */; \
        movdqa  XTMP0, X3; \
        mov     y0, e                   /* y0 = e */; \
        ror     y0, (25-11)             /* y0 = e >> (25-11) */; \
        mov     y1, a                   /* y1 = a */; \
        palignr XTMP0, X2, 4            /* XTMP0 = W[-7] */; \
        ror     y1, (22-13)             /* y1 = a >> (22-13) */; \
        xor     y0, e                   /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f                   /* y2 = f */; \
        ror     y0, (11-6)              /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        movdqa  XTMP1, X1; \
        xor     y1, a                   /* y1 = a ^ (a >> (22-13)) */; \
        xor     y2, g                   /* y2 = f^g */; \
        paddd   XTMP0, X0               /* XTMP0 = W[-7] + W[-16] */; \
        xor     y0, e                   /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e                   /* y2 = (f^g)&e */; \
        ror     y1, (13-2)              /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
                /* compute s0 */; \
        palignr XTMP1, X0, 4            /* XTMP1 = W[-15] */; \
        xor     y1, a                   /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6                   /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g                   /* y2 = CH = ((f^g)&e)^g */; \
        movdqa  XTMP2, XTMP1            /* XTMP2 = W[-15] */; \
        ror     y1, 2                   /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, y0                  /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
        movdqa  XTMP3, XTMP1            /* XTMP3 = W[-15] */; \
        mov     y0, a                   /* y0 = a */; \
        add     h, y2                   /* h = h + S1 + CH + k + w */; \
        mov     y2, a                   /* y2 = a */; \
        pslld   XTMP1, (32-7); \
        or      y0, c                   /* y0 = a|c */; \
        add     d, h                    /* d = d + h + S1 + CH + k + w */; \
        and     y2, c                   /* y2 = a&c */; \
        psrld   XTMP2, 7; \
        and     y0, b                   /* y0 = (a|c)&b */; \
        add     h, y1                   /* h = h + S1 + CH + k + w + S0 */; \
        por     XTMP1, XTMP2            /* XTMP1 = W[-15] ror 7 */; \
        or      y0, y2                  /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]             /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP2, XTMP3            /* XTMP2 = W[-15] */; \
        mov     y0, e                   /* y0 = e */; \
        mov     y1, a                   /* y1 = a */; \
        movdqa  XTMP4, XTMP3            /* XTMP4 = W[-15] */; \
        ror     y0, (25-11)             /* y0 = e >> (25-11) */; \
        xor     y0, e                   /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f                   /* y2 = f */; \
        ror     y1, (22-13)             /* y1 = a >> (22-13) */; \
        pslld   XTMP3, (32-18); \
        xor     y1, a                   /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)              /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor     y2, g                   /* y2 = f^g */; \
        psrld   XTMP2, 18; \
        ror     y1, (13-2)              /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y0, e                   /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e                   /* y2 = (f^g)&e */; \
        ror     y0, 6                   /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        pxor    XTMP1, XTMP3; \
        xor     y1, a                   /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y2, g                   /* y2 = CH = ((f^g)&e)^g */; \
        psrld   XTMP4, 3                /* XTMP4 = W[-15] >> 3 */; \
        add     y2, y0                  /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
        ror     y1, 2                   /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        pxor    XTMP1, XTMP2            /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
        mov     y0, a                   /* y0 = a */; \
        add     h, y2                   /* h = h + S1 + CH + k + w */; \
        mov     y2, a                   /* y2 = a */; \
        pxor    XTMP1, XTMP4            /* XTMP1 = s0 */; \
        or      y0, c                   /* y0 = a|c */; \
        add     d, h                    /* d = d + h + S1 + CH + k + w */; \
        and     y2, c                   /* y2 = a&c */; \
                /* compute low s1 */; \
        pshufd  XTMP2, X3, 0b11111010   /* XTMP2 = W[-2] {BBAA} */; \
        and     y0, b                   /* y0 = (a|c)&b */; \
        add     h, y1                   /* h = h + S1 + CH + k + w + S0 */; \
        paddd   XTMP0, XTMP1            /* XTMP0 = W[-16] + W[-7] + s0 */; \
        or      y0, y2                  /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]             /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP3, XTMP2            /* XTMP3 = W[-2] {BBAA} */; \
        mov     y0, e                   /* y0 = e */; \
        mov     y1, a                   /* y1 = a */; \
        ror     y0, (25-11)             /* y0 = e >> (25-11) */; \
        movdqa  XTMP4, XTMP2            /* XTMP4 = W[-2] {BBAA} */; \
        xor     y0, e                   /* y0 = e ^ (e >> (25-11)) */; \
        ror     y1, (22-13)             /* y1 = a >> (22-13) */; \
        mov     y2, f                   /* y2 = f */; \
        xor     y1, a                   /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)              /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        psrlq   XTMP2, 17               /* XTMP2 = W[-2] ror 17 {xBxA} */; \
        xor     y2, g                   /* y2 = f^g */; \
        psrlq   XTMP3, 19               /* XTMP3 = W[-2] ror 19 {xBxA} */; \
        xor     y0, e                   /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e                   /* y2 = (f^g)&e */; \
        psrld   XTMP4, 10               /* XTMP4 = W[-2] >> 10 {BBAA} */; \
        ror     y1, (13-2)              /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y1, a                   /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y2, g                   /* y2 = CH = ((f^g)&e)^g */; \
        ror     y0, 6                   /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        pxor    XTMP2, XTMP3; \
        add     y2, y0                  /* y2 = S1 + CH */; \
        ror     y1, 2                   /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
        pxor    XTMP4, XTMP2            /* XTMP4 = s1 {xBxA} */; \
        mov     y0, a                   /* y0 = a */; \
        add     h, y2                   /* h = h + S1 + CH + k + w */; \
        mov     y2, a                   /* y2 = a */; \
        pshufb  XTMP4, SHUF_00BA        /* XTMP4 = s1 {00BA} */; \
        or      y0, c                   /* y0 = a|c */; \
        add     d, h                    /* d = d + h + S1 + CH + k + w */; \
        and     y2, c                   /* y2 = a&c */; \
        paddd   XTMP0, XTMP4            /* XTMP0 = {..., ..., W[1], W[0]} */; \
        and     y0, b                   /* y0 = (a|c)&b */; \
        add     h, y1                   /* h = h + S1 + CH + k + w + S0 */; \
                /* compute high s1 */; \
        pshufd  XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
        or      y0, y2                  /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]             /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP3, XTMP2            /* XTMP3 = W[-2] {DDCC} */; \
        mov     y0, e                   /* y0 = e */; \
        ror     y0, (25-11)             /* y0 = e >> (25-11) */; \
        mov     y1, a                   /* y1 = a */; \
        movdqa  X0, XTMP2               /* X0 = W[-2] {DDCC} */; \
        ror     y1, (22-13)             /* y1 = a >> (22-13) */; \
        xor     y0, e                   /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f                   /* y2 = f */; \
        ror     y0, (11-6)              /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        psrlq   XTMP2, 17               /* XTMP2 = W[-2] ror 17 {xDxC} */; \
        xor     y1, a                   /* y1 = a ^ (a >> (22-13)) */; \
        xor     y2, g                   /* y2 = f^g */; \
        psrlq   XTMP3, 19               /* XTMP3 = W[-2] ror 19 {xDxC} */; \
        xor     y0, e                   /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e                   /* y2 = (f^g)&e */; \
        ror     y1, (13-2)              /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        psrld   X0, 10                  /* X0 = W[-2] >> 10 {DDCC} */; \
        xor     y1, a                   /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6                   /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g                   /* y2 = CH = ((f^g)&e)^g */; \
        pxor    XTMP2, XTMP3; \
        ror     y1, 2                   /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, y0                  /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
        pxor    X0, XTMP2               /* X0 = s1 {xDxC} */; \
        mov     y0, a                   /* y0 = a */; \
        add     h, y2                   /* h = h + S1 + CH + k + w */; \
        mov     y2, a                   /* y2 = a */; \
        pshufb  X0, SHUF_DC00           /* X0 = s1 {DC00} */; \
        or      y0, c                   /* y0 = a|c */; \
        add     d, h                    /* d = d + h + S1 + CH + k + w */; \
        and     y2, c                   /* y2 = a&c */; \
        paddd   X0, XTMP0               /* X0 = {W[3], W[2], W[1], W[0]} */; \
        and     y0, b                   /* y0 = (a|c)&b */; \
        add     h, y1                   /* h = h + S1 + CH + k + w + S0 */; \
        or      y0, y2                  /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]             /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
        FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
        FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
        FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);

/* input is [rsp + _XFER + %1 * 4] */
#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
        mov     y0, e                   /* y0 = e */; \
        ror     y0, (25-11)             /* y0 = e >> (25-11) */; \
        mov     y1, a                   /* y1 = a */; \
        xor     y0, e                   /* y0 = e ^ (e >> (25-11)) */; \
        ror     y1, (22-13)             /* y1 = a >> (22-13) */; \
        mov     y2, f                   /* y2 = f */; \
        xor     y1, a                   /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)              /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor     y2, g                   /* y2 = f^g */; \
        xor     y0, e                   /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        ror     y1, (13-2)              /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        and     y2, e                   /* y2 = (f^g)&e */; \
        xor     y1, a                   /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6                   /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g                   /* y2 = CH = ((f^g)&e)^g */; \
        add     y2, y0                  /* y2 = S1 + CH */; \
        ror     y1, 2                   /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
        mov     y0, a                   /* y0 = a */; \
        add     h, y2                   /* h = h + S1 + CH + k + w */; \
        mov     y2, a                   /* y2 = a */; \
        or      y0, c                   /* y0 = a|c */; \
        add     d, h                    /* d = d + h + S1 + CH + k + w */; \
        and     y2, c                   /* y2 = a&c */; \
        and     y0, b                   /* y0 = (a|c)&b */; \
        add     h, y1                   /* h = h + S1 + CH + k + w + S0 */; \
        or      y0, y2                  /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]             /* h = h + S1 + CH + k + w + S0 + MAJ */

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
*/
.text
.globl _gcry_sha256_transform_amd64_ssse3
ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;)
.align 16
_gcry_sha256_transform_amd64_ssse3:
        CFI_STARTPROC()
        push    rbx
        CFI_PUSH(rbx)
        push    rbp
        CFI_PUSH(rbp)
        push    r13
        CFI_PUSH(r13)
        push    r14
        CFI_PUSH(r14)
        push    r15
        CFI_PUSH(r15)

        sub     rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(STACK_SIZE);

        shl     NUM_BLKS, 6             /* convert to bytes */
        jz      .Ldone_hash
        add     NUM_BLKS, INP           /* pointer to end of data */
        mov     [rsp + _INP_END], NUM_BLKS

        /* load initial digest */
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        movdqa  BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
        movdqa  SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
        movdqa  SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]

.Loop0:
        lea     TBL, [.LK256 ADD_RIP]

        /* byte swap first 16 dwords */
        COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

        mov     [rsp + _INP], INP

        /* schedule 48 input dwords, by doing 3 rounds of 16 each */
        mov     SRND, 3
.align 16
.Loop1:
        movdqa  XFER, [TBL + 0*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

        movdqa  XFER, [TBL + 1*16]
        paddd   XFER, X1
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

        movdqa  XFER, [TBL + 2*16]
        paddd   XFER, X2
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

        movdqa  XFER, [TBL + 3*16]
        paddd   XFER, X3
        movdqa  [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

        sub     SRND, 1
        jne     .Loop1

        mov     SRND, 2
.Loop2:
        paddd   X0, [TBL + 0*16]
        movdqa  [rsp + _XFER], X0
        DO_ROUND(0, a, b, c, d, e, f, g, h)
        DO_ROUND(1, h, a, b, c, d, e, f, g)
        DO_ROUND(2, g, h, a, b, c, d, e, f)
        DO_ROUND(3, f, g, h, a, b, c, d, e)
        paddd   X1, [TBL + 1*16]
        movdqa  [rsp + _XFER], X1
        add     TBL, 2*16
        DO_ROUND(0, e, f, g, h, a, b, c, d)
        DO_ROUND(1, d, e, f, g, h, a, b, c)
        DO_ROUND(2, c, d, e, f, g, h, a, b)
        DO_ROUND(3, b, c, d, e, f, g, h, a)

        movdqa  X0, X2
        movdqa  X1, X3

        sub     SRND, 1
        jne     .Loop2

        addm([4*0 + CTX], a)
        addm([4*1 + CTX], b)
        addm([4*2 + CTX], c)
        addm([4*3 + CTX], d)
        addm([4*4 + CTX], e)
        addm([4*5 + CTX], f)
        addm([4*6 + CTX], g)
        addm([4*7 + CTX], h)

        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
        jne     .Loop0

        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7
        pxor    xmm8, xmm8
        pxor    xmm9, xmm9
        pxor    xmm10, xmm10
        pxor    xmm11, xmm11
        pxor    xmm12, xmm12

.Ldone_hash:
        pxor    XFER, XFER
        movdqa  [rsp + _XFER], XFER
        xor     eax, eax

        add     rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);

        pop     r15
        CFI_POP(r15)
        pop     r14
        CFI_POP(r14)
        pop     r13
        CFI_POP(r13)
        pop     rbp
        CFI_POP(rbp)
        pop     rbx
        CFI_POP(rbx)

        ret
        CFI_ENDPROC()


.align 16
.LK256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203

/* shuffle xBxA -> 00BA */
.L_SHUF_00BA:              .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

/* shuffle xDxC -> DC00 */
.L_SHUF_DC00:              .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif
#endif