// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build 386 && !gccgo && !appengine
// +build 386,!gccgo,!appengine

#include "textflag.h"

// BLAKE2s compression function for 386, in Go (Plan 9 style) assembler
// syntax: operands are written source first, destination last.
//
// The 16-word working state is kept as four 128-bit rows:
//   v0 = v[0..3], v1 = v[4..7], v2 = v[8..11], v3 = v[12..15]
// so one pass of the G function processes four columns (or diagonals) at once.

// iv0 / iv1: the two halves of the eight-word initialization vector. They are
// loaded into rows v2 and v3 at the start of every block.
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16

DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16

// rol16: PSHUFB byte-selection mask that rotates every 32-bit lane by 16 bits.
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16

// rol8: PSHUFB byte-selection mask that moves source byte i+1 into byte i of
// every 32-bit lane, i.e. rotates each lane right by 8 bits (left by 24).
// It replaces ROTL_SSE2(24, ...) in the SSSE3 round.
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16

// counter: 64-bit increment of 0x40 (one block, 64 bytes) in the low quadword
// and zero in the high quadword. Added with PADDQ, it advances the byte
// counter while leaving the flag quadword untouched.
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16

// ROTL_SSE2 rotates every 32-bit lane of v left by the constant n.
// t is a scratch XMM register and is clobbered.
#define ROTL_SSE2(n, t, v) \
	MOVO  v, t;       \
	PSLLL $n, t;      \
	PSRLL $(32-n), v; \
	PXOR  t, v

// ROTL_SSSE3 permutes the bytes of v with the PSHUFB mask c (rol16 or rol8).
#define ROTL_SSSE3(c, v) \
	PSHUFB c, v

// ROUND_SSE2 performs one full round on the rows v0..v3: a column step using
// message operands m0 and m1, a lane rotation of v1/v2/v3 that brings the
// diagonals into columns, a second step using m2 and m3, and the inverse lane
// rotation. Left rotations by 16/20/24/25 are right rotations by 16/12/8/7.
// t is a scratch XMM register and is clobbered.
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
	PADDL  m0, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(16, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSE2(24, t, v3); \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// ROUND_SSSE3 is ROUND_SSE2 with the two byte-granular rotations of v3 done by
// a single PSHUFB each. c16 and c8 are XMM registers holding the rol16 and
// rol8 masks. The rotations of v1 (20 and 25) are not byte multiples and still
// use ROTL_SSE2, so t is clobbered here as well.
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
	PADDL  m0, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c16, v3);  \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m1, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c8, v3);   \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3; \
	PADDL  m2, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c16, v3);  \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(20, t, v1); \
	PADDL  m3, v0;        \
	PADDL  v1, v0;        \
	PXOR   v0, v3;        \
	ROTL_SSSE3(c8, v3);   \
	PADDL  v3, v2;        \
	PXOR   v2, v1;        \
	ROTL_SSE2(25, t, v1); \
	PSHUFL $0x39, v3, v3; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v1, v1

// PRECOMPUTE expands the 16 message words at src into ten pre-permuted 64-byte
// copies at off(dst), off+64(dst), ..., off+576(dst), one copy per round.
// Inside a copy the words are ordered so that the four 16-byte groups are
// exactly the m0, m1, m2, m3 operands of ROUND_SSE2 / ROUND_SSSE3 for that
// round. Each word is loaded once and scattered to its slot in all ten copies.
// t is a scratch 32-bit register and is clobbered.
#define PRECOMPUTE(dst, off, src, t) \
	MOVL 0*4(src), t;          \
	MOVL t, 0*4+off+0(dst);    \
	MOVL t, 9*4+off+64(dst);   \
	MOVL t, 5*4+off+128(dst);  \
	MOVL t, 14*4+off+192(dst); \
	MOVL t, 4*4+off+256(dst);  \
	MOVL t, 2*4+off+320(dst);  \
	MOVL t, 8*4+off+384(dst);  \
	MOVL t, 12*4+off+448(dst); \
	MOVL t, 3*4+off+512(dst);  \
	MOVL t, 15*4+off+576(dst); \
	MOVL 1*4(src), t;          \
	MOVL t, 4*4+off+0(dst);    \
	MOVL t, 8*4+off+64(dst);   \
	MOVL t, 14*4+off+128(dst); \
	MOVL t, 5*4+off+192(dst);  \
	MOVL t, 12*4+off+256(dst); \
	MOVL t, 11*4+off+320(dst); \
	MOVL t, 1*4+off+384(dst);  \
	MOVL t, 6*4+off+448(dst);  \
	MOVL t, 10*4+off+512(dst); \
	MOVL t, 3*4+off+576(dst);  \
	MOVL 2*4(src), t;          \
	MOVL t, 1*4+off+0(dst);    \
	MOVL t, 13*4+off+64(dst);  \
	MOVL t, 6*4+off+128(dst);  \
	MOVL t, 8*4+off+192(dst);  \
	MOVL t, 2*4+off+256(dst);  \
	MOVL t, 0*4+off+320(dst);  \
	MOVL t, 14*4+off+384(dst); \
	MOVL t, 11*4+off+448(dst); \
	MOVL t, 12*4+off+512(dst); \
	MOVL t, 4*4+off+576(dst);  \
	MOVL 3*4(src), t;          \
	MOVL t, 5*4+off+0(dst);    \
	MOVL t, 15*4+off+64(dst);  \
	MOVL t, 9*4+off+128(dst);  \
	MOVL t, 1*4+off+192(dst);  \
	MOVL t, 11*4+off+256(dst); \
	MOVL t, 7*4+off+320(dst);  \
	MOVL t, 13*4+off+384(dst); \
	MOVL t, 3*4+off+448(dst);  \
	MOVL t, 6*4+off+512(dst);  \
	MOVL t, 10*4+off+576(dst); \
	MOVL 4*4(src), t;          \
	MOVL t, 2*4+off+0(dst);    \
	MOVL t, 1*4+off+64(dst);   \
	MOVL t, 15*4+off+128(dst); \
	MOVL t, 10*4+off+192(dst); \
	MOVL t, 6*4+off+256(dst);  \
	MOVL t, 8*4+off+320(dst);  \
	MOVL t, 3*4+off+384(dst);  \
	MOVL t, 13*4+off+448(dst); \
	MOVL t, 14*4+off+512(dst); \
	MOVL t, 5*4+off+576(dst);  \
	MOVL 5*4(src), t;          \
	MOVL t, 6*4+off+0(dst);    \
	MOVL t, 11*4+off+64(dst);  \
	MOVL t, 2*4+off+128(dst);  \
	MOVL t, 9*4+off+192(dst);  \
	MOVL t, 1*4+off+256(dst);  \
	MOVL t, 13*4+off+320(dst); \
	MOVL t, 4*4+off+384(dst);  \
	MOVL t, 8*4+off+448(dst);  \
	MOVL t, 15*4+off+512(dst); \
	MOVL t, 7*4+off+576(dst);  \
	MOVL 6*4(src), t;          \
	MOVL t, 3*4+off+0(dst);    \
	MOVL t, 7*4+off+64(dst);   \
	MOVL t, 13*4+off+128(dst); \
	MOVL t, 12*4+off+192(dst); \
	MOVL t, 10*4+off+256(dst); \
	MOVL t, 1*4+off+320(dst);  \
	MOVL t, 9*4+off+384(dst);  \
	MOVL t, 14*4+off+448(dst); \
	MOVL t, 0*4+off+512(dst);  \
	MOVL t, 6*4+off+576(dst);  \
	MOVL 7*4(src), t;          \
	MOVL t, 7*4+off+0(dst);    \
	MOVL t, 14*4+off+64(dst);  \
	MOVL t, 10*4+off+128(dst); \
	MOVL t, 0*4+off+192(dst);  \
	MOVL t, 5*4+off+256(dst);  \
	MOVL t, 9*4+off+320(dst);  \
	MOVL t, 12*4+off+384(dst); \
	MOVL t, 1*4+off+448(dst);  \
	MOVL t, 13*4+off+512(dst); \
	MOVL t, 2*4+off+576(dst);  \
	MOVL 8*4(src), t;          \
	MOVL t, 8*4+off+0(dst);    \
	MOVL t, 5*4+off+64(dst);   \
	MOVL t, 4*4+off+128(dst);  \
	MOVL t, 15*4+off+192(dst); \
	MOVL t, 14*4+off+256(dst); \
	MOVL t, 3*4+off+320(dst);  \
	MOVL t, 11*4+off+384(dst); \
	MOVL t, 10*4+off+448(dst); \
	MOVL t, 7*4+off+512(dst);  \
	MOVL t, 1*4+off+576(dst);  \
	MOVL 9*4(src), t;          \
	MOVL t, 12*4+off+0(dst);   \
	MOVL t, 2*4+off+64(dst);   \
	MOVL t, 11*4+off+128(dst); \
	MOVL t, 4*4+off+192(dst);  \
	MOVL t, 0*4+off+256(dst);  \
	MOVL t, 15*4+off+320(dst); \
	MOVL t, 10*4+off+384(dst); \
	MOVL t, 7*4+off+448(dst);  \
	MOVL t, 5*4+off+512(dst);  \
	MOVL t, 9*4+off+576(dst);  \
	MOVL 10*4(src), t;         \
	MOVL t, 9*4+off+0(dst);    \
	MOVL t, 4*4+off+64(dst);   \
	MOVL t, 8*4+off+128(dst);  \
	MOVL t, 13*4+off+192(dst); \
	MOVL t, 3*4+off+256(dst);  \
	MOVL t, 5*4+off+320(dst);  \
	MOVL t, 7*4+off+384(dst);  \
	MOVL t, 15*4+off+448(dst); \
	MOVL t, 11*4+off+512(dst); \
	MOVL t, 0*4+off+576(dst);  \
	MOVL 11*4(src), t;         \
	MOVL t, 13*4+off+0(dst);   \
	MOVL t, 10*4+off+64(dst);  \
	MOVL t, 0*4+off+128(dst);  \
	MOVL t, 3*4+off+192(dst);  \
	MOVL t, 9*4+off+256(dst);  \
	MOVL t, 6*4+off+320(dst);  \
	MOVL t, 15*4+off+384(dst); \
	MOVL t, 4*4+off+448(dst);  \
	MOVL t, 2*4+off+512(dst);  \
	MOVL t, 12*4+off+576(dst); \
	MOVL 12*4(src), t;         \
	MOVL t, 10*4+off+0(dst);   \
	MOVL t, 12*4+off+64(dst);  \
	MOVL t, 1*4+off+128(dst);  \
	MOVL t, 6*4+off+192(dst);  \
	MOVL t, 13*4+off+256(dst); \
	MOVL t, 4*4+off+320(dst);  \
	MOVL t, 0*4+off+384(dst);  \
	MOVL t, 2*4+off+448(dst);  \
	MOVL t, 8*4+off+512(dst);  \
	MOVL t, 14*4+off+576(dst); \
	MOVL 13*4(src), t;         \
	MOVL t, 14*4+off+0(dst);   \
	MOVL t, 3*4+off+64(dst);   \
	MOVL t, 7*4+off+128(dst);  \
	MOVL t, 2*4+off+192(dst);  \
	MOVL t, 15*4+off+256(dst); \
	MOVL t, 12*4+off+320(dst); \
	MOVL t, 6*4+off+384(dst);  \
	MOVL t, 0*4+off+448(dst);  \
	MOVL t, 9*4+off+512(dst);  \
	MOVL t, 11*4+off+576(dst); \
	MOVL 14*4(src), t;         \
	MOVL t, 11*4+off+0(dst);   \
	MOVL t, 0*4+off+64(dst);   \
	MOVL t, 12*4+off+128(dst); \
	MOVL t, 7*4+off+192(dst);  \
	MOVL t, 8*4+off+256(dst);  \
	MOVL t, 14*4+off+320(dst); \
	MOVL t, 2*4+off+384(dst);  \
	MOVL t, 5*4+off+448(dst);  \
	MOVL t, 1*4+off+512(dst);  \
	MOVL t, 13*4+off+576(dst); \
	MOVL 15*4(src), t;         \
	MOVL t, 15*4+off+0(dst);   \
	MOVL t, 6*4+off+64(dst);   \
	MOVL t, 3*4+off+128(dst);  \
	MOVL t, 11*4+off+192(dst); \
	MOVL t, 7*4+off+256(dst);  \
	MOVL t, 10*4+off+320(dst); \
	MOVL t, 5*4+off+384(dst);  \
	MOVL t, 9*4+off+448(dst);  \
	MOVL t, 4*4+off+512(dst);  \
	MOVL t, 8*4+off+576(dst)

// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Compresses every 64-byte block of blocks into the chaining value h, adding
// 64 to the 64-bit counter c before each block. flag is mixed into the state
// of every block. On return h and c hold the updated values.
// len(blocks) is expected to be a multiple of 64; an empty slice is a no-op.
//
// Registers:
//   AX = h, BX = c, SI = current block, DX = bytes remaining
//   CX = scratch, BP = SP saved across the realignment, DI = scratch
//   X0,X1 = h[0..3], h[4..7]   X2 = counter increment   X3 = scratch
//   X4..X7 = working rows v0..v3
//
// Stack after SP is rounded up to a 16-byte boundary:
//     0(SP)  c[0], c[1], flag, 0   (XORed into row v3 for every block)
//    16(SP)  10 * 64 bytes of message words pre-permuted by PRECOMPUTE
TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// The loop below is a do/while: with no input it would process a block
	// that does not exist and DX would wrap around. Leave before SP, h or c
	// have been touched.
	TESTL DX, DX
	JEQ   done

	// MOVO with a memory operand needs 16-byte alignment: round SP up and
	// keep the original value in BP so it can be restored before RET.
	MOVL SP, BP
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI
	MOVL DI, SP

	// Build the 16-byte {c[0], c[1], flag, 0} vector at 0(SP).
	MOVL CX, 8(SP)
	MOVL 0(BX), CX
	MOVL CX, 0(SP)
	MOVL 4(BX), CX
	MOVL CX, 4(SP)
	XORL CX, CX
	MOVL CX, 12(SP)

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
	// v0,v1 = h; v2,v3 = IV.
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// counter += 64, then v3 ^= {counter, flag, 0}.
	MOVO  0(SP), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(SP)

	PRECOMPUTE(SP, 16, SI, CX)
	ROUND_SSE2(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3)
	ROUND_SSE2(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3)

	// Feed-forward: h ^= v[0..7] ^ v[8..15].
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write the advanced counter and the new chaining value back.
	MOVL 0(SP), CX
	MOVL CX, 0(BX)
	MOVL 4(SP), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	MOVL BP, SP

done:
	RET

// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// Same contract as hashBlocksSSE2, using PSHUFB for the byte-granular
// rotations. X0 and X1 hold the rol16 / rol8 shuffle masks during the rounds,
// so the chaining value is spilled to the stack for the duration of a block.
// len(blocks) is expected to be a multiple of 64; an empty slice is a no-op.
//
// Registers:
//   AX = h, BX = c, SI = current block, DX = bytes remaining
//   CX = scratch, BP = SP saved across the realignment, DI = scratch
//   X0,X1 = h between blocks, shuffle masks inside the rounds
//   X2 = counter increment, X3 = scratch, X4..X7 = working rows v0..v3
//
// Stack after SP is rounded up to a 16-byte boundary:
//     0(SP)  c[0], c[1], flag, 0   (XORed into row v3 for every block)
//    16(SP)  10 * 64 bytes of message words pre-permuted by PRECOMPUTE
//   656(SP)  spilled h[0..3]
//   672(SP)  spilled h[4..7]
TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
	MOVL h+0(FP), AX
	MOVL c+4(FP), BX
	MOVL flag+8(FP), CX
	MOVL blocks_base+12(FP), SI
	MOVL blocks_len+16(FP), DX

	// The loop below is a do/while: with no input it would process a block
	// that does not exist and DX would wrap around. Leave before SP, h or c
	// have been touched.
	TESTL DX, DX
	JEQ   done

	// MOVO with a memory operand needs 16-byte alignment: round SP up and
	// keep the original value in BP so it can be restored before RET.
	MOVL SP, BP
	MOVL SP, DI
	ADDL $15, DI
	ANDL $~15, DI
	MOVL DI, SP

	// Build the 16-byte {c[0], c[1], flag, 0} vector at 0(SP).
	MOVL CX, 8(SP)
	MOVL 0(BX), CX
	MOVL CX, 0(SP)
	MOVL 4(BX), CX
	MOVL CX, 4(SP)
	XORL CX, CX
	MOVL CX, 12(SP)

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU counter<>(SB), X2

loop:
	// Spill h, then v0,v1 = h; v2,v3 = IV.
	MOVO  X0, 656(SP)
	MOVO  X1, 672(SP)
	MOVO  X0, X4
	MOVO  X1, X5
	MOVOU iv0<>(SB), X6
	MOVOU iv1<>(SB), X7

	// counter += 64, then v3 ^= {counter, flag, 0}.
	MOVO  0(SP), X3
	PADDQ X2, X3
	PXOR  X3, X7
	MOVO  X3, 0(SP)

	// X0/X1 are free now that h is on the stack: load the shuffle masks.
	MOVOU rol16<>(SB), X0
	MOVOU rol8<>(SB), X1

	PRECOMPUTE(SP, 16, SI, CX)
	ROUND_SSSE3(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3, X0, X1)
	ROUND_SSSE3(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3, X0, X1)

	// Reload h and feed forward: h ^= v[0..7] ^ v[8..15].
	MOVO 656(SP), X0
	MOVO 672(SP), X1
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X0
	PXOR X7, X1

	LEAL 64(SI), SI
	SUBL $64, DX
	JNE  loop

	// Write the advanced counter and the new chaining value back.
	MOVL 0(SP), CX
	MOVL CX, 0(BX)
	MOVL 4(SP), CX
	MOVL CX, 4(BX)

	MOVOU X0, 0(AX)
	MOVOU X1, 16(AX)

	MOVL BP, SP

done:
	RET