// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

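// ROUND_AVX2 performs one full BLAKE2b round on the 4x4 state matrix held
// row-wise in Y0-Y3 (rows a, b, c, d). The G-function rotations are done
// without a native rotate instruction: VPSHUFD $-79 rotates each 64-bit word
// by 32 bits, VPSHUFB with ·AVX2_c40/·AVX2_c48 rotates by 24 and 16 bits,
// and the add/shift/xor sequence rotates by 63 bits. The VPERMQ_* macros
// (raw VEX encodings above) rotate rows b, c and d into diagonal position
// for the second half of the round and back again.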
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ m0, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m1, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y1_Y1; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y3_Y3; \
	VPADDQ m2, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m3, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y3_Y3; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y1_Y1

#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

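// The VMOVQ_SI_* and VPINSRQ_1_SI_* macros above are VMOVQ and VPINSRQ with
// an (SI)+disp8 memory operand, emitted as raw VEX-encoded bytes; the *_0
// variants use the no-displacement form for message word 0. The LOAD_MSG_*
// macros below use them to gather the 16 message words of a block, permuted
// by the round's sigma schedule, into Y12-Y15 (X12-X15 in the AVX path).
// Where a permutation allows it, two adjacent words are fetched with a
// single VMOVDQU, or an adjacent pair in swapped order with VPSHUFD $0x4E.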
// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X13(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X14(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X15(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X11(6*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8); \
	VPSHUFD $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8); \
	VMOVDQU 11*8(SI), X12; \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	VMOVQ_SI_X13(8*8); \
	VMOVQ_SI_X11(2*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X11(13*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
	VMOVQ_SI_X15(6*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X15(10*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X13(7*8); \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0; \
	VPSHUFD $0x4E, 8*8(SI), X11; \
	VPINSRQ_1_SI_X14(6*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
	VMOVQ_SI_X15_0; \
	VMOVQ_SI_X11(6*8); \
	VPINSRQ_1_SI_X15(4*8); \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8); \
	VMOVQ_SI_X11(11*8); \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8); \
	VMOVDQU 12*8(SI), X14; \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	VMOVQ_SI_X15(2*8); \
	VMOVDQU 4*8(SI), X11; \
	VPINSRQ_1_SI_X15(7*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
	VMOVQ_SI_X13(2*8); \
	VPSHUFD $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8); \
	VMOVQ_SI_X11(12*8); \
	VPINSRQ_1_SI_X15(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y15, Y15

// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

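	// Keep the original stack pointer in DX and round SP up to a 32-byte
	// boundary: the message spills below use VMOVDQA, which requires
	// 32-byte aligned addresses. DX is restored before RET.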
	MOVQ SP, DX
	MOVQ SP, R9
	ADDQ $31, R9
	ANDQ $~31, R9
	MOVQ R9, SP

	MOVQ CX, 16(SP)
	XORQ CX, CX
	MOVQ CX, 24(SP)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9
	MOVQ R9, 8(SP)

loop:
	ADDQ $128, R8
	MOVQ R8, 0(SP)
	CMPQ R8, $128
	JGE  noinc
	INCQ R9
	MOVQ R9, 8(SP)

noinc:
	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR 0(SP), Y7, Y3

	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	VMOVDQA Y12, 32(SP)
	VMOVDQA Y13, 64(SP)
	VMOVDQA Y14, 96(SP)
	VMOVDQA Y15, 128(SP)
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	VMOVDQA Y12, 160(SP)
	VMOVDQA Y13, 192(SP)
	VMOVDQA Y14, 224(SP)
	VMOVDQA Y15, 256(SP)

	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)

	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	MOVQ DX, SP
	RET

#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

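// SHUFFLE_AVX rotates rows b (X2, X3), c (X4, X5) and d (X6, X7) of the
// 128-bit state so that the following HALF_ROUND_AVX operates on the
// diagonals of the state matrix; SHUFFLE_AVX_INV undoes the rotation.
// The VPUNPCK*QDQ macros above are the raw VEX encodings used for the
// lane swaps.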
#define SHUFFLE_AVX() \
	VMOVDQA X6, X13; \
	VMOVDQA X2, X14; \
	VMOVDQA X4, X6; \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4; \
	VMOVDQA X6, X5; \
	VPUNPCKHQDQ_X15_X7_X6; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X13_X7; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VPUNPCKHQDQ_X15_X2_X2; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3; \

#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13; \
	VMOVDQA X4, X14; \
	VPUNPCKLQDQ_X2_X2_X15; \
	VMOVDQA X5, X4; \
	VPUNPCKHQDQ_X15_X3_X2; \
	VMOVDQA X14, X5; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VMOVDQA X6, X14; \
	VPUNPCKHQDQ_X15_X13_X3; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X6_X6; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7; \

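// HALF_ROUND_AVX is one half of a BLAKE2b round in the AVX code path: it
// applies the G function to all four columns (or, between SHUFFLE_AVX and
// SHUFFLE_AVX_INV, all four diagonals) at once, two 64-bit words per XMM
// register. The rotations use the same tricks as ROUND_AVX2: VPSHUFD for 32,
// VPSHUFB with c40/c48 for 24 and 16, and add/shift/xor for 63.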
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ m0, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m1, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPSHUFB c40, v2, v2; \
	VPSHUFB c40, v3, v3; \
	VPADDQ m2, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m3, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFB c48, v6, v6; \
	VPSHUFB c48, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPADDQ v2, v2, t0; \
	VPSRLQ $63, v2, v2; \
	VPXOR t0, v2, v2; \
	VPADDQ v3, v3, t0; \
	VPSRLQ $63, v3, v3; \
	VPXOR t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X13(i2*8); \
	VMOVQ_SI_X14(i4*8); \
	VMOVQ_SI_X15(i6*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(1*8); \
	VMOVQ_SI_X15(5*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8); \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(7*8); \
	VPINSRQ_1_SI_X13(5*8); \
	VPINSRQ_1_SI_X14(2*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12; \
	VMOVQ_SI_X13(5*8); \
	VMOVQ_SI_X14(8*8); \
	VMOVQ_SI_X15(2*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0; \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(6*8); \
	VMOVQ_SI_X15_0; \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8); \
	VMOVQ_SI_X13(2*8); \
	VMOVQ_SI_X14_0; \
	VMOVQ_SI_X15(4*8); \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(11*8); \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X13(8*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ 0*8(SI), X12; \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ 7*8(SI), X14; \
	MOVQ 2*8(SI), X15; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12; \
	MOVQ 11*8(SI), X13; \
	MOVQ 15*8(SI), X14; \
	MOVQ 3*8(SI), X15; \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X14(9*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12; \
	MOVQ 8*8(SI), X13; \
	MOVQ 0*8(SI), X14; \
	MOVQ 6*8(SI), X15; \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8); \
	VPINSRQ_1_SI_X14(4*8); \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12; \
	MOVQ 1*8(SI), X13; \
	MOVQ 2*8(SI), X14; \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12; \
	MOVQ 3*8(SI), X13; \
	MOVQ 11*8(SI), X14; \
	MOVQ 12*8(SI), X15; \
	VPINSRQ_1_SI_X12(9*8); \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ c+8(FP), BX
	MOVQ flag+16(FP), CX
	MOVQ blocks_base+24(FP), SI
	MOVQ blocks_len+32(FP), DI

	MOVQ SP, BP
	MOVQ SP, R9
	ADDQ $15, R9
	ANDQ $~15, R9
	MOVQ R9, SP

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(SP)
	XORQ CX, 0(SP) // 0(SP) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ 0(BX), R8
	MOVQ 8(BX), R9

loop:
	ADDQ $128, R8
	CMPQ R8, $128
	JGE  noinc
	INCQ R9

noinc:
	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR X15, X6, X6
	VMOVDQA 0(SP), X7

	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	VMOVDQA X12, 16(SP)
	VMOVDQA X13, 32(SP)
	VMOVDQA X14, 48(SP)
	VMOVDQA X15, 64(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	VMOVDQA X12, 80(SP)
	VMOVDQA X13, 96(SP)
	VMOVDQA X14, 112(SP)
	VMOVDQA X15, 128(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

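	// Round 2. As in round 1, the message vectors are spilled to the stack
	// (rounds 1 and 2 use 16..128(SP) and 144..256(SP) respectively): the
	// BLAKE2b schedule repeats sigma[0] and sigma[1] in rounds 11 and 12,
	// which reload them from memory below instead of regathering the words.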
	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	VMOVDQA X12, 144(SP)
	VMOVDQA X13, 160(SP)
	VMOVDQA X14, 176(SP)
	VMOVDQA X15, 192(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	VMOVDQA X12, 208(SP)
	VMOVDQA X13, 224(SP)
	VMOVDQA X14, 240(SP)
	VMOVDQA X15, 256(SP)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
	SHUFFLE_AVX()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
	SHUFFLE_AVX_INV()

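	// Feed-forward: h[i] ^= v[i] ^ v[i+8]. h[0..3] stay in X10/X11 across
	// blocks and are written back after the loop; h[4..7] are reloaded from
	// and stored back to memory every iteration.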
	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR X0, X10, X10
	VPXOR X1, X11, X11
	VPXOR X2, X14, X14
	VPXOR X3, X15, X15
	VPXOR X4, X10, X10
	VPXOR X5, X11, X11
	VPXOR X6, X14, X2
	VPXOR X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	LEAQ 128(SI), SI
	SUBQ $128, DI
	JNE  loop

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)

	MOVQ R8, 0(BX)
	MOVQ R9, 8(BX)
	VZEROUPPER

	MOVQ BP, SP
	RET