;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_6vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13             ; must be saved and restored
 %define tmp4   r12             ; must be saved and restored
 %define tmp5   r14             ; must be saved and restored
 %define tmp6   r15             ; must be saved and restored
 %define tmp7   rbp             ; must be saved and restored
 %define tmp8   rbx             ; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
        push    r12
        push    r13
        push    r14
        push    r15
        push    rbp
        push    rbx
 %endmacro
 %macro FUNC_RESTORE 0
        pop     rbx
        pop     rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12             ; must be saved, loaded and restored
 %define arg5   r15             ; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13             ; must be saved and restored
 %define tmp4   r14             ; must be saved and restored
 %define tmp5   rdi             ; must be saved and restored
 %define tmp6   rsi             ; must be saved and restored
 %define tmp7   rbp             ; must be saved and restored
 %define tmp8   rbx             ; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 9*8        ; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
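        ;; Win64 prologue: reserve stack space, spill xmm6-xmm15 and the
        ;; non-volatile GPRs used below, then load the fifth argument from
        ;; the caller's stack (win64 passes only four arguments in registers).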
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        save_reg        r12,  10*16 + 0*8
        save_reg        r13,  10*16 + 1*8
        save_reg        r14,  10*16 + 2*8
        save_reg        r15,  10*16 + 3*8
        save_reg        rdi,  10*16 + 4*8
        save_reg        rsi,  10*16 + 5*8
        save_reg        rbp,  10*16 + 6*8
        save_reg        rbx,  10*16 + 7*8
        end_prolog
        mov     arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]
        mov     r12,  [rsp + 10*16 + 0*8]
        mov     r13,  [rsp + 10*16 + 1*8]
        mov     r14,  [rsp + 10*16 + 2*8]
        mov     r15,  [rsp + 10*16 + 3*8]
        mov     rdi,  [rsp + 10*16 + 4*8]
        mov     rsi,  [rsp + 10*16 + 5*8]
        mov     rbp,  [rsp + 10*16 + 6*8]
        mov     rbx,  [rsp + 10*16 + 7*8]
        add     rsp, stack_size
 %endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define dest5     tmp7
%define vskip1    tmp8
%define pos       return


%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define xmask0f   zmm20
%define xgft1_lo  zmm19
%define xgft1_loy ymm19
%define xgft1_hi  zmm18
%define xgft2_lo  zmm17
%define xgft2_loy ymm17
%define xgft2_hi  zmm16
%define xgft3_lo  zmm15
%define xgft3_loy ymm15
%define xgft3_hi  zmm14
%define xgft4_lo  zmm13
%define xgft4_loy ymm13
%define xgft4_hi  zmm12
%define xgft5_lo  zmm11
%define xgft5_loy ymm11
%define xgft5_hi  zmm10
%define xgft6_lo  zmm9
%define xgft6_loy ymm9
%define xgft6_hi  zmm8

%define x0    zmm0
%define xtmpa zmm1
%define xp1   zmm2
%define xp2   zmm3
%define xp3   zmm4
%define xp4   zmm5
%define xp5   zmm6
%define xp6   zmm7

default rel
[bits 64]

section .text

align 16
mk_global gf_6vect_dot_prod_avx512, function
func(gf_6vect_dot_prod_avx512)
        FUNC_SAVE
        sub     len, 64
        jl      .return_fail

        xor     pos, pos
        mov     tmp, 0x0f
        vpbroadcastb xmask0f, tmp       ;Construct mask 0x0f0f0f...
        mov     vskip1, vec
        imul    vskip1, 32
        mov     vskip3, vec
        imul    vskip3, 96
        sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
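        ;; Keep dest2..dest5 in dedicated callee-saved registers; dest1 and
        ;; dest6 are re-read from the dests array after the inner loop, once
        ;; ptr and tmp are free again.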
        mov     dest2, [dest1+PS]
        mov     dest3, [dest1+2*PS]
        mov     dest4, [dest1+3*PS]
        mov     dest5, [dest1+4*PS]

.loop64:
        vpxorq  xp1, xp1, xp1
        vpxorq  xp2, xp2, xp2
        vpxorq  xp3, xp3, xp3
        vpxorq  xp4, xp4, xp4
        vpxorq  xp5, xp5, xp5
        vpxorq  xp6, xp6, xp6
        mov     tmp, mul_array
        xor     vec_i, vec_i

.next_vect:
        mov     ptr, [src+vec_i]
        XLDR    x0, [ptr+pos]           ;Get next source vector
        add     vec_i, PS

        vpandq  xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
        vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
        vpandq  x0, x0, xmask0f         ;Mask high src nibble in bits 4-0

        vmovdqu8 xgft1_loy, [tmp]               ;Load array Ax{00}..{0f}, Ax{00}..{f0}
        vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]   ;Load array Bx{00}..{0f}, Bx{00}..{f0}
        vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]   ;Load array Cx{00}..{0f}, Cx{00}..{f0}
        vmovdqu8 xgft4_loy, [tmp+vskip3]        ;Load array Dx{00}..{0f}, Dx{00}..{f0}
        vmovdqu8 xgft5_loy, [tmp+vskip1*4]      ;Load array Ex{00}..{0f}, Ex{00}..{f0}
        lea     ptr, [vskip1 + vskip1*4]        ;ptr = vskip5
        vmovdqu8 xgft6_loy, [tmp+ptr]           ;Load array Fx{00}..{0f}, Fx{00}..{f0}
        add     tmp, 32

        vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
        vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
        vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
        vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00

        vpshufb xgft1_hi, xgft1_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft1_lo, xgft1_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft1_hi, xgft1_hi, xgft1_lo    ;GF add high and low partials
        vpxorq  xp1, xp1, xgft1_hi              ;xp1 += partial

        vpshufb xgft2_hi, xgft2_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft2_lo, xgft2_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft2_hi, xgft2_hi, xgft2_lo    ;GF add high and low partials
        vpxorq  xp2, xp2, xgft2_hi              ;xp2 += partial

        vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
        vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
        vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
        vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00

        vpshufb xgft3_hi, xgft3_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft3_lo, xgft3_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft3_hi, xgft3_hi, xgft3_lo    ;GF add high and low partials
        vpxorq  xp3, xp3, xgft3_hi              ;xp3 += partial

        vpshufb xgft4_hi, xgft4_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft4_lo, xgft4_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft4_hi, xgft4_hi, xgft4_lo    ;GF add high and low partials
        vpxorq  xp4, xp4, xgft4_hi              ;xp4 += partial

        vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
        vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00

        vpshufb xgft5_hi, xgft5_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft5_lo, xgft5_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft5_hi, xgft5_hi, xgft5_lo    ;GF add high and low partials
        vpxorq  xp5, xp5, xgft5_hi              ;xp5 += partial

        vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55
        vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00

        vpshufb xgft6_hi, xgft6_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft6_lo, xgft6_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft6_hi, xgft6_hi, xgft6_lo    ;GF add high and low partials
        vpxorq  xp6, xp6, xgft6_hi              ;xp6 += partial

        cmp     vec_i, vec
        jl      .next_vect

        mov     ptr, [dest1]            ;reuse ptr
        mov     tmp, [dest1+5*PS]       ;reuse tmp

        XSTR    [dest2+pos], xp2
        XSTR    [dest3+pos], xp3
        XSTR    [dest4+pos], xp4
        XSTR    [dest5+pos], xp5

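        ;; Store the remaining two results; ptr and tmp now hold the dest1
        ;; and dest6 pointers reloaded after the inner loop.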
        XSTR    [ptr+pos], xp1
        XSTR    [tmp+pos], xp6

        add     pos, 64                 ;Loop on 64 bytes at a time
        cmp     pos, len
        jle     .loop64

        lea     tmp, [len + 64]
        cmp     pos, tmp
        je      .return_pass

        ;; Tail len
        mov     pos, len                ;Overlapped offset length-64
        jmp     .loop64                 ;Do one more overlap pass

.return_pass:
        mov     return, 0
        FUNC_RESTORE
        ret

.return_fail:
        mov     return, 1
        FUNC_RESTORE
        ret

endproc_frame

%else
%ifidn __OUTPUT_FORMAT__, win64
global no_gf_6vect_dot_prod_avx512
no_gf_6vect_dot_prod_avx512:
%endif
%endif  ; ifdef HAVE_AS_KNOWS_AVX512
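
;;;
;;; Usage sketch (an assumption based on the signature comment above, not part
;;; of the original file): from C this routine computes six GF(2^8) dot
;;; products over 'vec' source buffers, roughly:
;;;
;;;   /* g_tbls holds 6*32*vec bytes of low/high-nibble lookup tables,
;;;      buffs points to vec source buffers, dests to 6 output buffers */
;;;   gf_6vect_dot_prod_avx512(len, vec, g_tbls, buffs, dests);
;;;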