;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_5vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r12		; must be saved and restored
 %define tmp5   r14		; must be saved and restored
 %define tmp6   r15		; must be saved and restored
 %define tmp7   rbp		; must be saved and restored
 %define tmp8   rbx		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp
	push	rbx
 %endmacro
 %macro FUNC_RESTORE 0
	pop	rbx
	pop	rbp
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define tmp7   rbp		; must be saved and restored
 %define tmp8   rbx		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 9*8	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
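 ;; Editor's note: FUNC_SAVE/FUNC_RESTORE below follow the Windows x64
 ;; calling convention.  The non-volatile xmm6-xmm15 and GPRs used by this
 ;; routine are spilled to the frame allocated by alloc_stack, and the
 ;; fifth argument (arg4) is reloaded from the caller's stack via arg(4)
 ;; once the prologue ends.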
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	r12,  10*16 + 0*8
	save_reg	r13,  10*16 + 1*8
	save_reg	r14,  10*16 + 2*8
	save_reg	r15,  10*16 + 3*8
	save_reg	rdi,  10*16 + 4*8
	save_reg	rsi,  10*16 + 5*8
	save_reg	rbp,  10*16 + 6*8
	save_reg	rbx,  10*16 + 7*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12,  [rsp + 10*16 + 0*8]
	mov	r13,  [rsp + 10*16 + 1*8]
	mov	r14,  [rsp + 10*16 + 2*8]
	mov	r15,  [rsp + 10*16 + 3*8]
	mov	rdi,  [rsp + 10*16 + 4*8]
	mov	rsi,  [rsp + 10*16 + 5*8]
	mov	rbp,  [rsp + 10*16 + 6*8]
	mov	rbx,  [rsp + 10*16 + 7*8]
	add	rsp, stack_size
 %endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define dest5     tmp7
%define vskip1    tmp8
%define pos       return


%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define xmask0f   zmm17
%define xgft1_lo  zmm16
%define xgft1_loy ymm16
%define xgft1_hi  zmm15
%define xgft2_lo  zmm14
%define xgft2_loy ymm14
%define xgft2_hi  zmm13
%define xgft3_lo  zmm12
%define xgft3_loy ymm12
%define xgft3_hi  zmm11
%define xgft4_lo  zmm10
%define xgft4_loy ymm10
%define xgft4_hi  zmm9
%define xgft5_lo  zmm8
%define xgft5_loy ymm8
%define xgft5_hi  zmm7

%define x0    zmm0
%define xtmpa zmm1
%define xp1   zmm2
%define xp2   zmm3
%define xp3   zmm4
%define xp4   zmm5
%define xp5   zmm6

default rel
[bits 64]

section .text

align 16
mk_global gf_5vect_dot_prod_avx512, function
func(gf_5vect_dot_prod_avx512)
	FUNC_SAVE
	sub	len, 64
	jl	.return_fail

	xor	pos, pos
	mov	tmp, 0x0f
	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
	mov	vskip1, vec
	imul	vskip1, 32
	mov	vskip3, vec
	imul	vskip3, 96
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
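	;; Editor's note: mul_array (g_tbls) holds one 32-byte lookup table per
	;; source for each of the five destinations, grouped by destination:
	;; dest1 tables start at g_tbls, dest2 at +32*vec, dest3 at +64*vec,
	;; dest4 at +96*vec (vskip3) and dest5 at +128*vec (4*vskip1).
	;; Inside .next_vect, tmp walks the dest1 block 32 bytes per source.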
	mov	dest2, [dest1+PS]
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]
	mov	dest5, [dest1+4*PS]
	mov	dest1, [dest1]

.loop64:
	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	vpxorq	xp5, xp5, xp5
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	mov	ptr, [src+vec_i]
	XLDR	x0, [ptr+pos]		;Get next source vector
	add	vec_i, PS

	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
	vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
	vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
	vmovdqu8 xgft4_loy, [tmp+vskip3]	;Load array Dx{00}..{0f}, Dx{00}..{f0}
	vmovdqu8 xgft5_loy, [tmp+vskip1*4]	;Load array Ex{00}..{0f}, Ex{00}..{f0}
	add	tmp, 32

	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00

	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxorq	xp2, xp2, xgft2_hi		;xp2 += partial

	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
	vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
	vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00

	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxorq	xp3, xp3, xgft3_hi		;xp3 += partial

	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxorq	xp4, xp4, xgft4_hi		;xp4 += partial

	vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
	vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00

	vpshufb	xgft5_hi, xgft5_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft5_lo, xgft5_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft5_hi, xgft5_hi, xgft5_lo	;GF add high and low partials
	vpxorq	xp5, xp5, xgft5_hi		;xp5 += partial

	cmp	vec_i, vec
	jl	.next_vect

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[dest3+pos], xp3
	XSTR	[dest4+pos], xp4
	XSTR	[dest5+pos], xp5

	add	pos, 64			;Loop on 64 bytes at a time
	cmp	pos, len
	jle	.loop64

	lea	tmp, [len + 64]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-64
	jmp	.loop64			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

%else
%ifidn __OUTPUT_FORMAT__, win64
global no_gf_5vect_dot_prod_avx512
no_gf_5vect_dot_prod_avx512:
%endif
%endif				; ifdef HAVE_AS_KNOWS_AVX512
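
;;;
;;; Usage notes (editor's summary, inferred from the code above rather than
;;; taken from the original sources):
;;;   len    - byte length of each source/destination buffer; values below 64
;;;            make the routine return 1 without writing any output.
;;;   vec    - number of source buffers.
;;;   g_tbls - 5 * vec concatenated 32-byte GF(2^8) multiplication tables
;;;            (16 low-nibble and 16 high-nibble products each), grouped by
;;;            destination.
;;;   buffs  - array of vec source pointers; dests - array of 5 destination
;;;            pointers.
;;; Each destination byte is the XOR of the nibble-table lookups for the
;;; matching byte of every source, i.e. a GF(2^8) dot product computed
;;; 64 bytes at a time, with any partial block handled by one overlapped
;;; pass over the last 64 bytes.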