;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_6vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
;;;
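;;;
;;; Computes six GF(2^8) dot products over the same set of source buffers,
;;; 64 bytes per pass, using AVX-512 nibble table lookups. As used by the
;;; routine below: len is the byte length of each buffer, vec the number of
;;; source buffers, g_tbls the concatenated 32-byte low/high-nibble lookup
;;; tables (one 32-byte entry per source per destination), buffs the array
;;; of source pointers and dests the array of the six destination pointers.
;;; The routine returns 0 in rax on success and 1 if len < 64.
;;;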

%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define tmp7  rbp		; must be saved and restored
 %define tmp8  rbx		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp
	push	rbx
 %endmacro
 %macro FUNC_RESTORE 0
	pop	rbx
	pop	rbp
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r15 		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define tmp7   rbp		; must be saved and restored
 %define tmp8   rbx		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 9*8		; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

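 ;; The frame below holds the ten non-volatile xmm registers (10*16 bytes)
 ;; followed by the eight saved GPRs; the ninth 8-byte slot is padding that
 ;; keeps stack_size an odd multiple of 8, so rsp is 16-byte aligned for the
 ;; vmovdqa saves after the call's return-address push.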
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	r12,  10*16 + 0*8
	save_reg	r13,  10*16 + 1*8
	save_reg	r14,  10*16 + 2*8
	save_reg	r15,  10*16 + 3*8
	save_reg	rdi,  10*16 + 4*8
	save_reg	rsi,  10*16 + 5*8
	save_reg	rbp,  10*16 + 6*8
	save_reg	rbx,  10*16 + 7*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12,  [rsp + 10*16 + 0*8]
	mov	r13,  [rsp + 10*16 + 1*8]
	mov	r14,  [rsp + 10*16 + 2*8]
	mov	r15,  [rsp + 10*16 + 3*8]
	mov	rdi,  [rsp + 10*16 + 4*8]
	mov	rsi,  [rsp + 10*16 + 5*8]
	mov	rbp,  [rsp + 10*16 + 6*8]
	mov	rbx,  [rsp + 10*16 + 7*8]
	add	rsp, stack_size
 %endmacro
%endif


%define len    arg0
%define vec    arg1
%define mul_array arg2
%define src    arg3
%define dest1  arg4
%define ptr    arg5
%define vec_i  tmp2
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6
%define dest5  tmp7
%define vskip1 tmp8
%define pos    return


%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define xmask0f   zmm20
%define xgft1_lo  zmm19
%define xgft1_loy ymm19
%define xgft1_hi  zmm18
%define xgft2_lo  zmm17
%define xgft2_loy ymm17
%define xgft2_hi  zmm16
%define xgft3_lo  zmm15
%define xgft3_loy ymm15
%define xgft3_hi  zmm14
%define xgft4_lo  zmm13
%define xgft4_loy ymm13
%define xgft4_hi  zmm12
%define xgft5_lo  zmm11
%define xgft5_loy ymm11
%define xgft5_hi  zmm10
%define xgft6_lo  zmm9
%define xgft6_loy ymm9
%define xgft6_hi  zmm8

%define x0        zmm0
%define xtmpa     zmm1
%define xp1       zmm2
%define xp2       zmm3
%define xp3       zmm4
%define xp4       zmm5
%define xp5       zmm6
%define xp6       zmm7

default rel
[bits 64]

section .text

align 16
mk_global gf_6vect_dot_prod_avx512, function
func(gf_6vect_dot_prod_avx512)
	FUNC_SAVE
	sub	len, 64
	jl	.return_fail

	xor	pos, pos
	mov	tmp, 0x0f
	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
	mov	vskip1, vec
	imul	vskip1, 32
	mov	vskip3, vec
	imul	vskip3, 96
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest2, [dest1+PS]
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]
	mov	dest5, [dest1+4*PS]

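	;; Main loop: each pass produces 64 bytes of output for all six
	;; destinations. For every source buffer the 64 input bytes are split
	;; into low and high nibbles, each nibble indexes a 16-byte vpshufb
	;; lookup table of GF(2^8) products for that source/destination pair,
	;; and the two halves are xored together and accumulated into xp1..xp6.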
.loop64:
	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	vpxorq	xp5, xp5, xp5
	vpxorq	xp6, xp6, xp6
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	mov	ptr, [src+vec_i]
	XLDR	x0, [ptr+pos]		;Get next source vector
	add	vec_i, PS

	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
	vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
	vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
	vmovdqu8 xgft4_loy, [tmp+vskip3]	;Load array Dx{00}..{0f}, Dx{00}..{f0}
	vmovdqu8 xgft5_loy, [tmp+vskip1*4]	;Load array Ex{00}..{0f}, Ex{00}..{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5
	vmovdqu8 xgft6_loy, [tmp+ptr]		;Load array Fx{00}..{0f}, Fx{00}..{f0}
	add	tmp, 32
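	;; g_tbls layout: each destination's tables occupy 32 bytes per source
	;; vector, giving the per-destination strides used above (1x..5x that
	;; block size via vec*(32/PS), vec*(64/PS), vskip3, vskip1*4 and
	;; vskip1*5). Each 32-byte row is the 16-byte low-nibble table followed
	;; by the 16-byte high-nibble table; the vshufi64x2 pairs below
	;; broadcast each 16-byte half across the full zmm (imm 0x00 selects
	;; the low table, 0x55 the high one).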

	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00

	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxorq	xp2, xp2, xgft2_hi		;xp2 += partial

	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
	vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
	vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00

	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxorq	xp3, xp3, xgft3_hi		;xp3 += partial

	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxorq	xp4, xp4, xgft4_hi		;xp4 += partial

	vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
	vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00

	vpshufb	xgft5_hi, xgft5_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft5_lo, xgft5_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft5_hi, xgft5_hi, xgft5_lo	;GF add high and low partials
	vpxorq	xp5, xp5, xgft5_hi		;xp5 += partial

	vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55
	vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00

	vpshufb	xgft6_hi, xgft6_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft6_lo, xgft6_lo, xtmpa	;Lookup mul table of low nibble
	vpxorq	xgft6_hi, xgft6_hi, xgft6_lo	;GF add high and low partials
	vpxorq	xp6, xp6, xgft6_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect

	mov	ptr, [dest1]			;reuse ptr
	mov	tmp, [dest1+5*PS]		;reuse tmp

	XSTR	[dest2+pos], xp2
	XSTR	[dest3+pos], xp3
	XSTR	[dest4+pos], xp4
	XSTR	[dest5+pos], xp5

	XSTR	[ptr+pos], xp1
	XSTR	[tmp+pos], xp6

	add	pos, 64			;Loop on 64 bytes at a time
	cmp	pos, len
	jle	.loop64

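	;; len was reduced by 64 on entry, so pos == len + 64 here only when
	;; the original length was a multiple of 64. Otherwise rewind pos to
	;; len and run one more full 64-byte pass over the tail; it overlaps
	;; bytes already written, which is safe because each pass recomputes
	;; the dot products from the sources.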
	lea	tmp, [len + 64]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len	;Overlapped offset length-64
	jmp	.loop64		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

%else
%ifidn __OUTPUT_FORMAT__, win64
global no_gf_6vect_dot_prod_avx512
no_gf_6vect_dot_prod_avx512:
%endif
%endif  ; ifdef HAVE_AS_KNOWS_AVX512
