;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
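;;; Computes three GF(2^8) vector dot products over a common set of
;;; sources:  dests[k][i] = sum_j ( gf_mul(g_tbls table k,j, buffs[j][i]) ),
;;; with XOR as the GF add. Argument roles as used by the code below
;;; (ISA-L's erasure-code headers carry the full contract):
;;;   len    - bytes per source/dest buffer; must be at least 32
;;;   vec    - number of source buffers
;;;   g_tbls - 3*vec concatenated 32-byte GF(2^8) multiply lookup tables
;;;   buffs  - array of vec source-buffer pointers
;;;   dests  - array of 3 destination-buffer pointers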

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define return rax
 %macro  SLDR   2
 %endmacro
 %define SSTR   SLDR
 %define PS     8
 %define LOG_PS 3

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif
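;; (The ELF64 mapping follows the System V AMD64 ABI: integer args in
;; rdi, rsi, rdx, rcx, r8, r9; r12/r13 are callee-saved, hence the
;; push/pop pairs in FUNC_SAVE/FUNC_RESTORE.)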

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r15 		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define return rax
 %macro  SLDR   2
 %endmacro
 %define SSTR   SLDR
 %define PS     8
 %define LOG_PS 3
 %define stack_size  6*16 + 5*8 	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	save_reg	r12,  6*16 + 0*8
	save_reg	r13,  6*16 + 1*8
	save_reg	r14,  6*16 + 2*8
	save_reg	r15,  6*16 + 3*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	mov	r12,  [rsp + 6*16 + 0*8]
	mov	r13,  [rsp + 6*16 + 1*8]
	mov	r14,  [rsp + 6*16 + 2*8]
	mov	r15,  [rsp + 6*16 + 3*8]
	add	rsp, stack_size
 %endmacro
%endif
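;; (Win64 passes the first four args in rcx, rdx, r8, r9; the fifth is
;; read from the caller's stack into r12 at the end of the prologue.
;; xmm6-xmm11 and r12-r15 are callee-saved, hence the spills above.)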

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x: endbranch
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 %define trans   ecx
 %define trans2  esi
 %define arg0    trans			;trans and trans2 alias values kept on the stack
 %define arg0_m  arg(0)
 %define arg1    ebx
 %define arg2    arg2_m
 %define arg2_m  arg(2)
 %define arg3    trans
 %define arg3_m  arg(3)
 %define arg4    trans
 %define arg4_m  arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp.w   edx
 %define tmp.b   dl
 %define tmp2    edi
 %define tmp3    trans2
 %define tmp3_m  var(0)
 %define tmp4    trans2
 %define tmp4_m  var(1)
 %define return  eax
 %macro SLDR     2			;stack load/store
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*2		;2 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*2		;2 local variables
	pop	ebp
 %endmacro
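;; (ELF32 cdecl: every argument lives on the caller's stack, so args and
;; spilled temporaries are reloaded/stored on demand through SLDR/SSTR;
;; ebx, esi, edi and ebp are callee-saved, hence the pushes in FUNC_SAVE.)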

%endif	; output formats

%define len   arg0
%define vec   arg1
%define mul_array arg2
%define	src   arg3
%define dest1 arg4
%define ptr   arg5

%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos   return

%ifidn PS,4				;32-bit code
 %define  len_m   arg0_m
 %define  src_m   arg3_m
 %define  dest1_m arg4_m
 %define  dest2_m tmp3_m
 %define  dest3_m tmp4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use aligned or non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif
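;; (With EC_ALIGNED_ADDR defined, buffers are assumed 32-byte aligned;
;; defining NO_NT_LDST as well selects plain aligned moves instead of
;; the non-temporal, cache-bypassing variants.)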

%ifidn PS,8				;64-bit code
 default rel
 [bits 64]
%endif

section .text

%ifidn PS,8				;64-bit code
 %define xmask0f   ymm11
 %define xmask0fx  xmm11
 %define xgft1_lo  ymm10
 %define xgft1_hi  ymm9
 %define xgft2_lo  ymm8
 %define xgft2_hi  ymm7
 %define xgft3_lo  ymm6
 %define xgft3_hi  ymm5

 %define x0     ymm0
 %define xtmpa  ymm1
 %define xp1    ymm2
 %define xp2    ymm3
 %define xp3    ymm4
%else
 %define xmask0f   ymm7
 %define xmask0fx  xmm7
 %define xgft1_lo  ymm6
 %define xgft1_hi  ymm5
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi

 %define x0     ymm0
 %define xtmpa  ymm1
 %define xp1    ymm2
 %define xp2    ymm3
 %define xp3    ymm4

%endif

align 16
mk_global gf_3vect_dot_prod_avx2, function
func(gf_3vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

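	;; Main loop: each iteration computes 32 output bytes for all
	;; three destinations, accumulating one GF partial product per
	;; source buffer (the inner .next_vect loop) into xp1..xp3.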
.loop32:
	vpxor	xp1, xp1
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
					;     "     Ax{00}, Ax{10}, ..., Ax{f0}
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
 %ifidn PS,8				; 64-bit code
	vmovdqu	   xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo

	vmovdqu	   xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo

	add	tmp, 32
	add	vec_i, PS
 %endif
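	;; GF(2^8) multiply of each source byte by a table constant via
	;; two 16-entry vpshufb lookups: for byte b the product is
	;; lo_tbl[b & 0x0f] XOR hi_tbl[b >> 4]. vpsraw shifts in 16-bit
	;; lanes, so the second vpand clears the bits dragged in from
	;; each neighboring byte.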
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 3-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 3-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 3-0

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				; 32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						;     "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
 %endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				; 32-bit code
	sal	vec, 1
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						;     "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
	sar	vec, 1
	add	tmp, 32
	add	vec_i, PS
 %endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len	;Overlapped offset length-32
	jmp	.loop32		;Do one more overlap pass
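	;; (len was reduced by 32 on entry, so pos = len makes this last
	;; pass end exactly at the buffer end. Overlapping bytes already
	;; written is safe: each pass recomputes the dot product from
	;; scratch rather than accumulating into memory.)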

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func                   core, ver, snum
slversion gf_3vect_dot_prod_avx2, 04,  05,  0197
