1#include "consts.h"
2.include "shuffle.inc"
3.include "fq.inc"
4
/*
 * butterfly rl0..rl3, rh0..rh3 [, zl0, zl1, zh0, zh1]
 *
 * Add/subtract-and-multiply step applied to four register pairs at once.
 * All arguments are YMM register numbers (each is pasted after "%ymm").
 *
 * For every 16-bit lane of each pair (rl, rh), with (zl, zh) being
 * (zl0, zh0) for pairs 0-1 and (zl1, zh1) for pairs 2-3:
 *
 *     d  = rh - rl
 *     rl = rl + rh                     (written back without any reduction)
 *     rh = mulhi(d, zh) - mulhi(mullo(d, zl), ymm0)
 *
 * where mullo/mulhi are the low/high 16 bits of the signed 16x16 product
 * (vpmullw / vpmulhw).
 *
 * NOTE(review): this has the shape of a Montgomery multiplication of d by a
 * twiddle, presumably with zh = zeta and zl = zeta * q^-1 mod 2^16 -- confirm
 * against the code that builds the twiddle table.
 *
 * Inputs:   ymm0 must hold the modulus vector; invntt_avx loads it from
 *           _16XQ before expanding any macro.
 *           zl0/zl1 default to ymm2, zh0/zh1 default to ymm3.
 * Clobbers: ymm12, ymm13, ymm14, ymm15 (hold the four differences d).
 *
 * The sums, differences and multiplies of the four pairs are deliberately
 * interleaved; keep the instruction order when editing.
 */
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
/* d = rh - rl into ymm12..15, rl += rh, rh = mullo(d, zl) */
vpsubw	%ymm\rl0,%ymm\rh0,%ymm12
vpaddw	%ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw	%ymm\rl1,%ymm\rh1,%ymm13

vpmullw	%ymm\zl0,%ymm12,%ymm\rh0
vpaddw	%ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw	%ymm\rl2,%ymm\rh2,%ymm14

vpmullw	%ymm\zl0,%ymm13,%ymm\rh1
vpaddw	%ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw	%ymm\rl3,%ymm\rh3,%ymm15

vpmullw	%ymm\zl1,%ymm14,%ymm\rh2
vpaddw	%ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw	%ymm\zl1,%ymm15,%ymm\rh3

/* ymm12..15 = mulhi(d, zh) */
vpmulhw	%ymm\zh0,%ymm12,%ymm12
vpmulhw	%ymm\zh0,%ymm13,%ymm13

vpmulhw	%ymm\zh1,%ymm14,%ymm14
vpmulhw	%ymm\zh1,%ymm15,%ymm15

/* rh = mulhi(mullo(d, zl), ymm0) */
vpmulhw	%ymm0,%ymm\rh0,%ymm\rh0

vpmulhw	%ymm0,%ymm\rh1,%ymm\rh1

vpmulhw	%ymm0,%ymm\rh2,%ymm\rh2
vpmulhw	%ymm0,%ymm\rh3,%ymm\rh3

/* rh = mulhi(d, zh) - mulhi(mullo(d, zl), ymm0) */
vpsubw	%ymm\rh0,%ymm12,%ymm\rh0

vpsubw	%ymm\rh1,%ymm13,%ymm\rh1

vpsubw	%ymm\rh2,%ymm14,%ymm\rh2
vpsubw	%ymm\rh3,%ymm15,%ymm\rh3
.endm
46
/*
 * intt_levels0t5 off
 *
 * Runs butterfly levels 0 through 5 on one block of 128 16-bit coefficients,
 * in place.
 *
 *   off   0 or 1; selects the block at byte offset 128*off*2 from rdi and
 *         the twiddle block at _ZETAS_EXP + (1-off)*224 (16-bit units), so
 *         off = 0 reads the upper twiddle block and off = 1 the lower one.
 *
 * Inputs:   rdi = coefficient array, rsi = constant table. Every access is
 *           a vmovdqa or an aligned-offset load, so both must be 32-byte
 *           aligned. ymm0 must hold the modulus vector used by butterfly.
 * Clobbers: ymm1-ymm15.
 *
 * fqmulprecomp, red16 and shuffle1/2/4/8 come from the included .inc files
 * and are not visible here.
 * NOTE(review): presumably fqmulprecomp a,b,r multiplies ymm<r> by the
 * constant pair in ymm<a>/ymm<b>, red16 r reduces ymm<r> using ymm1, and
 * shuffleN a,b,c,d interleaves ymm<a>/ymm<b> into ymm<c>/ymm<d> -- confirm
 * in fq.inc and shuffle.inc.
 */
.macro intt_levels0t5 off
/* level 0 */
/* ymm2/ymm3 first carry the scaling constant pair for fqmulprecomp */
vmovdqa	_16XFLO*2(%rsi),%ymm2
vmovdqa	_16XFHI*2(%rsi),%ymm3

vmovdqa	(128*\off+  0)*2(%rdi),%ymm4
vmovdqa	(128*\off+ 32)*2(%rdi),%ymm6
vmovdqa	(128*\off+ 16)*2(%rdi),%ymm5
vmovdqa	(128*\off+ 48)*2(%rdi),%ymm7

fqmulprecomp	2,3,4
fqmulprecomp	2,3,6
fqmulprecomp	2,3,5
fqmulprecomp	2,3,7

vmovdqa	(128*\off+ 64)*2(%rdi),%ymm8
vmovdqa	(128*\off+ 96)*2(%rdi),%ymm10
vmovdqa	(128*\off+ 80)*2(%rdi),%ymm9
vmovdqa	(128*\off+112)*2(%rdi),%ymm11

fqmulprecomp	2,3,8
fqmulprecomp	2,3,10
fqmulprecomp	2,3,9
fqmulprecomp	2,3,11

/*
 * Twiddles for level 0: swap the two 128-bit halves (vpermq 0x4E), then
 * permute bytes with the _REVIDXB mask. ymm2/ymm3 are reused here, which
 * is safe because all eight fqmulprecomp calls are done.
 * Result: zl0 = ymm15, zl1 = ymm1, zh0 = ymm2, zh1 = ymm3.
 */
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa	_REVIDXB*2(%rsi),%ymm12
vpshufb	%ymm12,%ymm15,%ymm15
vpshufb	%ymm12,%ymm1,%ymm1
vpshufb	%ymm12,%ymm2,%ymm2
vpshufb	%ymm12,%ymm3,%ymm3

butterfly	4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
/* the mask is reloaded because butterfly overwrote ymm12 */
vmovdqa	_REVIDXB*2(%rsi),%ymm1
vpshufb	%ymm1,%ymm2,%ymm2
vpshufb	%ymm1,%ymm3,%ymm3

butterfly	4,5,6,7,8,9,10,11,2,2,3,3

shuffle1	4,5,3,5
shuffle1	6,7,4,7
shuffle1	8,9,6,9
shuffle1	10,11,8,11

/* level 2 */
/* twiddles are permuted at 32-bit granularity with the _REVIDXD index */
vmovdqa	_REVIDXD*2(%rsi),%ymm12
vpermd	(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd	(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

butterfly	3,4,6,8,5,7,9,11,2,2,10,10

/* ymm1 = _16XV is set up immediately before red16 */
vmovdqa	_16XV*2(%rsi),%ymm1
red16	3

shuffle2	3,4,10,4
shuffle2	6,8,3,8
shuffle2	5,7,6,7
shuffle2	9,11,5,11

/* level 3 */
/* vpermq 0x1B reverses the order of the four 64-bit words */
vpermq	$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq	$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

butterfly	10,3,6,5,4,8,7,11,2,2,9,9

shuffle4	10,3,9,3
shuffle4	6,5,10,5
shuffle4	4,8,6,8
shuffle4	7,11,4,11

/* level 4 */
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq	$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

butterfly	9,10,6,4,3,5,8,11,2,2,7,7

/* ymm1 still holds _16XV from level 2 */
red16	9

shuffle8	9,10,7,10
shuffle8	6,4,9,4
shuffle8	3,5,6,5
shuffle8	8,11,3,11

/* level 5 */
/* twiddles are used exactly as stored, no permutation */
vmovdqa	(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa	(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

butterfly	7,9,6,3,10,4,5,11,2,2,8,8

/* write the block back in ascending address order */
vmovdqa	%ymm7,(128*\off+  0)*2(%rdi)
vmovdqa	%ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa	%ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa	%ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa	%ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa	%ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa	%ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa	%ymm11,(128*\off+112)*2(%rdi)
.endm
152
/*
 * intt_level6 off
 *
 * Final butterfly level: pairs each coefficient with the one 128 positions
 * further on. One expansion handles 64 coefficients from the lower half of
 * the array (starting at 64*off) and the 64 matching ones from the upper
 * half (starting at 64*off + 128), in place.
 *
 *   off   0 or 1; selects which 64-coefficient slice of each half is used.
 *
 * Inputs:   rdi = coefficient array, rsi = constant table (32-byte aligned,
 *           the data accesses are vmovdqa). ymm0 must hold the modulus
 *           vector used by butterfly.
 * Clobbers: ymm2-ymm15, plus whatever red16 touches when off == 0.
 */
.macro intt_level6 off
/* level 6 */
vmovdqa	(64*\off+  0)*2(%rdi),%ymm4
vmovdqa	(64*\off+128)*2(%rdi),%ymm8
vmovdqa	(64*\off+ 16)*2(%rdi),%ymm5
vmovdqa	(64*\off+144)*2(%rdi),%ymm9
/* ymm2 = the 64 bits (four 16-bit words) at _ZETAS_EXP+0, in every qword */
vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2

vmovdqa	(64*\off+ 32)*2(%rdi),%ymm6
vmovdqa	(64*\off+160)*2(%rdi),%ymm10
vmovdqa	(64*\off+ 48)*2(%rdi),%ymm7
vmovdqa	(64*\off+176)*2(%rdi),%ymm11
/* ymm3 = the 64 bits at _ZETAS_EXP+4, in every qword */
vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3

/* twiddle registers are the macro defaults: zl = ymm2, zh = ymm3 */
butterfly	4,5,6,7,8,9,10,11

/*
 * Only ymm4 of the off == 0 pass (array positions 0..15) is passed to red16.
 * NOTE(review): presumably the only vector whose magnitude can exceed the
 * allowed bound at this point, and red16 presumably still expects _16XV in
 * ymm1 from intt_levels0t5 -- confirm both.
 */
.if \off == 0
red16	4
.endif

vmovdqa	%ymm4,(64*\off+  0)*2(%rdi)
vmovdqa	%ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa	%ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa	%ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa	%ymm8,(64*\off+128)*2(%rdi)
vmovdqa	%ymm9,(64*\off+144)*2(%rdi)
vmovdqa	%ymm10,(64*\off+160)*2(%rdi)
vmovdqa	%ymm11,(64*\off+176)*2(%rdi)
.endm
182
/*
 * invntt_avx: in-place inverse transform of 256 16-bit coefficients.
 *
 * In:    rdi = coefficient array; bytes 0..511 are read and written.
 *        rsi = constant table indexed by the _16XQ, _16XV, _16XFLO,
 *              _16XFHI, _REVIDXB, _REVIDXD and _ZETAS_EXP offsets
 *              (16-bit units, hence the *2 on every displacement).
 *        Both pointers must be 32-byte aligned (vmovdqa is used throughout).
 * Out:   nothing in registers; the result replaces the input array.
 * Clobb: ymm0-ymm15. No general-purpose register and no stack is used.
 *
 * NOTE(review): the register assignment matches a System V AMD64 call of the
 * form invntt_avx(coeffs, constants) -- confirm against the C prototype.
 * cdecl() is not defined in this file; presumably it comes from consts.h and
 * applies the platform's symbol decoration.
 */
.text
.global cdecl(invntt_avx)
cdecl(invntt_avx):
/* ymm0 = modulus vector; stays live across every macro below */
vmovdqa	_16XQ*2(%rsi),%ymm0

/* levels 0-5 on each 128-coefficient half */
intt_levels0t5	0
intt_levels0t5	1

/* level 6 combines the two halves, 64 coefficient pairs per expansion */
intt_level6	0
intt_level6	1
ret
194