#include "consts.h"
.include "shuffle.inc"
3
/*
 * mul rh0,rh1,rh2,rh3 [,zl0,zl1,zh0,zh1]
 *
 * Multiplies four vectors of 16-bit lanes by twiddle vectors and keeps both
 * halves of every signed 16x16-bit product:
 *
 *   low halves  (vpmullw):  rh0*zl0 -> ymm12    rh1*zl0 -> ymm13
 *                           rh2*zl1 -> ymm14    rh3*zl1 -> ymm15
 *   high halves (vpmulhw):  rh0*zh0 -> rh0      rh1*zh0 -> rh1
 *                           rh2*zh1 -> rh2      rh3*zh1 -> rh3
 *
 * Every argument is a ymm register number. zl0 and zl1 default to 15,
 * zh0 and zh1 default to 2.
 *
 * Writes ymm12-ymm15 and rh0-rh3; the rh registers must therefore not be
 * 12-15 and must not coincide with zh0/zh1.
 *
 * NOTE(review): zl presumably holds the zh twiddles premultiplied by the
 * inverse of the modulus mod 2^16 (Montgomery form) - confirm against the
 * code that builds the constants table.
 */
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
/*
 * Keep this order. ymm14/ymm15 can be the zl inputs (default 15,15; level 6
 * passes 14,15), so each of them is only written by a multiplication that
 * is issued after the last read of the twiddle it held.
 */
vpmullw		%ymm\zl0,%ymm\rh0,%ymm12
vpmullw		%ymm\zl0,%ymm\rh1,%ymm13

vpmullw		%ymm\zl1,%ymm\rh2,%ymm14
vpmullw		%ymm\zl1,%ymm\rh3,%ymm15

/* The inputs are overwritten only now, after their low products are taken. */
vpmulhw		%ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw		%ymm\zh0,%ymm\rh1,%ymm\rh1

vpmulhw		%ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw		%ymm\zh1,%ymm\rh3,%ymm\rh3
.endm
17
/*
 * reduce
 *
 * Replaces each of ymm12-ymm15 with the high 16 bits of its lane-wise signed
 * product with ymm0. Takes no arguments; reads ymm0, writes ymm12-ymm15.
 *
 * ymm0 is loaded in ntt_avx from the _16XQ entry of the constants table.
 * NOTE(review): presumably 16 copies of the modulus q, which would make the
 * result the correction term of a Montgomery reduction - confirm.
 */
.macro reduce
vpmulhw		%ymm0,%ymm12,%ymm12
vpmulhw		%ymm0,%ymm13,%ymm13

vpmulhw		%ymm0,%ymm14,%ymm14
vpmulhw		%ymm0,%ymm15,%ymm15
.endm
25
/*
 * update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
 *
 * Four add/subtract butterflies on 16-bit lanes, with the correction terms
 * in ymm12-ymm15 folded in. With c0..c3 = ymm12..ymm15:
 *
 *   sum_i  = rl_i + rh_i - c_i
 *   diff_i = rl_i - rh_i + c_i          (i = 0..3)
 *
 * Result placement (all arguments are ymm register numbers):
 *
 *   sum_0 -> rln    sum_1 -> rl0    sum_2 -> rl1    sum_3 -> rl2
 *   diff_i -> rh_i
 *
 * The sums are rotated down by one register, so rln must be a register
 * whose content is no longer needed. rl3 is only read and keeps its value.
 * ymm12-ymm15 are read, not modified.
 */
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
/*
 * Plain sums and differences first. Each rl_i is overwritten by the sum of
 * the following pair only after both of its own reads are done.
 */
vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rln
vpsubw		%ymm\rh0,%ymm\rl0,%ymm\rh0
vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl0

vpsubw		%ymm\rh1,%ymm\rl1,%ymm\rh1
vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl1
vpsubw		%ymm\rh2,%ymm\rl2,%ymm\rh2

vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl2
vpsubw		%ymm\rh3,%ymm\rl3,%ymm\rh3

/* Apply the corrections: subtracted from the sums, added to the differences. */
vpsubw		%ymm12,%ymm\rln,%ymm\rln
vpaddw		%ymm12,%ymm\rh0,%ymm\rh0
vpsubw		%ymm13,%ymm\rl0,%ymm\rl0

vpaddw		%ymm13,%ymm\rh1,%ymm\rh1
vpsubw		%ymm14,%ymm\rl1,%ymm\rl1
vpaddw		%ymm14,%ymm\rh2,%ymm\rh2

vpsubw		%ymm15,%ymm\rl2,%ymm\rl2
vpaddw		%ymm15,%ymm\rh3,%ymm\rh3
.endm
49
/*
 * level0 off
 *
 * One pass of butterflies between words that lie 128 positions apart.
 * Offsets are written in units of 16-bit words and scaled by 2 to bytes.
 * For off = 0 or 1 the pass covers 64 words starting at word 64*off (low
 * half) paired with the 64 words starting at word 64*off+128 (high half),
 * both relative to rdi.
 *
 * Inputs:  rdi = coefficient array, rsi = constants table, ymm0 as set up
 *          by ntt_avx.
 * Writes:  ymm2-ymm15 and the 2 x 128 bytes of the array named above.
 * vmovdqa is used throughout, so the array must be 32-byte aligned.
 */
.macro level0 off
/*
 * Each vpbroadcastq replicates one 64-bit group (four words) of the table
 * into every quadword lane: ymm15 feeds the low products, ymm2 the high
 * products in mul.
 */
vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm15
vmovdqa		(64*\off+128)*2(%rdi),%ymm8
vmovdqa		(64*\off+144)*2(%rdi),%ymm9
vmovdqa		(64*\off+160)*2(%rdi),%ymm10
vmovdqa		(64*\off+176)*2(%rdi),%ymm11
vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm2

mul		8,9,10,11

/* The low half is loaded between mul and reduce; neither macro touches ymm4-ymm7. */
vmovdqa		(64*\off+  0)*2(%rdi),%ymm4
vmovdqa		(64*\off+ 16)*2(%rdi),%ymm5
vmovdqa		(64*\off+ 32)*2(%rdi),%ymm6
vmovdqa		(64*\off+ 48)*2(%rdi),%ymm7

reduce
update		3,4,5,6,7,8,9,10,11

/* update leaves the sums in ymm3-ymm6 and the differences in ymm8-ymm11. */
vmovdqa		%ymm3,(64*\off+  0)*2(%rdi)
vmovdqa		%ymm4,(64*\off+ 16)*2(%rdi)
vmovdqa		%ymm5,(64*\off+ 32)*2(%rdi)
vmovdqa		%ymm6,(64*\off+ 48)*2(%rdi)
vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
.endm
77
/*
 * levels1t6 off
 *
 * Runs six further butterfly passes (levels 1 to 6) on the 128 words that
 * start at word 128*off of the array at rdi, keeping all of them in
 * registers between the initial loads and the final stores. Offsets are in
 * 16-bit words, scaled by 2 to bytes.
 *
 * Twiddles are read from the table at rsi, words _ZETAS_EXP+224*off+16
 * through _ZETAS_EXP+224*off+239: two 16-word vectors per level for levels
 * 1-5 (first into ymm15 for the low products, second into ymm2 for the
 * high products) and four vectors for level 6.
 *
 * Inputs:  rdi = coefficient array, rsi = constants table, ymm0 as set up
 *          by ntt_avx.
 * Writes:  ymm2-ymm15 directly, plus whatever the shuffleN macros write,
 *          and the 256 bytes of the array named above.
 * vmovdqa is used throughout, so array and table must be 32-byte aligned.
 *
 * NOTE(review): shuffle8/4/2/1 are defined in shuffle.inc, which is not
 * visible here. Judging by how the registers are used afterwards, their
 * first two arguments are inputs and the last two are outputs, regrouping
 * lanes so that the next level pairs whole registers again - confirm.
 */
.macro levels1t6 off
/* level 1: words 0-63 of the block against words 64-127 */
vmovdqa		(_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
vmovdqa		(128*\off+ 64)*2(%rdi),%ymm8
vmovdqa		(128*\off+ 80)*2(%rdi),%ymm9
vmovdqa		(128*\off+ 96)*2(%rdi),%ymm10
vmovdqa		(128*\off+112)*2(%rdi),%ymm11
vmovdqa		(_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2

mul		8,9,10,11

vmovdqa		(128*\off+  0)*2(%rdi),%ymm4
vmovdqa		(128*\off+ 16)*2(%rdi),%ymm5
vmovdqa		(128*\off+ 32)*2(%rdi),%ymm6
vmovdqa		(128*\off+ 48)*2(%rdi),%ymm7

reduce
update		3,4,5,6,7,8,9,10,11
/* sums now in ymm3-ymm6, differences in ymm8-ymm11 */

/* level 2 */
/* Registers to be multiplied are shuffled first; the other operands are */
/* shuffled after mul, just before update needs them. */
shuffle8	5,10,7,10
shuffle8	6,11,5,11

vmovdqa		(_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
vmovdqa		(_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2

mul		7,10,5,11

shuffle8	3,8,6,8
shuffle8	4,9,3,9

reduce
update		4,6,8,3,9,7,10,5,11
/* sums now in ymm4,6,8,3, differences in ymm7,10,5,11 */

/* level 3 */
shuffle4	8,5,9,5
shuffle4	3,11,8,11

vmovdqa		(_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
vmovdqa		(_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2

mul		9,5,8,11

shuffle4	4,7,3,7
shuffle4	6,10,4,10

reduce
update		6,3,7,4,10,9,5,8,11
/* sums now in ymm6,3,7,4, differences in ymm9,5,8,11 */

/* level 4 */
shuffle2	7,8,10,8
shuffle2	4,11,7,11

vmovdqa		(_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
vmovdqa		(_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2

mul		10,8,7,11

shuffle2	6,9,4,9
shuffle2	3,5,6,5

reduce
update		3,4,9,6,5,10,8,7,11
/* sums now in ymm3,4,9,6, differences in ymm10,8,7,11 */

/* level 5 */
shuffle1	9,7,5,7
shuffle1	6,11,9,11

vmovdqa		(_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
vmovdqa		(_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2

mul		5,7,9,11

shuffle1	3,10,6,10
shuffle1	4,8,3,8

reduce
update		4,6,10,3,8,5,7,9,11
/* sums now in ymm4,6,10,3, differences in ymm5,7,9,11; ymm8 is free again */

/* level 6 */
/* No shuffle here. Two twiddle pairs are used: ymm14 (low) with ymm8 (high) */
/* for the first two products, ymm15 (low) with ymm2 (high) for the last two. */
vmovdqa		(_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
vmovdqa		(_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
vmovdqa		(_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
vmovdqa		(_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2

mul		10,3,9,11,14,15,8,2

reduce
/* ymm8 was consumed as a twiddle by mul and is reused as the spare output. */
update		8,4,6,5,7,10,3,9,11
/* sums now in ymm8,4,6,5, differences in ymm10,3,9,11 */

/*
 * Store order follows the register placement left by level 6.
 * NOTE(review): the resulting word order in memory is determined by the
 * shuffles above and is presumably the permuted layout the rest of the
 * AVX2 code expects - confirm.
 */
vmovdqa		%ymm8,(128*\off+  0)*2(%rdi)
vmovdqa		%ymm4,(128*\off+ 16)*2(%rdi)
vmovdqa		%ymm10,(128*\off+ 32)*2(%rdi)
vmovdqa		%ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa		%ymm6,(128*\off+ 64)*2(%rdi)
vmovdqa		%ymm5,(128*\off+ 80)*2(%rdi)
vmovdqa		%ymm9,(128*\off+ 96)*2(%rdi)
vmovdqa		%ymm11,(128*\off+112)*2(%rdi)
.endm
177
/*
 * ntt_avx
 *
 * In-place transform of the 256 16-bit words at rdi: level 0 over both
 * 64-word slices, then levels 1-6 over both 128-word halves.
 *
 * In:     rdi = array of 256 16-bit words (512 bytes), 32-byte aligned
 *         rsi = constants table, 32-byte aligned, indexed in 16-bit words
 *               through _16XQ and _ZETAS_EXP
 *         (rdi/rsi are the first two integer argument registers of the
 *         System V AMD64 calling convention)
 * Out:    array at rdi overwritten; no return value is set
 * Writes: ymm0, ymm2-ymm15, plus whatever the shuffleN macros write
 * Stack:  none used
 *
 * NOTE(review): cdecl(), _16XQ and _ZETAS_EXP are not defined in this file;
 * presumably they come from consts.h - confirm. No vzeroupper is issued
 * before ret, so callers are presumably built with VEX encodings - confirm.
 */
.text
.global cdecl(ntt_avx)
cdecl(ntt_avx):
/* ymm0 stays live for the whole function; every reduce reads it. */
vmovdqa		_16XQ*2(%rsi),%ymm0

level0		0
level0		1

levels1t6	0
levels1t6	1

ret
190