/*
 * Forward transform over 256 16-bit coefficients, AVX2, GNU as (AT&T syntax),
 * run through the C preprocessor.
 *
 * Register roles used throughout this file:
 *   rdi         base of the coefficient array (read and written in place)
 *   rsi         base of the constants table (read only)
 *   ymm0        vector loaded from _16XQ at function entry
 *   ymm2        default "high" multiplier vector
 *   ymm15       default "low" multiplier vector
 *   ymm12-ymm15 scratch for the low products / reduction terms
 *   ymm3-ymm11  coefficient vectors
 *
 * All table and coefficient offsets are written as element indices times 2,
 * i.e. indices of 16-bit words.
 */
#include "consts.h"
.include "shuffle.inc"

/*
 * mul rh0,rh1,rh2,rh3[,zl0,zl1,zh0,zh1]
 *
 * Arguments are ymm register numbers. Defaults: zl0 = zl1 = 15, zh0 = zh1 = 2.
 *
 *   ymm12 = low16(rh0 * zl0)     ymm13 = low16(rh1 * zl0)
 *   ymm14 = low16(rh2 * zl1)     ymm15 = low16(rh3 * zl1)
 *   rh0   = high16(rh0 * zh0)    rh1   = high16(rh1 * zh0)
 *   rh2   = high16(rh2 * zh1)    rh3   = high16(rh3 * zh1)
 *
 * The low products are formed first because the high products overwrite
 * rh0..rh3 in place. ymm15 is the last low product written, which is why it
 * can also serve as the zl1 input.
 */
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
        vpmullw         %ymm\zl0,%ymm\rh0,%ymm12
        vpmullw         %ymm\zl0,%ymm\rh1,%ymm13

        vpmullw         %ymm\zl1,%ymm\rh2,%ymm14
        vpmullw         %ymm\zl1,%ymm\rh3,%ymm15

        vpmulhw         %ymm\zh0,%ymm\rh0,%ymm\rh0
        vpmulhw         %ymm\zh0,%ymm\rh1,%ymm\rh1

        vpmulhw         %ymm\zh1,%ymm\rh2,%ymm\rh2
        vpmulhw         %ymm\zh1,%ymm\rh3,%ymm\rh3
.endm

/*
 * reduce
 *
 * Replaces each of ymm12..ymm15 by the high 16 bits of its signed word-wise
 * product with ymm0. Expects ymm12..ymm15 as left by the mul macro.
 */
.macro reduce
        vpmulhw         %ymm0,%ymm12,%ymm12
        vpmulhw         %ymm0,%ymm13,%ymm13

        vpmulhw         %ymm0,%ymm14,%ymm14
        vpmulhw         %ymm0,%ymm15,%ymm15
.endm

/*
 * update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
 *
 * Four add/sub butterflies with the correction terms from ymm12..ymm15.
 * With the values on entry written as rl0..rl3 / rh0..rh3:
 *
 *   rln <- rl0 + rh0 - ymm12        rh0 <- rl0 - rh0 + ymm12
 *   rl0 <- rl1 + rh1 - ymm13        rh1 <- rl1 - rh1 + ymm13
 *   rl1 <- rl2 + rh2 - ymm14        rh2 <- rl2 - rh2 + ymm14
 *   rl2 <- rl3 + rh3 - ymm15        rh3 <- rl3 - rh3 + ymm15
 *
 * The sums rotate down by one register: they land in rln, rl0, rl1, rl2.
 * rl3 is only read, so after the macro it holds a stale input and is free
 * for reuse. rln must be a register that is dead on entry.
 */
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
        vpaddw          %ymm\rh0,%ymm\rl0,%ymm\rln
        vpsubw          %ymm\rh0,%ymm\rl0,%ymm\rh0
        vpaddw          %ymm\rh1,%ymm\rl1,%ymm\rl0

        vpsubw          %ymm\rh1,%ymm\rl1,%ymm\rh1
        vpaddw          %ymm\rh2,%ymm\rl2,%ymm\rl1
        vpsubw          %ymm\rh2,%ymm\rl2,%ymm\rh2

        vpaddw          %ymm\rh3,%ymm\rl3,%ymm\rl2
        vpsubw          %ymm\rh3,%ymm\rl3,%ymm\rh3

        vpsubw          %ymm12,%ymm\rln,%ymm\rln
        vpaddw          %ymm12,%ymm\rh0,%ymm\rh0
        vpsubw          %ymm13,%ymm\rl0,%ymm\rl0

        vpaddw          %ymm13,%ymm\rh1,%ymm\rh1
        vpsubw          %ymm14,%ymm\rl1,%ymm\rl1
        vpaddw          %ymm14,%ymm\rh2,%ymm\rh2

        vpsubw          %ymm15,%ymm\rl2,%ymm\rl2
        vpaddw          %ymm15,%ymm\rh3,%ymm\rh3
.endm

/*
 * level0 off
 *
 * First butterfly level on one 64-coefficient slice (off = 0 or 1 in this
 * file). Pairs coefficients 64*off + [0,64) with 64*off + [128,192), using a
 * single multiplier pair broadcast from the start of the _ZETAS_EXP table
 * (4 words at +0 into ymm15, 4 words at +4 into ymm2).
 * Results are written back to the same locations.
 */
.macro level0 off
        vpbroadcastq    (_ZETAS_EXP+0)*2(%rsi),%ymm15
        vmovdqa         (64*\off+128)*2(%rdi),%ymm8
        vmovdqa         (64*\off+144)*2(%rdi),%ymm9
        vmovdqa         (64*\off+160)*2(%rdi),%ymm10
        vmovdqa         (64*\off+176)*2(%rdi),%ymm11
        vpbroadcastq    (_ZETAS_EXP+4)*2(%rsi),%ymm2

        mul             8,9,10,11

        /* lower-half loads are placed between mul and reduce */
        vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
        vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
        vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
        vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7

        reduce
        update          3,4,5,6,7,8,9,10,11

        /* sums are in ymm3..ymm6, differences in ymm8..ymm11 */
        vmovdqa         %ymm3,(64*\off+  0)*2(%rdi)
        vmovdqa         %ymm4,(64*\off+ 16)*2(%rdi)
        vmovdqa         %ymm5,(64*\off+ 32)*2(%rdi)
        vmovdqa         %ymm6,(64*\off+ 48)*2(%rdi)
        vmovdqa         %ymm8,(64*\off+128)*2(%rdi)
        vmovdqa         %ymm9,(64*\off+144)*2(%rdi)
        vmovdqa         %ymm10,(64*\off+160)*2(%rdi)
        vmovdqa         %ymm11,(64*\off+176)*2(%rdi)
.endm

/*
 * levels1t6 off
 *
 * Levels 1 through 6 on one 128-coefficient half (off = 0 or 1 in this file),
 * kept entirely in registers between the initial loads and the final stores.
 * Multiplier vectors for each half start 224 words apart in _ZETAS_EXP.
 *
 * The shuffle8/4/2/1 macros come from shuffle.inc and take four register
 * numbers; their exact lane semantics are not visible here. Presumably they
 * rearrange lanes so that the butterfly partners of the next level sit in
 * separate registers -- confirm against shuffle.inc.
 */
.macro levels1t6 off
        /* level 1 */
        vmovdqa         (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
        vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
        vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
        vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
        vmovdqa         (128*\off+112)*2(%rdi),%ymm11
        vmovdqa         (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2

        mul             8,9,10,11

        vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
        vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
        vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
        vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7

        reduce
        update          3,4,5,6,7,8,9,10,11

        /* level 2 */
        shuffle8        5,10,7,10
        shuffle8        6,11,5,11

        vmovdqa         (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
        vmovdqa         (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2

        mul             7,10,5,11

        shuffle8        3,8,6,8
        shuffle8        4,9,3,9

        reduce
        update          4,6,8,3,9,7,10,5,11

        /* level 3 */
        shuffle4        8,5,9,5
        shuffle4        3,11,8,11

        vmovdqa         (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
        vmovdqa         (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2

        mul             9,5,8,11

        shuffle4        4,7,3,7
        shuffle4        6,10,4,10

        reduce
        update          6,3,7,4,10,9,5,8,11

        /* level 4 */
        shuffle2        7,8,10,8
        shuffle2        4,11,7,11

        vmovdqa         (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
        vmovdqa         (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2

        mul             10,8,7,11

        shuffle2        6,9,4,9
        shuffle2        3,5,6,5

        reduce
        update          3,4,9,6,5,10,8,7,11

        /* level 5 */
        shuffle1        9,7,5,7
        shuffle1        6,11,9,11

        vmovdqa         (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
        vmovdqa         (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2

        mul             5,7,9,11

        shuffle1        3,10,6,10
        shuffle1        4,8,3,8

        reduce
        update          4,6,10,3,8,5,7,9,11

        /*
         * level 6: no shuffle, and two multiplier pairs instead of one.
         * ymm14/ymm8 apply to the first two vectors, ymm15/ymm2 to the last
         * two. ymm8 was the read-only rl3 of the level 5 update, so it is
         * free to hold a multiplier here and the rln output afterwards.
         */
        vmovdqa         (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
        vmovdqa         (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
        vmovdqa         (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
        vmovdqa         (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2

        mul             10,3,9,11,14,15,8,2

        reduce
        update          8,4,6,5,7,10,3,9,11

        /* write the eight result vectors back in this register order */
        vmovdqa         %ymm8,(128*\off+  0)*2(%rdi)
        vmovdqa         %ymm4,(128*\off+ 16)*2(%rdi)
        vmovdqa         %ymm10,(128*\off+ 32)*2(%rdi)
        vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
        vmovdqa         %ymm6,(128*\off+ 64)*2(%rdi)
        vmovdqa         %ymm5,(128*\off+ 80)*2(%rdi)
        vmovdqa         %ymm9,(128*\off+ 96)*2(%rdi)
        vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
.endm

/*
 * ntt_avx
 *
 * In:    rdi = array of 256 16-bit coefficients, transformed in place
 *        rsi = constants table addressed via _16XQ and _ZETAS_EXP
 *              (both symbols come from consts.h)
 * Out:   nothing in registers; results are stored through rdi
 * Clobb: ymm0, ymm2-ymm15 (the shuffle macros may touch more -- not visible)
 *
 * Leaf routine: no stack use, no calls, general-purpose registers untouched.
 * Every memory access is vmovdqa, so the coefficient array must be 32-byte
 * aligned; the table accesses presumably are too -- confirm in consts.h.
 * Presumed C prototype: void ntt_avx(int16_t *r, const int16_t *table)
 * -- TODO confirm against the C header.
 *
 * NOTE(review): there is no vzeroupper before ret. If callers run legacy
 * SSE code afterwards this may cost an AVX/SSE transition penalty; confirm
 * whether that is intended.
 */
.text
.global cdecl(ntt_avx)
cdecl(ntt_avx):
        vmovdqa         _16XQ*2(%rsi),%ymm0

        level0          0
        level0          1

        levels1t6       0
        levels1t6       1

        ret