/*
 * invntt_avx: in-place inverse number-theoretic transform on 256 16-bit
 * words, written for GNU as (AT&T syntax) and run through the C
 * preprocessor.
 *
 * Register usage visible in this file:
 *   rdi  base of the 256-word coefficient array (read and written with
 *        vmovdqa, so it has to be 32-byte aligned)
 *   rsi  base of the constant table; every displacement is a word index
 *        from consts.h multiplied by 2
 *   ymm0 loaded once from _16XQ and used by every butterfly
 *   ymm1-ymm15 scratch
 *
 * No general-purpose register is modified and no stack is used by the
 * instructions written here; the helper macros come from shuffle.inc and
 * fq.inc and are not visible in this file.
 * NOTE(review): presumably called from C as
 *   void invntt_avx(__m256i *r, const __m256i *qdata)
 * under the System V AMD64 ABI; confirm against the C prototype.
 */
#include "consts.h"
.include "shuffle.inc"
.include "fq.inc"

/*
 * butterfly: four 16-lane butterflies on the register pairs
 * (rl0,rh0) (rl1,rh1) (rl2,rh2) (rl3,rh3).
 *
 * For every pair, per 16-bit lane:
 *   t  = rh - rl                          (held in ymm12..ymm15)
 *   rl = rl + rh
 *   rh = mulhi(t, zh) - mulhi(mullo(t, zl), ymm0)
 *
 * Pairs 0 and 1 use the multipliers zl0/zh0, pairs 2 and 3 use zl1/zh1.
 * The defaults select ymm2 (zl) and ymm3 (zh) for all four pairs.
 * Clobbers ymm12-ymm15.  The additions are not reduced here.
 * NOTE(review): zl is presumably zeta*q^-1 mod 2^16 and zh zeta itself,
 * which would make rh a Montgomery product; confirm against consts.h.
 *
 * The instruction order below is kept exactly as in the original.
 */
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
        /* differences, sums and low products, interleaved */
        vpsubw          %ymm\rl0,%ymm\rh0,%ymm12
        vpaddw          %ymm\rh0,%ymm\rl0,%ymm\rl0
        vpsubw          %ymm\rl1,%ymm\rh1,%ymm13

        vpmullw         %ymm\zl0,%ymm12,%ymm\rh0
        vpaddw          %ymm\rh1,%ymm\rl1,%ymm\rl1
        vpsubw          %ymm\rl2,%ymm\rh2,%ymm14

        vpmullw         %ymm\zl0,%ymm13,%ymm\rh1
        vpaddw          %ymm\rh2,%ymm\rl2,%ymm\rl2
        vpsubw          %ymm\rl3,%ymm\rh3,%ymm15

        vpmullw         %ymm\zl1,%ymm14,%ymm\rh2
        vpaddw          %ymm\rh3,%ymm\rl3,%ymm\rl3
        vpmullw         %ymm\zl1,%ymm15,%ymm\rh3

        /* high halves of t * zh */
        vpmulhw         %ymm\zh0,%ymm12,%ymm12
        vpmulhw         %ymm\zh0,%ymm13,%ymm13

        vpmulhw         %ymm\zh1,%ymm14,%ymm14
        vpmulhw         %ymm\zh1,%ymm15,%ymm15

        /* high halves of (t * zl) * ymm0 */
        vpmulhw         %ymm0,%ymm\rh0,%ymm\rh0

        vpmulhw         %ymm0,%ymm\rh1,%ymm\rh1

        vpmulhw         %ymm0,%ymm\rh2,%ymm\rh2
        vpmulhw         %ymm0,%ymm\rh3,%ymm\rh3

        /* rh = mulhi(t, zh) - mulhi(t * zl, ymm0) */
        vpsubw          %ymm\rh0,%ymm12,%ymm\rh0

        vpsubw          %ymm\rh1,%ymm13,%ymm\rh1

        vpsubw          %ymm\rh2,%ymm14,%ymm\rh2
        vpsubw          %ymm\rh3,%ymm15,%ymm\rh3
.endm

/*
 * intt_levels0t5 off: levels 0 to 5 on one half of the array.
 *
 * off = 0 works on words 0..127, off = 1 on words 128..255 (eight ymm
 * registers of 16 words each, loaded into ymm4-ymm11).  The twiddle
 * constants are read from _ZETAS_EXP + (1-off)*224 + k, so off = 0 uses
 * the block starting 224 words into the table and off = 1 the block at
 * the start of the table.
 *
 * Expects ymm0 to be set up by the caller (see butterfly).  Leaves the
 * value loaded from _16XV in ymm1.
 */
.macro intt_levels0t5 off
        /* level 0 */
        vmovdqa         _16XFLO*2(%rsi),%ymm2
        vmovdqa         _16XFHI*2(%rsi),%ymm3

        vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
        vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
        vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
        vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7

        /*
         * NOTE(review): fqmulprecomp is defined in fq.inc; presumably it
         * multiplies the register named by the third argument by the
         * constant held as the _16XFLO/_16XFHI pair in ymm2/ymm3.
         */
        fqmulprecomp    2,3,4
        fqmulprecomp    2,3,6
        fqmulprecomp    2,3,5
        fqmulprecomp    2,3,7

        vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
        vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
        vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
        vmovdqa         (128*\off+112)*2(%rdi),%ymm11

        fqmulprecomp    2,3,8
        fqmulprecomp    2,3,10
        fqmulprecomp    2,3,9
        fqmulprecomp    2,3,11

        /*
         * Four twiddle vectors: vpermq 0x4E swaps the two 128-bit halves,
         * vpshufb then permutes bytes with the _REVIDXB mask.
         * NOTE(review): together this presumably reverses the word order
         * of each vector; confirm against the _REVIDXB definition.
         */
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
        vmovdqa         _REVIDXB*2(%rsi),%ymm12
        vpshufb         %ymm12,%ymm15,%ymm15
        vpshufb         %ymm12,%ymm1,%ymm1
        vpshufb         %ymm12,%ymm2,%ymm2
        vpshufb         %ymm12,%ymm3,%ymm3

        /* pairs (4,6) (5,7) use ymm15/ymm2, pairs (8,10) (9,11) use ymm1/ymm3 */
        butterfly       4,5,8,9,6,7,10,11,15,1,2,3

        /* level 1 */
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
        /* the mask is loaded again because butterfly overwrote ymm12 */
        vmovdqa         _REVIDXB*2(%rsi),%ymm1
        vpshufb         %ymm1,%ymm2,%ymm2
        vpshufb         %ymm1,%ymm3,%ymm3

        butterfly       4,5,6,7,8,9,10,11,2,2,3,3

        /* shuffle1/2/4/8 come from shuffle.inc */
        shuffle1        4,5,3,5
        shuffle1        6,7,4,7
        shuffle1        8,9,6,9
        shuffle1        10,11,8,11

        /* level 2 */
        /* twiddles are dword-permuted with the _REVIDXD index vector */
        vmovdqa         _REVIDXD*2(%rsi),%ymm12
        vpermd          (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
        vpermd          (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

        butterfly       3,4,6,8,5,7,9,11,2,2,10,10

        /*
         * NOTE(review): red16 is defined in fq.inc; the load into ymm1
         * right before it suggests it reads _16XV from ymm1.  ymm1 is not
         * written again in this file after this point.
         */
        vmovdqa         _16XV*2(%rsi),%ymm1
        red16           3

        shuffle2        3,4,10,4
        shuffle2        6,8,3,8
        shuffle2        5,7,6,7
        shuffle2        9,11,5,11

        /* level 3 */
        /* vpermq 0x1B reverses the order of the four quadwords */
        vpermq          $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
        vpermq          $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

        butterfly       10,3,6,5,4,8,7,11,2,2,9,9

        shuffle4        10,3,9,3
        shuffle4        6,5,10,5
        shuffle4        4,8,6,8
        shuffle4        7,11,4,11

        /* level 4 */
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
        vpermq          $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

        butterfly       9,10,6,4,3,5,8,11,2,2,7,7

        red16           9

        shuffle8        9,10,7,10
        shuffle8        6,4,9,4
        shuffle8        3,5,6,5
        shuffle8        8,11,3,11

        /* level 5 */
        /* twiddles are used as stored, without any permutation */
        vmovdqa         (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
        vmovdqa         (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

        butterfly       7,9,6,3,10,4,5,11,2,2,8,8

        /* write the eight result vectors back to the same 128 words */
        vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
        vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
        vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
        vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
        vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
        vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
        vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
        vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
.endm

/*
 * intt_level6 off: last level, combining the two halves of the array.
 *
 * Pairs the 64 words starting at 64*off with the 64 words starting at
 * 64*off+128, so off = 0 and off = 1 together cover all 256 words.
 * Both twiddle operands are 64-bit broadcasts from the start of
 * _ZETAS_EXP (word offsets 0 and 4); butterfly is used with its default
 * multiplier registers ymm2/ymm3.
 *
 * Expects ymm0 as for butterfly.  For off = 0 it also runs red16, which
 * relies on whatever intt_levels0t5 left in ymm1.
 */
.macro intt_level6 off
        /* level 6 */
        vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
        vmovdqa         (64*\off+128)*2(%rdi),%ymm8
        vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
        vmovdqa         (64*\off+144)*2(%rdi),%ymm9
        vpbroadcastq    (_ZETAS_EXP+0)*2(%rsi),%ymm2

        vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
        vmovdqa         (64*\off+160)*2(%rdi),%ymm10
        vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
        vmovdqa         (64*\off+176)*2(%rdi),%ymm11
        vpbroadcastq    (_ZETAS_EXP+4)*2(%rsi),%ymm3

        butterfly       4,5,6,7,8,9,10,11

        /* only the first vector of the first call gets the extra reduction */
.if \off == 0
        red16           4
.endif

        vmovdqa         %ymm4,(64*\off+  0)*2(%rdi)
        vmovdqa         %ymm5,(64*\off+ 16)*2(%rdi)
        vmovdqa         %ymm6,(64*\off+ 32)*2(%rdi)
        vmovdqa         %ymm7,(64*\off+ 48)*2(%rdi)
        vmovdqa         %ymm8,(64*\off+128)*2(%rdi)
        vmovdqa         %ymm9,(64*\off+144)*2(%rdi)
        vmovdqa         %ymm10,(64*\off+160)*2(%rdi)
        vmovdqa         %ymm11,(64*\off+176)*2(%rdi)
.endm

/*
 * Entry point.
 *   In:     rdi = coefficient array, rsi = constant table (see top of file)
 *   Out:    array at rdi transformed in place
 *   Clobb:  ymm0-ymm15
 * Returns without vzeroupper, so the upper ymm halves are left dirty.
 */
.text
.global cdecl(invntt_avx)
cdecl(invntt_avx):
        /* ymm0 stays live through every macro below */
        vmovdqa         _16XQ*2(%rsi),%ymm0

        /* levels 0-5 on each 128-word half */
        intt_levels0t5  0
        intt_levels0t5  1

        /* level 6 across the two halves */
        intt_level6     0
        intt_level6     1
        ret