/*
  This file is for the Gao-Mateer FFT
  see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
*/

#include "fft.h"

#include "transpose.h"
#include "vec256.h"

#include <stdint.h>

/* input: in, polynomial in bitsliced form */
/* output: in, result of applying the radix conversions on in */
/* Twists the bitsliced polynomial through 6 rounds of radix conversion,     */
/* interleaving each round (except the last) with a scaling multiplication   */
/* by the precomputed constants in scalars_2x.inc.                           */
static void radix_conversions(vec128 *in) {
    int i, j, k;
    vec128 t;
    uint64_t v0, v1;

    /* Bit-group masks for the butterfly-style shuffles inside one round:    */
    /* mask[k][0]/mask[k][1] select the high sub-blocks of width 2^k bits    */
    /* that get folded down by the (1 << k)-bit shifts below.                */
    const vec128 mask[5][2] = {
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
        }
    };

    /* Per-round scaling constants, one GFBITS-sliced vector set per round.  */
    const vec128 s[5][GFBITS] = {
#include "scalars_2x.inc"
    };

    //

    /* Six rounds (j = 0..5); the last round does only the 64-bit fold and   */
    /* the (empty, since k starts at 4 < 5) shuffle, with no scaling.        */
    for (j = 0; j <= 5; j++) {
        /* Fold the upper 64-bit lane into the lower one: the high 32 bits   */
        /* of lane 1 are XORed down within lane 1, then lane 1 is XORed      */
        /* (shifted) into the top half of lane 0.                            */
        for (i = 0; i < GFBITS; i++) {
            v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 1);
            v1 ^= v1 >> 32;
            v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 0);
            v0 ^= v1 << 32;
            in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1);
        }

        /* Shuffle step: rounds handle progressively fewer mask levels       */
        /* (k runs from 4 down to j), so later rounds touch only the         */
        /* coarser bit groups.                                               */
        for (i = 0; i < GFBITS; i++) {
            for (k = 4; k >= j; k--) {
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][0]);
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t);

                t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][1]);
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t);
            }
        }

        if (j < 5) {
            PQCLEAN_MCELIECE6688128_AVX_vec128_mul(in, in, s[j]); // scaling
        }
    }
}

/* input: in, result of applying the radix conversions to the input polynomial */
/* output: out, evaluation results (by applying the FFT butterflies) */
/* Expands the radix-converted polynomial into evaluations at all field      */
/* points: a broadcast/pre-multiplication stage, a 64x256 bit-transpose, a   */
/* ladder of masked-add butterflies driven by consts.inc, and a final XOR    */
/* of the x^128 contribution from powers.inc.                                */
static void butterflies(vec256 out[][ GFBITS ], vec128 *in) {
    int i, j, k, s, b;

    vec128 tmp[ GFBITS ];
    vec256 tmp0[ GFBITS ];
    vec256 tmp1[ GFBITS ];
    vec128 t[ GFBITS ];

    /* Same storage viewed as 128-bit halves (v) or whole 256-bit lanes (V); */
    /* used to build 256-bit values from pairwise 128-bit writes.            */
    union {
        vec128 v[8][ GFBITS + 1 ];
        vec256 V[8][ (GFBITS + 1) / 2 ];
    } pre;

    union {
        vec128 v[64][ 2 ];
        vec256 V[64];
    } buf;

    uint64_t v0, v1;

    /* Butterfly constants; consts_ptr walks through them level by level.    */
    const vec256 consts[ 33 ][ GFBITS ] = {
#include "consts.inc"
    };

    /* Precomputed evaluations contributed by the x^128 term.                */
    const vec256 powers[ 32 ][ GFBITS ] = {
#include "powers.inc"
    };

    uint64_t consts_ptr = 2; // consts[0..1] are consumed implicitly/above

    /* Bit-reversal permutation of 6-bit indices, applied when reading buf   */
    /* back out after the transpose.                                         */
    const unsigned char reversal[64] = {
        0, 32, 16, 48,  8, 40, 24, 56,
        4, 36, 20, 52, 12, 44, 28, 60,
        2, 34, 18, 50, 10, 42, 26, 58,
        6, 38, 22, 54, 14, 46, 30, 62,
        1, 33, 17, 49,  9, 41, 25, 57,
        5, 37, 21, 53, 13, 45, 29, 61,
        3, 35, 19, 51, 11, 43, 27, 59,
        7, 39, 23, 55, 15, 47, 31, 63
    };

    /* Field elements (integer form) used to build the broadcast masks;      */
    /* presumably the twisting basis from the Gao-Mateer construction —      */
    /* values come from the reference implementation. Last entry is 0.       */
    const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0};

    // broadcast

    for (j = 0; j < GFBITS; j++) {
        t[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(in[j], in[j]);
    }

    /* For each pair of beta values, build all-ones/all-zeros bit masks from */
    /* their bits (v = -bit), multiply into t, and store the two 128-bit     */
    /* products splatted into pre.v[i] and pre.v[i+1].                       */
    for (i = 0; i < 8; i += 2) {
        for (j = 0; j < GFBITS; j++) {
            v0 = (beta[i + 0] >> j) & 1;
            v0 = -v0; // 0 -> 0x0, 1 -> all-ones (constant-time mask)
            v1 = (beta[i + 1] >> j) & 1;
            v1 = -v1;

            tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1);
        }

        PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, t, tmp);

        for (j = 0; j < GFBITS; j++) {
            pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(tmp[j], tmp[j]);
            pre.v[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(tmp[j], tmp[j]);
        }
    }

    /* Process bit-slices two at a time (one 256-bit lane holds a pair).     */
    /* GFBITS is odd here, so the last iteration (i == GFBITS - 1) skips the */
    /* second slice: its buf.v[0][1] half is stale but never read back.      */
    for (i = 0; i < GFBITS; i += 2) {
        if (i != GFBITS - 1) {
            buf.v[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]);
        }
        buf.v[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]);

        /* Gray-code style expansion: each buf.V[x] is derived from a        */
        /* previously computed entry by XORing exactly one pre.V term, so    */
        /* all 64 subset-sums of the pre-products are formed with 63 XORs.   */
        /* The statement order is the dependency order — do not reorder.     */
        buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]);
        buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]);
        buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]);
        buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]);
        buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]);
        buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]);
        buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]);
        buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]);
        buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]);
        buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]);
        buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]);
        buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]);
        buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]);
        buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]);
        buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]);
        buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]);
        buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]);
        buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]);
        buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]);
        buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]);
        buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]);
        buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]);
        buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]);
        buf.V[62] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]);
        buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]);
        buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]);
        buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]);
        buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]);
        buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]);
        buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]);
        buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]);
        buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]);
        buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]);
        buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]);
        buf.V[41] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]);
        buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]);
        buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]);
        buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]);
        buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]);
        buf.V[26] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]);
        buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]);
        buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]);
        buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]);
        buf.V[31] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]);
        buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]);
        buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]);
        buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]);
        buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]);
        buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]);
        buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]);
        buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]);
        buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]);
        buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]);
        buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]);
        buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]);
        buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]);
        buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]);
        buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]);
        buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]);
        buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]);
        buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]);
        buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]);
        buf.V[32] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]);


        // transpose

        PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V);

        /* Read back in bit-reversed order; the odd-GFBITS guard mirrors the */
        /* one at the top of this loop.                                      */
        for (j = 0; j < 32; j++) {
            if (i != GFBITS - 1) {
                out[j][i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
            }
            out[j][i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
        }
    }

    // butterflies

    /* First butterfly level operates inside adjacent out-row pairs via an   */
    /* unpack / masked-multiply-add / repack sequence using consts[1].       */
    for (k = 0; k < 32; k += 2) {
        for (b = 0; b < GFBITS; b++) {
            tmp0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]);
        }
        for (b = 0; b < GFBITS; b++) {
            tmp1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]);
        }

        PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]);

        for (b = 0; b < GFBITS; b++) {
            out[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]);
        }
        for (b = 0; b < GFBITS; b++) {
            out[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]);
        }
    }

    /* Remaining levels: stride s doubles each level (1,2,4,8,16); each      */
    /* level consumes 2^i constant sets starting at consts_ptr.              */
    for (i = 0; i <= 4; i++) {
        s = 1 << i;

        for (j = 0; j < 32; j += 2 * s) {
            for (k = j; k < j + s; k++) {
                PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]);
            }
        }

        consts_ptr += (1 << i);
    }

    // adding the part contributed by x^128

    for (i = 0; i < 32; i++) {
        for (b = 0; b < GFBITS; b++) {
            out[i][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(out[i][b], powers[i][b]);
        }
    }
}

/* input: in, polynomial in bitsliced form */
/* output: out, bitsliced results of evaluating in all the field elements */
void PQCLEAN_MCELIECE6688128_AVX_fft(vec256 out[][GFBITS], vec128 *in) {
    radix_conversions(in); // note: modifies `in` in place
    butterflies(out, in);
}