1 /* 2 This file is for transpose of the Gao-Mateer FFT 3 Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c 4 */ 5 6 #include "fft_tr.h" 7 8 #include "transpose.h" 9 10 #include <stdint.h> 11 12 static void radix_conversions_tr(vec256 *in) { 13 int i, j, k; 14 vec256 t; 15 uint64_t v[4]; 16 17 const vec256 mask[6][2] = { 18 { 19 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), 20 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) 21 }, 22 { 23 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), 24 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) 25 }, 26 { 27 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), 28 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) 29 }, 30 { 31 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), 32 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) 33 }, 34 { 35 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), 36 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) 37 }, 38 { 39 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), 40 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) 41 } 42 }; 43 44 const vec256 s[6][GFBITS] = { 45 #include "scalars_4x.inc" 46 }; 47 48 // 49 50 for (j = 6; j >= 0; j--) { 51 if (j < 6) { 52 PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(in, in, s[j]); // scaling 53 } 54 55 for (k = j; k <= 4; k++) { 56 for (i = 0; i < GFBITS; i++) { 57 t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][0]); 58 t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); 59 in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); 60 61 t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][1]); 62 t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); 63 in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); 64 } 65 } operator ()boost::numeric::functional::outer_product66 67 if (j <= 5) { 68 for (i = 0; i < GFBITS; i++) { 69 v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); 70 v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); 71 v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); 72 v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); 73 74 v[1] ^= v[0] >> 32; 75 v[1] ^= v[1] << 32; 76 v[3] ^= v[2] >> 32; 77 v[3] ^= v[3] << 32; 78 79 in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); 80 } 81 } 82 83 for (i = 0; i < GFBITS; i++) { 84 v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); 85 v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); 86 v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); 87 v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); 88 89 v[2] ^= v[1]; 90 v[3] ^= v[2]; 91 92 in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); 93 } 94 95 } 96 } 97 98 static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { 99 int i, j, k, s, b; 100 101 vec256 t0[ GFBITS ]; 102 vec256 t1[ GFBITS ]; 103 vec256 t; 104 105 vec128 out128[ GFBITS ][ 2 ]; 106 vec128 tmp[ GFBITS ]; 107 108 union { 109 vec128 v[6][ GFBITS + 1 ]; 110 vec256 V[6][ (GFBITS + 1) / 2 ]; 111 } pre; 112 113 union { 114 vec128 v[64][ 2 ]; 115 vec256 V[64]; 116 } buf; 117 118 const vec256 consts[ 33 ][ GFBITS ] = { 119 #include "consts.inc" 120 }; 121 122 uint64_t v[4]; covariance_implboost::accumulators::impl::covariance_impl123 uint64_t consts_ptr = 33; 124 125 const unsigned char reversal[] = { 126 0, 32, 16, 48, 8, 40, 24, 56, 127 4, 36, 20, 52, 12, 44, 28, 60, 128 2, 34, 18, 50, 10, 42, 26, 58, 129 6, 38, 22, 54, 14, 46, 30, 62, 130 1, 33, 17, 49, 9, 41, 25, 57, 131 5, 37, 21, 53, 13, 45, 29, 61, 132 3, 35, 19, 51, 11, 43, 27, 59, 133 7, 39, 23, 55, 15, 47, 31, 63 operator ()boost::accumulators::impl::covariance_impl134 }; 135 136 const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; 137 138 // butterflies 139 140 for (i = 4; i >= 0; i--) { 141 s = 1 << i; 142 consts_ptr -= s; 143 144 for (j = 0; j < 32; j += 2 * s) { 145 for (k = j; k < j + s; k++) { 146 PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); 147 } 148 } 149 resultboost::accumulators::impl::covariance_impl150 } 151 152 for (k = 0; k < 32; k += 2) { 153 for (b = 0; b < GFBITS; b++) { 154 t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); 155 } 156 for (b = 0; b < GFBITS; b++) { serializeboost::accumulators::impl::covariance_impl157 t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); 158 } 159 160 PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[1]); 161 162 for (b = 0; b < GFBITS; b++) { 163 in[k][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(t0[b], t1[b]); 164 } 165 for (b = 0; b < GFBITS; b++) { 166 in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(t0[b], t1[b]); 167 } 168 169 for (b = 0; b < GFBITS; b++) { 170 t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); 171 } 172 for (b = 0; b < GFBITS; b++) { 173 t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); 174 } 175 176 PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[0]); 177 178 for (b = 0; b < GFBITS; b++) { 179 in[k + 0][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); 180 } 181 for (b = 0; b < GFBITS; b++) { 182 in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); 183 } 184 } 185 186 187 // boradcast 188 189 for (i = 0; i < GFBITS; i += 2) { 190 // transpose 191 192 for (k = 0; k < 32; k++) { 193 if (i != GFBITS - 1) { 194 buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 0); 195 buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 1); 196 } 197 198 buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 0); 199 buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 1); 200 } 201 202 PQCLEAN_MCELIECE6960119F_AVX_transpose_64x256_sp(buf.V); 203 204 // 205 206 pre.V[0][i / 2] = buf.V[32]; 207 buf.V[33] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[33], buf.V[32]); 208 pre.V[1][i / 2] = buf.V[33]; 209 buf.V[35] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[35], buf.V[33]); 210 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); 211 buf.V[34] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[34], buf.V[35]); 212 pre.V[2][i / 2] = buf.V[34]; 213 buf.V[38] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[38], buf.V[34]); 214 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); 215 buf.V[39] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[39], buf.V[38]); 216 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); 217 buf.V[37] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[37], buf.V[39]); 218 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); 219 buf.V[36] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[36], buf.V[37]); 220 pre.V[3][i / 2] = buf.V[36]; 221 buf.V[44] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[44], buf.V[36]); 222 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); 223 buf.V[45] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[45], buf.V[44]); 224 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); 225 buf.V[47] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[47], buf.V[45]); 226 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); 227 buf.V[46] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[46], buf.V[47]); 228 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); 229 buf.V[42] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[42], buf.V[46]); 230 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); 231 buf.V[43] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[43], buf.V[42]); 232 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); 233 buf.V[41] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[41], buf.V[43]); 234 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); 235 buf.V[40] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[40], buf.V[41]); 236 pre.V[4][i / 2] = buf.V[40]; 237 buf.V[56] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[56], buf.V[40]); 238 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); 239 buf.V[57] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[57], buf.V[56]); 240 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); 241 buf.V[59] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[59], buf.V[57]); 242 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); 243 buf.V[58] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[58], buf.V[59]); 244 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); 245 buf.V[62] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[62], buf.V[58]); 246 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); 247 buf.V[63] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[63], buf.V[62]); 248 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); 249 buf.V[61] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[61], buf.V[63]); 250 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); 251 buf.V[60] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[60], buf.V[61]); 252 pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); 253 buf.V[52] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[52], buf.V[60]); 254 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); 255 buf.V[53] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[53], buf.V[52]); 256 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); 257 buf.V[55] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[55], buf.V[53]); 258 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); 259 buf.V[54] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[54], buf.V[55]); 260 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); 261 buf.V[50] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[50], buf.V[54]); 262 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); 263 buf.V[51] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[51], buf.V[50]); 264 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); 265 buf.V[49] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[49], buf.V[51]); 266 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); 267 buf.V[48] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[48], buf.V[49]); 268 pre.V[5][i / 2] = buf.V[48]; 269 buf.V[16] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[16], buf.V[48]); 270 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); 271 buf.V[17] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[17], buf.V[16]); 272 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); 273 buf.V[19] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[19], buf.V[17]); 274 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); 275 buf.V[18] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[18], buf.V[19]); 276 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); 277 buf.V[22] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[22], buf.V[18]); 278 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); 279 buf.V[23] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[23], buf.V[22]); 280 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); 281 buf.V[21] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[21], buf.V[23]); 282 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); 283 buf.V[20] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[20], buf.V[21]); 284 pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); 285 buf.V[28] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[28], buf.V[20]); 286 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); 287 buf.V[29] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[29], buf.V[28]); 288 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); 289 buf.V[31] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[31], buf.V[29]); 290 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); 291 buf.V[30] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[30], buf.V[31]); 292 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); 293 buf.V[26] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[26], buf.V[30]); 294 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); 295 buf.V[27] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[27], buf.V[26]); 296 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); 297 buf.V[25] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[25], buf.V[27]); 298 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); 299 buf.V[24] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[24], buf.V[25]); 300 pre.V[4][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); 301 buf.V[8] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[8], buf.V[24]); 302 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); 303 buf.V[9] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[9], buf.V[8]); 304 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); 305 buf.V[11] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[11], buf.V[9]); 306 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); 307 buf.V[10] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[10], buf.V[11]); 308 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); 309 buf.V[14] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[14], buf.V[10]); 310 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); 311 buf.V[15] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[15], buf.V[14]); 312 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); 313 buf.V[13] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[13], buf.V[15]); 314 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); 315 buf.V[12] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[12], buf.V[13]); 316 pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); 317 buf.V[4] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[4], buf.V[12]); 318 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); 319 buf.V[5] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[5], buf.V[4]); 320 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); 321 buf.V[7] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[7], buf.V[5]); 322 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); 323 buf.V[6] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[6], buf.V[7]); 324 pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); 325 buf.V[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[2], buf.V[6]); 326 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); 327 buf.V[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[3], buf.V[2]); 328 pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); 329 buf.V[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[1], buf.V[3]); 330 331 pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); 332 t = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], buf.V[1]); 333 334 if (i != GFBITS - 1) { 335 out128[i + 1][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 1); 336 } 337 out128[i + 0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 0); 338 339 } 340 341 // 342 343 for (j = 0; j < GFBITS; j++) { 344 tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[0] >> j) & 1); 345 } 346 347 PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[0], tmp); 348 349 for (b = 0; b < GFBITS; b++) { 350 out128[b][1] = tmp[b]; 351 } 352 353 for (i = 1; i < 6; i++) { 354 for (j = 0; j < GFBITS; j++) { 355 tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[i] >> j) & 1); 356 } 357 358 PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[i], tmp); 359 360 for (b = 0; b < GFBITS; b++) { 361 out128[b][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(out128[b][1], tmp[b]); 362 } 363 } 364 365 for (b = 0; b < GFBITS; b++) { 366 v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 0); 367 v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 1); 368 v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 0); 369 v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 1); 370 371 out[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); 372 } 373 } 374 375 /* justifying the length of the output */ 376 static void postprocess(vec256 *out) { 377 int i; 378 uint64_t v[4]; 379 380 for (i = 0; i < 13; i++) { 381 v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 0); 382 v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 1); 383 v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 2); 384 v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 3); 385 386 v[3] <<= (128 - SYS_T) * 2; 387 v[3] >>= (128 - SYS_T) * 2; 388 389 out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); 390 } 391 } 392 393 394 void PQCLEAN_MCELIECE6960119F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { 395 butterflies_tr(out, in); 396 radix_conversions_tr(out); 397 398 postprocess(out); 399 } 400 401