/*
  This file is for Benes network related functions

  A Benes network applies a permutation to a sequence of bits using a
  fixed butterfly topology steered by per-stage control bits.  Every
  layer below performs branch-free conditional swaps: for a pair (a, b)
  and a condition mask c,

      d = (a ^ b) & c;  a ^= d;  b ^= d;

  swaps exactly the bit positions where c is set, with no data-dependent
  branches or memory accesses (constant time).
*/

#include "benes.h"

#include "params.h"
#include "transpose.h"
#include "util.h"

/* Middle layer: conditionally exchanges bits between the two 64-bit
 * halves of each vec128 pair.  unpack_low/unpack_high regroup the data
 * so the cross-half pairs sit in v0/v1, the masked swap is applied,
 * and the inverse unpack restores the original layout. */
static void layer_x(vec128 *data, vec128 *bits) {
    int i;
    vec128 v0, v1;
    vec128 d;

    for (i = 0; i < 64; i += 2) {
        v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]);
        v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]);

        d = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, v1);
        d = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(d, *bits++); /* mask selects the bits to swap */
        v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, d);
        v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v1, d);

        data[i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(v0, v1);
        data[i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(v0, v1);
    }
}

/* Layer with swap distance 1: masked conditional swap of each adjacent
 * pair (bs[x], bs[x+1]) over all 64 vectors, one condition vector per pair. */
static void layer_0(vec128 *bs, vec128 *cond) {
    int x;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 2) {
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, *cond++);
        bs[ x ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], diff);
        bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff);
    }
}

/* Layer with swap distance 2: masked conditional swaps of
 * (bs[x], bs[x+2]) and (bs[x+1], bs[x+3]); consumes 2 condition
 * vectors per group of 4. */
static void layer_1(vec128 *bs, vec128 *cond) {
    int x;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 4) {
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]);
        bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff);
        bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff);

        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]);
        bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff);
        bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff);

        cond += 2;
    }
}

/* Layer with swap distance 4: masked conditional swaps of
 * (bs[x+k], bs[x+k+4]) for k = 0..3; consumes 4 condition vectors
 * per group of 8. */
static void layer_2(vec128 *bs, vec128 *cond) {
    int x;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 8) {
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]);
        bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff);
        bs[ x + 4 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 4 ], diff);

        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]);
        bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff);
        bs[ x + 5 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 5 ], diff);

        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]);
        bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff);
        bs[ x + 6 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 6 ], diff);

        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]);
        diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]);
        bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff);
        bs[ x + 7 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 7 ], diff);

        cond += 4;
    }
}

/* Layer with swap distance 8: masked conditional swaps of
 * (bs[s+k], bs[s+k+8]) for k = 0..3, with the inner loop walking the
 * two 4-wide strips of each 16-element group; consumes 4 condition
 * vectors per inner iteration. */
static void layer_3(vec128 *bs, vec128 *cond) {
    int x, s;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 16) {
        for (s = x; s < x + 8; s += 4) {
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]);
            bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff);
            bs[ s + 8 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 8 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]);
            bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff);
            bs[ s + 9 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 9 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]);
            bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff);
            bs[ s + 10 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 10 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]);
            bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff);
            bs[ s + 11 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 11 ], diff);

            cond += 4;
        }
    }
}

/* Layer with swap distance 16: same pattern as layer_3 but pairing
 * (bs[s+k], bs[s+k+16]) within each 32-element group. */
static void layer_4(vec128 *bs, vec128 *cond) {
    int x, s;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 32) {
        for (s = x; s < x + 16; s += 4) {
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]);
            bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff);
            bs[ s + 16 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 16 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]);
            bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff);
            bs[ s + 17 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 17 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]);
            bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff);
            bs[ s + 18 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 18 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]);
            bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff);
            bs[ s + 19 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 19 ], diff);

            cond += 4;
        }
    }
}

/* Layer with swap distance 32: pairs (bs[s+k], bs[s+k+32]) across the
 * two halves of the full 64-vector array (the outer loop runs once). */
static void layer_5(vec128 *bs, vec128 *cond) {
    int x, s;
    vec128 diff;

    for (x = 0; x < (1 << 6); x += 64) {
        for (s = x; s < x + 32; s += 4) {
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]);
            bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff);
            bs[ s + 32 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 32 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]);
            bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff);
            bs[ s + 33 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 33 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]);
            bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff);
            bs[ s + 34 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 34 ], diff);

            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]);
            diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]);
            bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff);
            bs[ s + 35 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 35 ], diff);

            cond += 4;
        }
    }
}

/* input: bits, control bits as array of bytes */
/* output: bits_int, control bits as array of 128-bit vectors */
/* Repacks the serialized control bits into the 25 per-stage groups of
 * 32 vec128 words that PQCLEAN_MCELIECE6688128F_AVX_benes consumes.
 * Stages 0..5 and 19..24 are loaded two at a time as 64-bit halves,
 * bit-transposed, and interleaved via unpack_low/unpack_high; the
 * middle stages 6..18 are loaded directly as 16-byte vectors. */
void PQCLEAN_MCELIECE6688128F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) {
    int i, j;
    const unsigned char *ptr = bits;

    vec128 buf[64];

    for (i = 0; i <= 5; i += 2) {
        for (j = 0; j < 64; j++) {
            /* pair stage i with stage i+1 (512 bytes = 64 * 8 apart) */
            buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512));
            ptr += 8;
        }

        PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf );

        for (j = 0; j < 32; j++) {
            bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]);
            bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]);
        }

        ptr += 512; /* skip the second stage's bytes, already consumed via ptr + 512 */
    }

    for (i = 6; i <= 18; i++) {
        for (j = 0; j < 32; j++) {
            bits_int[i][j] = PQCLEAN_MCELIECE6688128F_AVX_load16(ptr);
            ptr += 16;
        }
    }

    for (i = 19; i < 25; i += 2) {
        for (j = 0; j < 64; j++) {
            buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512));
            ptr += 8;
        }

        PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf );

        for (j = 0; j < 32; j++) {
            bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]);
            bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]);
        }

        ptr += 512;
    }
}

/* input: r, sequence of bits to be permuted */
/*        b, control bits as array of 128-bit vectors */
/*        rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
/* Applies the full Benes network: 25 conditional-swap stages consumed
 * from b[0]..b[24] (forward) or b[24]..b[0] (inverse, via negative
 * stride), with bit-matrix transposes between the three phases so each
 * layer_* routine can work on its natural data layout. */
void PQCLEAN_MCELIECE6688128F_AVX_benes(vec128 *r, vec128 b[][32], int rev) {
    int inc;

    vec128 *b_ptr;

    if (rev == 0) {
        inc = 32;          /* walk stages forward */
        b_ptr = b[ 0];
    } else {
        inc = -32;         /* walk stages backward: inverse permutation */
        b_ptr = b[24];
    }

    //

    PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r );

    /* phase 1: stages at distances 1,2,4,8,16,32 in transposed layout */
    layer_0(r, b_ptr);
    b_ptr += inc;
    layer_1(r, b_ptr);
    b_ptr += inc;
    layer_2(r, b_ptr);
    b_ptr += inc;
    layer_3(r, b_ptr);
    b_ptr += inc;
    layer_4(r, b_ptr);
    b_ptr += inc;
    layer_5(r, b_ptr);
    b_ptr += inc;

    PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r );

    /* phase 2: middle stages, symmetric up-then-down schedule */
    layer_x(r, b_ptr);
    b_ptr += inc;
    layer_0(r, b_ptr);
    b_ptr += inc;
    layer_1(r, b_ptr);
    b_ptr += inc;
    layer_2(r, b_ptr);
    b_ptr += inc;
    layer_3(r, b_ptr);
    b_ptr += inc;
    layer_4(r, b_ptr);
    b_ptr += inc;
    layer_5(r, b_ptr);
    b_ptr += inc;
    layer_4(r, b_ptr);
    b_ptr += inc;
    layer_3(r, b_ptr);
    b_ptr += inc;
    layer_2(r, b_ptr);
    b_ptr += inc;
    layer_1(r, b_ptr);
    b_ptr += inc;
    layer_0(r, b_ptr);
    b_ptr += inc;
    layer_x(r, b_ptr);
    b_ptr += inc;

    PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r );

    /* phase 3: mirror of phase 1 */
    layer_5(r, b_ptr);
    b_ptr += inc;
    layer_4(r, b_ptr);
    b_ptr += inc;
    layer_3(r, b_ptr);
    b_ptr += inc;
    layer_2(r, b_ptr);
    b_ptr += inc;
    layer_1(r, b_ptr);
    b_ptr += inc;
    layer_0(r, b_ptr);
    //b_ptr += inc;

    PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r );
}