1 /* 2 Plain C implementation of the Haraka256 and Haraka512 permutations. 3 */ 4 #include <immintrin.h> 5 #include <stdint.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include "haraka.h" 11 12 #define HARAKAS_RATE 32 13 14 #define u64 uint64_t 15 #define u128 __m128i 16 17 #define LOAD(src) _mm_loadu_si128((u128 *)(src)) 18 #define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src) 19 20 #define XOR128(a, b) _mm_xor_si128(a, b) 21 22 #define AES2(s0, s1, rci) \ 23 (s0) = _mm_aesenc_si128(s0, *(rci)); \ 24 (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \ 25 (s0) = _mm_aesenc_si128(s0, *((rci) + 2)); \ 26 (s1) = _mm_aesenc_si128(s1, *((rci) + 3)); 27 28 #define AES2_4x(s0, s1, s2, s3, rci) \ 29 AES2((s0)[0], (s0)[1], rci); \ 30 AES2((s1)[0], (s1)[1], rci); \ 31 AES2((s2)[0], (s2)[1], rci); \ 32 AES2((s3)[0], (s3)[1], rci); 33 34 #define AES4(s0, s1, s2, s3, rci) \ 35 (s0) = _mm_aesenc_si128(s0, *(rci)); \ 36 (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \ 37 (s2) = _mm_aesenc_si128(s2, *((rci) + 2)); \ 38 (s3) = _mm_aesenc_si128(s3, *((rci) + 3)); \ 39 (s0) = _mm_aesenc_si128(s0, *((rci) + 4)); \ 40 (s1) = _mm_aesenc_si128(s1, *((rci) + 5)); \ 41 (s2) = _mm_aesenc_si128(s2, *((rci) + 6)); \ 42 (s3) = _mm_aesenc_si128(s3, *((rci) + 7)); 43 44 #define AES4_4x(s0, s1, s2, s3, rci) \ 45 AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \ 46 AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \ 47 AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \ 48 AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci); 49 50 #define MIX2(s0, s1) \ 51 tmp = _mm_unpacklo_epi32(s0, s1); \ 52 (s1) = _mm_unpackhi_epi32(s0, s1); \ 53 (s0) = tmp; 54 55 #define MIX4(s0, s1, s2, s3) \ 56 tmp = _mm_unpacklo_epi32(s0, s1); \ 57 (s0) = _mm_unpackhi_epi32(s0, s1); \ 58 (s1) = _mm_unpacklo_epi32(s2, s3); \ 59 (s2) = _mm_unpackhi_epi32(s2, s3); \ 60 (s3) = _mm_unpacklo_epi32(s0, s2); \ 61 (s0) = _mm_unpackhi_epi32(s0, s2); \ 62 (s2) = _mm_unpackhi_epi32(s1, tmp); \ 63 (s1) = _mm_unpacklo_epi32(s1, tmp); 64 65 #define TRUNCSTORE(out, s0, s1, s2, s3) \ 66 _mm_storeu_si128((u128 *)(out), \ 67 _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s0), _mm_castsi128_pd(s1), 3))); \ 68 _mm_storeu_si128((u128 *)((out) + 16), \ 69 _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s2), _mm_castsi128_pd(s3), 0))); 70 71 static void load_haraka_constants(u128 rc[40]) { 72 rc[ 0] = _mm_set_epi32((int)0x0684704c, (int)0xe620c00a, (int)0xb2c5fef0, (int)0x75817b9d); 73 rc[ 1] = _mm_set_epi32((int)0x8b66b4e1, (int)0x88f3a06b, (int)0x640f6ba4, (int)0x2f08f717); 74 rc[ 2] = _mm_set_epi32((int)0x3402de2d, (int)0x53f28498, (int)0xcf029d60, (int)0x9f029114); 75 rc[ 3] = _mm_set_epi32((int)0x0ed6eae6, (int)0x2e7b4f08, (int)0xbbf3bcaf, (int)0xfd5b4f79); 76 rc[ 4] = _mm_set_epi32((int)0xcbcfb0cb, (int)0x4872448b, (int)0x79eecd1c, (int)0xbe397044); 77 rc[ 5] = _mm_set_epi32((int)0x7eeacdee, (int)0x6e9032b7, (int)0x8d5335ed, (int)0x2b8a057b); 78 rc[ 6] = _mm_set_epi32((int)0x67c28f43, (int)0x5e2e7cd0, (int)0xe2412761, (int)0xda4fef1b); 79 rc[ 7] = _mm_set_epi32((int)0x2924d9b0, (int)0xafcacc07, (int)0x675ffde2, (int)0x1fc70b3b); 80 rc[ 8] = _mm_set_epi32((int)0xab4d63f1, (int)0xe6867fe9, (int)0xecdb8fca, (int)0xb9d465ee); 81 rc[ 9] = _mm_set_epi32((int)0x1c30bf84, (int)0xd4b7cd64, (int)0x5b2a404f, (int)0xad037e33); 82 rc[10] = _mm_set_epi32((int)0xb2cc0bb9, (int)0x941723bf, (int)0x69028b2e, (int)0x8df69800); 83 rc[11] = _mm_set_epi32((int)0xfa0478a6, (int)0xde6f5572, (int)0x4aaa9ec8, (int)0x5c9d2d8a); 84 rc[12] = _mm_set_epi32((int)0xdfb49f2b, (int)0x6b772a12, (int)0x0efa4f2e, (int)0x29129fd4); 85 rc[13] = _mm_set_epi32((int)0x1ea10344, (int)0xf449a236, (int)0x32d611ae, (int)0xbb6a12ee); 86 rc[14] = _mm_set_epi32((int)0xaf044988, (int)0x4b050084, (int)0x5f9600c9, (int)0x9ca8eca6); 87 rc[15] = _mm_set_epi32((int)0x21025ed8, (int)0x9d199c4f, (int)0x78a2c7e3, (int)0x27e593ec); 88 rc[16] = _mm_set_epi32((int)0xbf3aaaf8, (int)0xa759c9b7, (int)0xb9282ecd, (int)0x82d40173); 89 rc[17] = _mm_set_epi32((int)0x6260700d, (int)0x6186b017, (int)0x37f2efd9, (int)0x10307d6b); 90 rc[18] = _mm_set_epi32((int)0x5aca45c2, (int)0x21300443, (int)0x81c29153, (int)0xf6fc9ac6); 91 rc[19] = _mm_set_epi32((int)0x9223973c, (int)0x226b68bb, (int)0x2caf92e8, (int)0x36d1943a); 92 rc[20] = _mm_set_epi32((int)0xd3bf9238, (int)0x225886eb, (int)0x6cbab958, (int)0xe51071b4); 93 rc[21] = _mm_set_epi32((int)0xdb863ce5, (int)0xaef0c677, (int)0x933dfddd, (int)0x24e1128d); 94 rc[22] = _mm_set_epi32((int)0xbb606268, (int)0xffeba09c, (int)0x83e48de3, (int)0xcb2212b1); 95 rc[23] = _mm_set_epi32((int)0x734bd3dc, (int)0xe2e4d19c, (int)0x2db91a4e, (int)0xc72bf77d); 96 rc[24] = _mm_set_epi32((int)0x43bb47c3, (int)0x61301b43, (int)0x4b1415c4, (int)0x2cb3924e); 97 rc[25] = _mm_set_epi32((int)0xdba775a8, (int)0xe707eff6, (int)0x03b231dd, (int)0x16eb6899); 98 rc[26] = _mm_set_epi32((int)0x6df3614b, (int)0x3c755977, (int)0x8e5e2302, (int)0x7eca472c); 99 rc[27] = _mm_set_epi32((int)0xcda75a17, (int)0xd6de7d77, (int)0x6d1be5b9, (int)0xb88617f9); 100 rc[28] = _mm_set_epi32((int)0xec6b43f0, (int)0x6ba8e9aa, (int)0x9d6c069d, (int)0xa946ee5d); 101 rc[29] = _mm_set_epi32((int)0xcb1e6950, (int)0xf957332b, (int)0xa2531159, (int)0x3bf327c1); 102 rc[30] = _mm_set_epi32((int)0x2cee0c75, (int)0x00da619c, (int)0xe4ed0353, (int)0x600ed0d9); 103 rc[31] = _mm_set_epi32((int)0xf0b1a5a1, (int)0x96e90cab, (int)0x80bbbabc, (int)0x63a4a350); 104 rc[32] = _mm_set_epi32((int)0xae3db102, (int)0x5e962988, (int)0xab0dde30, (int)0x938dca39); 105 rc[33] = _mm_set_epi32((int)0x17bb8f38, (int)0xd554a40b, (int)0x8814f3a8, (int)0x2e75b442); 106 rc[34] = _mm_set_epi32((int)0x34bb8a5b, (int)0x5f427fd7, (int)0xaeb6b779, (int)0x360a16f6); 107 rc[35] = _mm_set_epi32((int)0x26f65241, (int)0xcbe55438, (int)0x43ce5918, (int)0xffbaafde); 108 rc[36] = _mm_set_epi32((int)0x4ce99a54, (int)0xb9f3026a, (int)0xa2ca9cf7, (int)0x839ec978); 109 rc[37] = _mm_set_epi32((int)0xae51a51a, (int)0x1bdff7be, (int)0x40c06e28, (int)0x22901235); 110 rc[38] = _mm_set_epi32((int)0xa0c1613c, (int)0xba7ed22b, (int)0xc173bc0f, (int)0x48a659cf); 111 rc[39] = _mm_set_epi32((int)0x756acc03, (int)0x02288288, (int)0x4ad6bdfd, (int)0xe9c59da1); 112 } 113 114 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_tweak_constants( 115 harakactx *state, 116 const unsigned char *pk_seed, const unsigned char *sk_seed, 117 unsigned long long seed_length) { 118 int i; 119 unsigned char buf[40 * 16]; 120 121 /* Use the standard constants to generate tweaked ones. */ 122 load_haraka_constants(state->rc); 123 124 /* Constants for sk.seed */ 125 if (sk_seed != NULL) { 126 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(buf, 40 * 16, sk_seed, seed_length, state); 127 /* Tweak constants with the pub_seed */ 128 for (i = 0; i < 40; i++) { 129 state->rc_sseed[i] = LOAD(buf + i * 16); 130 } 131 } 132 133 /* Constants for pk.seed */ 134 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(buf, 40 * 16, pk_seed, seed_length, state); 135 136 /* Tweak constants with the pub_seed */ 137 for (i = 0; i < 40; i++) { 138 state->rc[i] = LOAD(buf + i * 16); 139 } 140 } 141 142 static void haraka_S_absorb(unsigned char *s, 143 const unsigned char *m, unsigned long long mlen, 144 unsigned char p, 145 const harakactx *state) { 146 unsigned long long i; 147 unsigned char t[HARAKAS_RATE]; 148 149 while (mlen >= HARAKAS_RATE) { 150 // XOR block to state 151 STORE(s, XOR128(LOAD(s), LOAD(m))); 152 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m + 16))); 153 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s, s, state); 154 mlen -= HARAKAS_RATE; 155 m += HARAKAS_RATE; 156 } 157 158 for (i = 0; i < HARAKAS_RATE; ++i) { 159 t[i] = 0; 160 } 161 for (i = 0; i < mlen; ++i) { 162 t[i] = m[i]; 163 } 164 t[i] = p; 165 t[HARAKAS_RATE - 1] |= 128; 166 STORE(s, XOR128(LOAD(s), LOAD(t))); 167 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t + 16))); 168 } 169 170 static void haraka_S_absorb4x(unsigned char *s, 171 const unsigned char *m0, 172 const unsigned char *m1, 173 const unsigned char *m2, 174 const unsigned char *m3, 175 unsigned long long int mlen, 176 unsigned char p, 177 const harakactx *state) { 178 unsigned long long i; 179 unsigned char t0[HARAKAS_RATE]; 180 unsigned char t1[HARAKAS_RATE]; 181 unsigned char t2[HARAKAS_RATE]; 182 unsigned char t3[HARAKAS_RATE]; 183 184 while (mlen >= HARAKAS_RATE) { 185 // XOR block to state 186 STORE(s, XOR128(LOAD(s), LOAD(m0))); 187 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m0 + 16))); 188 STORE(s + 64, XOR128(LOAD(s + 64), LOAD(m1))); 189 STORE(s + 80, XOR128(LOAD(s + 80), LOAD(m1 + 16))); 190 STORE(s + 128, XOR128(LOAD(s + 128), LOAD(m2))); 191 STORE(s + 144, XOR128(LOAD(s + 144), LOAD(m2 + 16))); 192 STORE(s + 192, XOR128(LOAD(s + 192), LOAD(m3))); 193 STORE(s + 208, XOR128(LOAD(s + 208), LOAD(m3 + 16))); 194 195 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(s, s, state); 196 mlen -= HARAKAS_RATE; 197 m0 += HARAKAS_RATE; 198 m1 += HARAKAS_RATE; 199 m2 += HARAKAS_RATE; 200 m3 += HARAKAS_RATE; 201 } 202 203 for (i = 0; i < HARAKAS_RATE; ++i) { 204 t0[i] = 0; 205 t1[i] = 0; 206 t2[i] = 0; 207 t3[i] = 0; 208 } 209 for (i = 0; i < mlen; ++i) { 210 t0[i] = m0[i]; 211 t1[i] = m1[i]; 212 t2[i] = m2[i]; 213 t3[i] = m3[i]; 214 } 215 216 t0[i] = p; 217 t1[i] = p; 218 t2[i] = p; 219 t3[i] = p; 220 221 t0[HARAKAS_RATE - 1] |= 128; 222 t1[HARAKAS_RATE - 1] |= 128; 223 t2[HARAKAS_RATE - 1] |= 128; 224 t3[HARAKAS_RATE - 1] |= 128; 225 226 STORE(s, XOR128(LOAD(s), LOAD(t0))); 227 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t0 + 16))); 228 STORE(s + 64, XOR128(LOAD(s + 64), LOAD(t1))); 229 STORE(s + 80, XOR128(LOAD(s + 80), LOAD(t1 + 16))); 230 STORE(s + 128, XOR128(LOAD(s + 128), LOAD(t2))); 231 STORE(s + 144, XOR128(LOAD(s + 144), LOAD(t2 + 16))); 232 STORE(s + 192, XOR128(LOAD(s + 192), LOAD(t3))); 233 STORE(s + 208, XOR128(LOAD(s + 208), LOAD(t3 + 16))); 234 } 235 236 static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks, 237 unsigned char *s, unsigned int r, const harakactx *state) { 238 while (nblocks > 0) { 239 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s, s, state); 240 STORE(h, LOAD(s)); 241 STORE(h + 16, LOAD(s + 16)); 242 h += r; 243 nblocks--; 244 } 245 } 246 247 static void haraka_S_squeezeblocks4x(unsigned char *h0, 248 unsigned char *h1, 249 unsigned char *h2, 250 unsigned char *h3, 251 unsigned long long nblocks, 252 unsigned char *s, 253 unsigned int r, 254 const harakactx *state) { 255 while (nblocks > 0) { 256 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(s, s, state); 257 STORE(h0, LOAD(s)); 258 STORE(h0 + 16, LOAD(s + 16)); 259 STORE(h1, LOAD(s + 64)); 260 STORE(h1 + 16, LOAD(s + 80)); 261 STORE(h2, LOAD(s + 128)); 262 STORE(h2 + 16, LOAD(s + 144)); 263 STORE(h3, LOAD(s + 192)); 264 STORE(h3 + 16, LOAD(s + 208)); 265 h0 += r; 266 h1 += r; 267 h2 += r; 268 h3 += r; 269 nblocks--; 270 } 271 } 272 273 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_init(uint8_t *s_inc) { 274 size_t i; 275 276 for (i = 0; i < 64; i++) { 277 s_inc[i] = 0; 278 } 279 s_inc[64] = 0; 280 } 281 282 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_absorb(uint8_t *s_inc, const uint8_t *m, size_t mlen, const harakactx *state) { 283 size_t i; 284 285 /* Recall that s_inc[64] is the non-absorbed bytes xored into the state */ 286 while (mlen + s_inc[64] >= HARAKAS_RATE) { 287 for (i = 0; i < (size_t)(HARAKAS_RATE - s_inc[64]); i++) { 288 /* Take the i'th byte from message 289 xor with the s_inc[64] + i'th byte of the state */ 290 s_inc[s_inc[64] + i] ^= m[i]; 291 } 292 mlen -= (size_t)(HARAKAS_RATE - s_inc[64]); 293 m += HARAKAS_RATE - s_inc[64]; 294 s_inc[64] = 0; 295 296 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s_inc, s_inc, state); 297 } 298 299 for (i = 0; i < mlen; i++) { 300 s_inc[s_inc[64] + i] ^= m[i]; 301 } 302 s_inc[64] = (uint8_t)(s_inc[64] + mlen); 303 } 304 305 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_finalize(uint8_t *s_inc) { 306 /* After haraka_S_inc_absorb, we are guaranteed that s_inc[64] < HARAKAS_RATE, 307 so we can always use one more byte for p in the current state. */ 308 s_inc[s_inc[64]] ^= 0x1F; 309 s_inc[HARAKAS_RATE - 1] ^= 128; 310 s_inc[64] = 0; 311 } 312 313 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_squeeze(uint8_t *out, size_t outlen, uint8_t *s_inc, const harakactx *state) { 314 size_t i; 315 316 /* First consume any bytes we still have sitting around */ 317 for (i = 0; i < outlen && i < s_inc[64]; i++) { 318 /* There are s_inc[64] bytes left, so r - s_inc[64] is the first 319 available byte. We consume from there, i.e., up to r. */ 320 out[i] = (uint8_t)s_inc[(HARAKAS_RATE - s_inc[64] + (uint8_t)i)]; 321 } 322 out += i; 323 outlen -= i; 324 s_inc[64] = (uint8_t)(s_inc[64] - i); 325 326 /* Then squeeze the remaining necessary blocks */ 327 while (outlen > 0) { 328 PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s_inc, s_inc, state); 329 330 for (i = 0; i < outlen && i < HARAKAS_RATE; i++) { 331 out[i] = s_inc[i]; 332 } 333 out += i; 334 outlen -= i; 335 s_inc[64] = (uint8_t)(HARAKAS_RATE - i); 336 } 337 } 338 339 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(unsigned char *out, unsigned long long outlen, 340 const unsigned char *in, unsigned long long inlen, const harakactx *state) { 341 unsigned long long i; 342 unsigned char s[64]; 343 unsigned char d[32]; 344 345 for (i = 0; i < 64; i++) { 346 s[i] = 0; 347 } 348 haraka_S_absorb(s, in, inlen, 0x1F, state); 349 350 haraka_S_squeezeblocks(out, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state); 351 out += (outlen / HARAKAS_RATE) * HARAKAS_RATE; 352 353 if (outlen % HARAKAS_RATE) { 354 haraka_S_squeezeblocks(d, 1, s, HARAKAS_RATE, state); 355 for (i = 0; i < outlen % HARAKAS_RATE; i++) { 356 out[i] = d[i]; 357 } 358 } 359 } 360 361 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_Sx4(unsigned char *out0, 362 unsigned char *out1, 363 unsigned char *out2, 364 unsigned char *out3, 365 unsigned long long outlen, 366 const unsigned char *in0, 367 const unsigned char *in1, 368 const unsigned char *in2, 369 const unsigned char *in3, 370 unsigned long long inlen, 371 const harakactx *state) { 372 unsigned long long i; 373 unsigned char s[64 * 4]; 374 unsigned char d0[32]; 375 unsigned char d1[32]; 376 unsigned char d2[32]; 377 unsigned char d3[32]; 378 379 for (i = 0; i < 64 * 4; i++) { 380 s[i] = 0; 381 } 382 haraka_S_absorb4x(s, in0, in1, in2, in3, inlen, 0x1F, state); 383 384 haraka_S_squeezeblocks4x(out0, out1, out2, out3, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state); 385 out0 += (outlen / HARAKAS_RATE) * HARAKAS_RATE; 386 out1 += (outlen / HARAKAS_RATE) * HARAKAS_RATE; 387 out2 += (outlen / HARAKAS_RATE) * HARAKAS_RATE; 388 out3 += (outlen / HARAKAS_RATE) * HARAKAS_RATE; 389 390 if (outlen % HARAKAS_RATE) { 391 haraka_S_squeezeblocks4x(d0, d1, d2, d3, 1, s, HARAKAS_RATE, state); 392 for (i = 0; i < outlen % HARAKAS_RATE; i++) { 393 out0[i] = d0[i]; 394 out1[i] = d1[i]; 395 out2[i] = d2[i]; 396 out3[i] = d3[i]; 397 } 398 } 399 } 400 401 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(unsigned char *out, const unsigned char *in, const harakactx *state) { 402 u128 s[4], tmp; 403 404 s[0] = LOAD(in); 405 s[1] = LOAD(in + 16); 406 s[2] = LOAD(in + 32); 407 s[3] = LOAD(in + 48); 408 409 AES4(s[0], s[1], s[2], s[3], state->rc); 410 MIX4(s[0], s[1], s[2], s[3]); 411 412 AES4(s[0], s[1], s[2], s[3], state->rc + 8); 413 MIX4(s[0], s[1], s[2], s[3]); 414 415 AES4(s[0], s[1], s[2], s[3], state->rc + 16); 416 MIX4(s[0], s[1], s[2], s[3]); 417 418 AES4(s[0], s[1], s[2], s[3], state->rc + 24); 419 MIX4(s[0], s[1], s[2], s[3]); 420 421 AES4(s[0], s[1], s[2], s[3], state->rc + 32); 422 MIX4(s[0], s[1], s[2], s[3]); 423 424 STORE(out, s[0]); 425 STORE(out + 16, s[1]); 426 STORE(out + 32, s[2]); 427 STORE(out + 48, s[3]); 428 } 429 430 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(unsigned char *out, const unsigned char *in, const harakactx *state) { 431 u128 s[4][4], tmp; 432 433 s[0][0] = LOAD(in); 434 s[0][1] = LOAD(in + 16); 435 s[0][2] = LOAD(in + 32); 436 s[0][3] = LOAD(in + 48); 437 s[1][0] = LOAD(in + 64); 438 s[1][1] = LOAD(in + 80); 439 s[1][2] = LOAD(in + 96); 440 s[1][3] = LOAD(in + 112); 441 s[2][0] = LOAD(in + 128); 442 s[2][1] = LOAD(in + 144); 443 s[2][2] = LOAD(in + 160); 444 s[2][3] = LOAD(in + 176); 445 s[3][0] = LOAD(in + 192); 446 s[3][1] = LOAD(in + 208); 447 s[3][2] = LOAD(in + 224); 448 s[3][3] = LOAD(in + 240); 449 450 AES4_4x(s[0], s[1], s[2], s[3], state->rc); 451 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 452 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 453 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 454 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 455 456 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8); 457 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 458 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 459 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 460 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 461 462 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16); 463 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 464 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 465 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 466 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 467 468 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24); 469 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 470 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 471 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 472 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 473 474 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32); 475 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 476 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 477 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 478 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 479 480 STORE(out, s[0][0]); 481 STORE(out + 16, s[0][1]); 482 STORE(out + 32, s[0][2]); 483 STORE(out + 48, s[0][3]); 484 STORE(out + 64, s[1][0]); 485 STORE(out + 80, s[1][1]); 486 STORE(out + 96, s[1][2]); 487 STORE(out + 112, s[1][3]); 488 STORE(out + 128, s[2][0]); 489 STORE(out + 144, s[2][1]); 490 STORE(out + 160, s[2][2]); 491 STORE(out + 176, s[2][3]); 492 STORE(out + 192, s[3][0]); 493 STORE(out + 208, s[3][1]); 494 STORE(out + 224, s[3][2]); 495 STORE(out + 240, s[3][3]); 496 } 497 498 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512(unsigned char *out, const unsigned char *in, const harakactx *state) { 499 u128 s[4], tmp; 500 501 s[0] = LOAD(in); 502 s[1] = LOAD(in + 16); 503 s[2] = LOAD(in + 32); 504 s[3] = LOAD(in + 48); 505 506 AES4(s[0], s[1], s[2], s[3], state->rc); 507 MIX4(s[0], s[1], s[2], s[3]); 508 509 AES4(s[0], s[1], s[2], s[3], state->rc + 8); 510 MIX4(s[0], s[1], s[2], s[3]); 511 512 AES4(s[0], s[1], s[2], s[3], state->rc + 16); 513 MIX4(s[0], s[1], s[2], s[3]); 514 515 AES4(s[0], s[1], s[2], s[3], state->rc + 24); 516 MIX4(s[0], s[1], s[2], s[3]); 517 518 AES4(s[0], s[1], s[2], s[3], state->rc + 32); 519 MIX4(s[0], s[1], s[2], s[3]); 520 521 s[0] = XOR128(s[0], LOAD(in)); 522 s[1] = XOR128(s[1], LOAD(in + 16)); 523 s[2] = XOR128(s[2], LOAD(in + 32)); 524 s[3] = XOR128(s[3], LOAD(in + 48)); 525 526 // truncate and store result 527 TRUNCSTORE(out, s[0], s[1], s[2], s[3]); 528 } 529 530 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512x4(unsigned char *out, const unsigned char *in, const harakactx *state) { 531 u128 s[4][4], tmp; 532 533 s[0][0] = LOAD(in); 534 s[0][1] = LOAD(in + 16); 535 s[0][2] = LOAD(in + 32); 536 s[0][3] = LOAD(in + 48); 537 s[1][0] = LOAD(in + 64); 538 s[1][1] = LOAD(in + 80); 539 s[1][2] = LOAD(in + 96); 540 s[1][3] = LOAD(in + 112); 541 s[2][0] = LOAD(in + 128); 542 s[2][1] = LOAD(in + 144); 543 s[2][2] = LOAD(in + 160); 544 s[2][3] = LOAD(in + 176); 545 s[3][0] = LOAD(in + 192); 546 s[3][1] = LOAD(in + 208); 547 s[3][2] = LOAD(in + 224); 548 s[3][3] = LOAD(in + 240); 549 550 AES4_4x(s[0], s[1], s[2], s[3], state->rc); 551 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 552 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 553 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 554 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 555 556 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8); 557 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 558 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 559 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 560 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 561 562 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16); 563 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 564 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 565 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 566 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 567 568 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24); 569 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 570 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 571 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 572 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 573 574 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32); 575 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]); 576 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]); 577 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]); 578 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]); 579 580 s[0][0] = XOR128(s[0][0], LOAD(in)); 581 s[0][1] = XOR128(s[0][1], LOAD(in + 16)); 582 s[0][2] = XOR128(s[0][2], LOAD(in + 32)); 583 s[0][3] = XOR128(s[0][3], LOAD(in + 48)); 584 s[1][0] = XOR128(s[1][0], LOAD(in + 64)); 585 s[1][1] = XOR128(s[1][1], LOAD(in + 80)); 586 s[1][2] = XOR128(s[1][2], LOAD(in + 96)); 587 s[1][3] = XOR128(s[1][3], LOAD(in + 112)); 588 s[2][0] = XOR128(s[2][0], LOAD(in + 128)); 589 s[2][1] = XOR128(s[2][1], LOAD(in + 144)); 590 s[2][2] = XOR128(s[2][2], LOAD(in + 160)); 591 s[2][3] = XOR128(s[2][3], LOAD(in + 176)); 592 s[3][0] = XOR128(s[3][0], LOAD(in + 192)); 593 s[3][1] = XOR128(s[3][1], LOAD(in + 208)); 594 s[3][2] = XOR128(s[3][2], LOAD(in + 224)); 595 s[3][3] = XOR128(s[3][3], LOAD(in + 240)); 596 597 TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]); 598 TRUNCSTORE((out + 32), s[1][0], s[1][1], s[1][2], s[1][3]); 599 TRUNCSTORE((out + 64), s[2][0], s[2][1], s[2][2], s[2][3]); 600 TRUNCSTORE((out + 96), s[3][0], s[3][1], s[3][2], s[3][3]); 601 } 602 603 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256(unsigned char *out, const unsigned char *in, const harakactx *state) { 604 u128 s[2], tmp; 605 606 s[0] = LOAD(in); 607 s[1] = LOAD(in + 16); 608 609 AES2(s[0], s[1], state->rc); 610 MIX2(s[0], s[1]); 611 612 AES2(s[0], s[1], state->rc + 4); 613 MIX2(s[0], s[1]); 614 615 AES2(s[0], s[1], state->rc + 8); 616 MIX2(s[0], s[1]); 617 618 AES2(s[0], s[1], state->rc + 12); 619 MIX2(s[0], s[1]); 620 621 AES2(s[0], s[1], state->rc + 16); 622 MIX2(s[0], s[1]); 623 624 s[0] = XOR128(s[0], LOAD(in)); 625 s[1] = XOR128(s[1], LOAD(in + 16)); 626 627 STORE(out, s[0]); 628 STORE(out + 16, s[1]); 629 } 630 631 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256x4(unsigned char *out, const unsigned char *in, const harakactx *state) { 632 u128 s[4][2], tmp; 633 634 s[0][0] = LOAD(in); 635 s[0][1] = LOAD(in + 16); 636 s[1][0] = LOAD(in + 32); 637 s[1][1] = LOAD(in + 48); 638 s[2][0] = LOAD(in + 64); 639 s[2][1] = LOAD(in + 80); 640 s[3][0] = LOAD(in + 96); 641 s[3][1] = LOAD(in + 112); 642 643 // Round 1 644 AES2_4x(s[0], s[1], s[2], s[3], state->rc); 645 646 MIX2(s[0][0], s[0][1]); 647 MIX2(s[1][0], s[1][1]); 648 MIX2(s[2][0], s[2][1]); 649 MIX2(s[3][0], s[3][1]); 650 651 // Round 2 652 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 4); 653 654 MIX2(s[0][0], s[0][1]); 655 MIX2(s[1][0], s[1][1]); 656 MIX2(s[2][0], s[2][1]); 657 MIX2(s[3][0], s[3][1]); 658 659 // Round 3 660 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 8); 661 662 MIX2(s[0][0], s[0][1]); 663 MIX2(s[1][0], s[1][1]); 664 MIX2(s[2][0], s[2][1]); 665 MIX2(s[3][0], s[3][1]); 666 667 // Round 4 668 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 12); 669 670 MIX2(s[0][0], s[0][1]); 671 MIX2(s[1][0], s[1][1]); 672 MIX2(s[2][0], s[2][1]); 673 MIX2(s[3][0], s[3][1]); 674 675 // Round 5 676 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 16); 677 678 MIX2(s[0][0], s[0][1]); 679 MIX2(s[1][0], s[1][1]); 680 MIX2(s[2][0], s[2][1]); 681 MIX2(s[3][0], s[3][1]); 682 683 // Feed Forward 684 s[0][0] = _mm_xor_si128(s[0][0], LOAD(in)); 685 s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16)); 686 s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32)); 687 s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48)); 688 s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64)); 689 s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80)); 690 s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96)); 691 s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112)); 692 693 STORE(out, s[0][0]); 694 STORE(out + 16, s[0][1]); 695 STORE(out + 32, s[1][0]); 696 STORE(out + 48, s[1][1]); 697 STORE(out + 64, s[2][0]); 698 STORE(out + 80, s[2][1]); 699 STORE(out + 96, s[3][0]); 700 STORE(out + 112, s[3][1]); 701 } 702 703 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256_sk(unsigned char *out, const unsigned char *in, const harakactx *state) { 704 u128 s[2], tmp; 705 706 s[0] = LOAD(in); 707 s[1] = LOAD(in + 16); 708 709 AES2(s[0], s[1], state->rc_sseed); 710 MIX2(s[0], s[1]); 711 712 AES2(s[0], s[1], state->rc_sseed + 4); 713 MIX2(s[0], s[1]); 714 715 AES2(s[0], s[1], state->rc_sseed + 8); 716 MIX2(s[0], s[1]); 717 718 AES2(s[0], s[1], state->rc_sseed + 12); 719 MIX2(s[0], s[1]); 720 721 AES2(s[0], s[1], state->rc_sseed + 16); 722 MIX2(s[0], s[1]); 723 724 s[0] = XOR128(s[0], LOAD(in)); 725 s[1] = XOR128(s[1], LOAD(in + 16)); 726 727 STORE(out, s[0]); 728 STORE(out + 16, s[1]); 729 } 730 731 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256_skx4(unsigned char *out, const unsigned char *in, const harakactx *state) { 732 u128 s[4][2], tmp; 733 734 s[0][0] = LOAD(in); 735 s[0][1] = LOAD(in + 16); 736 s[1][0] = LOAD(in + 32); 737 s[1][1] = LOAD(in + 48); 738 s[2][0] = LOAD(in + 64); 739 s[2][1] = LOAD(in + 80); 740 s[3][0] = LOAD(in + 96); 741 s[3][1] = LOAD(in + 112); 742 743 // Round 1 744 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed); 745 746 MIX2(s[0][0], s[0][1]); 747 MIX2(s[1][0], s[1][1]); 748 MIX2(s[2][0], s[2][1]); 749 MIX2(s[3][0], s[3][1]); 750 751 // Round 2 752 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 4); 753 754 MIX2(s[0][0], s[0][1]); 755 MIX2(s[1][0], s[1][1]); 756 MIX2(s[2][0], s[2][1]); 757 MIX2(s[3][0], s[3][1]); 758 759 // Round 3 760 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 8); 761 762 MIX2(s[0][0], s[0][1]); 763 MIX2(s[1][0], s[1][1]); 764 MIX2(s[2][0], s[2][1]); 765 MIX2(s[3][0], s[3][1]); 766 767 // Round 4 768 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 12); 769 770 MIX2(s[0][0], s[0][1]); 771 MIX2(s[1][0], s[1][1]); 772 MIX2(s[2][0], s[2][1]); 773 MIX2(s[3][0], s[3][1]); 774 775 // Round 5 776 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 16); 777 778 MIX2(s[0][0], s[0][1]); 779 MIX2(s[1][0], s[1][1]); 780 MIX2(s[2][0], s[2][1]); 781 MIX2(s[3][0], s[3][1]); 782 783 // Feed Forward 784 s[0][0] = XOR128(s[0][0], LOAD(in)); 785 s[0][1] = XOR128(s[0][1], LOAD(in + 16)); 786 s[1][0] = XOR128(s[1][0], LOAD(in + 32)); 787 s[1][1] = XOR128(s[1][1], LOAD(in + 48)); 788 s[2][0] = XOR128(s[2][0], LOAD(in + 64)); 789 s[2][1] = XOR128(s[2][1], LOAD(in + 80)); 790 s[3][0] = XOR128(s[3][0], LOAD(in + 96)); 791 s[3][1] = XOR128(s[3][1], LOAD(in + 112)); 792 793 STORE(out, s[0][0]); 794 STORE(out + 16, s[0][1]); 795 STORE(out + 32, s[1][0]); 796 STORE(out + 48, s[1][1]); 797 STORE(out + 64, s[2][0]); 798 STORE(out + 80, s[2][1]); 799 STORE(out + 96, s[3][0]); 800 STORE(out + 112, s[3][1]); 801 } 802