1 /*
AES-NI (x86 intrinsics) implementation of the Haraka256 and Haraka512 functions, the Haraka512 permutation, and the Haraka-S sponge built on it.
3 */
4 #include <immintrin.h>
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 
10 #include "haraka.h"
11 
/* Number of bytes absorbed into / squeezed out of the Haraka-S state per
   call to the Haraka512 permutation. */
#define HARAKAS_RATE 32

/* Short aliases for the 64-bit integer and 128-bit SSE vector types. */
#define u64 uint64_t
#define u128 __m128i

/* Unaligned 128-bit load from / store to a byte pointer. */
#define LOAD(src) _mm_loadu_si128((u128 *)(src))
#define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src)

/* Bitwise XOR of two 128-bit vectors. */
#define XOR128(a, b) _mm_xor_si128(a, b)

/* Two successive _mm_aesenc_si128 rounds on each of the two vectors s0, s1.
   Consumes four round constants, rci[0..3]; s0 uses rci[0] and rci[2],
   s1 uses rci[1] and rci[3]. Expands to several statements, so it must not
   be used as the unbraced body of an if/for/while. */
#define AES2(s0, s1, rci) \
    (s0) = _mm_aesenc_si128(s0, *(rci)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
    (s0) = _mm_aesenc_si128(s0, *((rci) + 2)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 3));

/* AES2 applied to four independent two-vector states, all with the same
   round constants. Each sN is an array of (at least) two u128. */
#define AES2_4x(s0, s1, s2, s3, rci) \
    AES2((s0)[0], (s0)[1], rci); \
    AES2((s1)[0], (s1)[1], rci); \
    AES2((s2)[0], (s2)[1], rci); \
    AES2((s3)[0], (s3)[1], rci);

/* Two successive _mm_aesenc_si128 rounds on each of the four vectors
   s0..s3. Consumes eight round constants, rci[0..7]: the first pass uses
   rci[0..3], the second rci[4..7]. Multi-statement macro. */
#define AES4(s0, s1, s2, s3, rci) \
    (s0) = _mm_aesenc_si128(s0, *(rci)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
    (s2) = _mm_aesenc_si128(s2, *((rci) + 2)); \
    (s3) = _mm_aesenc_si128(s3, *((rci) + 3)); \
    (s0) = _mm_aesenc_si128(s0, *((rci) + 4)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 5)); \
    (s2) = _mm_aesenc_si128(s2, *((rci) + 6)); \
    (s3) = _mm_aesenc_si128(s3, *((rci) + 7));

/* AES4 applied to four independent four-vector states, all with the same
   round constants. Each sN is an array of (at least) four u128. */
#define AES4_4x(s0, s1, s2, s3, rci) \
    AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \
    AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \
    AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \
    AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci);

/* Interleaves the 32-bit words of s0 and s1: s0 receives the interleaved
   low halves, s1 the interleaved high halves. Requires a u128 variable
   named `tmp` in the enclosing scope. Multi-statement macro. */
#define MIX2(s0, s1) \
    tmp = _mm_unpacklo_epi32(s0, s1); \
    (s1) = _mm_unpackhi_epi32(s0, s1); \
    (s0) = tmp;

/* Shuffles the 32-bit words of the four vectors s0..s3 through a fixed
   sequence of unpacklo/unpackhi steps. The statement order matters because
   later steps read values written by earlier ones. Requires a u128
   variable named `tmp` in the enclosing scope. Multi-statement macro. */
#define MIX4(s0, s1, s2, s3) \
    tmp  = _mm_unpacklo_epi32(s0, s1); \
    (s0) = _mm_unpackhi_epi32(s0, s1); \
    (s1) = _mm_unpacklo_epi32(s2, s3); \
    (s2) = _mm_unpackhi_epi32(s2, s3); \
    (s3) = _mm_unpacklo_epi32(s0, s2); \
    (s0) = _mm_unpackhi_epi32(s0, s2); \
    (s2) = _mm_unpackhi_epi32(s1, tmp); \
    (s1) = _mm_unpacklo_epi32(s1, tmp);

/* Writes a 32-byte truncation of the 64-byte state s0..s3 to `out`:
   bytes 0..15 are the upper 64 bits of s0 followed by the upper 64 bits of
   s1 (shuffle mask 3); bytes 16..31 are the lower 64 bits of s2 followed by
   the lower 64 bits of s3 (shuffle mask 0). Stores are unaligned. */
#define TRUNCSTORE(out, s0, s1, s2, s3) \
    _mm_storeu_si128((u128 *)(out), \
                     _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s0), _mm_castsi128_pd(s1), 3))); \
    _mm_storeu_si128((u128 *)((out) + 16), \
                     _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s2), _mm_castsi128_pd(s3), 0)));
70 
/*
 * Fill rc[0..39] with the 40 fixed (untweaked) 128-bit round constants.
 *
 * Each constant is assembled from four 32-bit words with _mm_set_epi32,
 * whose arguments run from the most-significant word to the
 * least-significant word. The (int) casts match the intrinsic's int
 * parameters, since several literals exceed INT_MAX.
 */
static void load_haraka_constants(u128 rc[40]) {
    rc[ 0] = _mm_set_epi32((int)0x0684704c, (int)0xe620c00a, (int)0xb2c5fef0, (int)0x75817b9d);
    rc[ 1] = _mm_set_epi32((int)0x8b66b4e1, (int)0x88f3a06b, (int)0x640f6ba4, (int)0x2f08f717);
    rc[ 2] = _mm_set_epi32((int)0x3402de2d, (int)0x53f28498, (int)0xcf029d60, (int)0x9f029114);
    rc[ 3] = _mm_set_epi32((int)0x0ed6eae6, (int)0x2e7b4f08, (int)0xbbf3bcaf, (int)0xfd5b4f79);
    rc[ 4] = _mm_set_epi32((int)0xcbcfb0cb, (int)0x4872448b, (int)0x79eecd1c, (int)0xbe397044);
    rc[ 5] = _mm_set_epi32((int)0x7eeacdee, (int)0x6e9032b7, (int)0x8d5335ed, (int)0x2b8a057b);
    rc[ 6] = _mm_set_epi32((int)0x67c28f43, (int)0x5e2e7cd0, (int)0xe2412761, (int)0xda4fef1b);
    rc[ 7] = _mm_set_epi32((int)0x2924d9b0, (int)0xafcacc07, (int)0x675ffde2, (int)0x1fc70b3b);
    rc[ 8] = _mm_set_epi32((int)0xab4d63f1, (int)0xe6867fe9, (int)0xecdb8fca, (int)0xb9d465ee);
    rc[ 9] = _mm_set_epi32((int)0x1c30bf84, (int)0xd4b7cd64, (int)0x5b2a404f, (int)0xad037e33);
    rc[10] = _mm_set_epi32((int)0xb2cc0bb9, (int)0x941723bf, (int)0x69028b2e, (int)0x8df69800);
    rc[11] = _mm_set_epi32((int)0xfa0478a6, (int)0xde6f5572, (int)0x4aaa9ec8, (int)0x5c9d2d8a);
    rc[12] = _mm_set_epi32((int)0xdfb49f2b, (int)0x6b772a12, (int)0x0efa4f2e, (int)0x29129fd4);
    rc[13] = _mm_set_epi32((int)0x1ea10344, (int)0xf449a236, (int)0x32d611ae, (int)0xbb6a12ee);
    rc[14] = _mm_set_epi32((int)0xaf044988, (int)0x4b050084, (int)0x5f9600c9, (int)0x9ca8eca6);
    rc[15] = _mm_set_epi32((int)0x21025ed8, (int)0x9d199c4f, (int)0x78a2c7e3, (int)0x27e593ec);
    rc[16] = _mm_set_epi32((int)0xbf3aaaf8, (int)0xa759c9b7, (int)0xb9282ecd, (int)0x82d40173);
    rc[17] = _mm_set_epi32((int)0x6260700d, (int)0x6186b017, (int)0x37f2efd9, (int)0x10307d6b);
    rc[18] = _mm_set_epi32((int)0x5aca45c2, (int)0x21300443, (int)0x81c29153, (int)0xf6fc9ac6);
    rc[19] = _mm_set_epi32((int)0x9223973c, (int)0x226b68bb, (int)0x2caf92e8, (int)0x36d1943a);
    rc[20] = _mm_set_epi32((int)0xd3bf9238, (int)0x225886eb, (int)0x6cbab958, (int)0xe51071b4);
    rc[21] = _mm_set_epi32((int)0xdb863ce5, (int)0xaef0c677, (int)0x933dfddd, (int)0x24e1128d);
    rc[22] = _mm_set_epi32((int)0xbb606268, (int)0xffeba09c, (int)0x83e48de3, (int)0xcb2212b1);
    rc[23] = _mm_set_epi32((int)0x734bd3dc, (int)0xe2e4d19c, (int)0x2db91a4e, (int)0xc72bf77d);
    rc[24] = _mm_set_epi32((int)0x43bb47c3, (int)0x61301b43, (int)0x4b1415c4, (int)0x2cb3924e);
    rc[25] = _mm_set_epi32((int)0xdba775a8, (int)0xe707eff6, (int)0x03b231dd, (int)0x16eb6899);
    rc[26] = _mm_set_epi32((int)0x6df3614b, (int)0x3c755977, (int)0x8e5e2302, (int)0x7eca472c);
    rc[27] = _mm_set_epi32((int)0xcda75a17, (int)0xd6de7d77, (int)0x6d1be5b9, (int)0xb88617f9);
    rc[28] = _mm_set_epi32((int)0xec6b43f0, (int)0x6ba8e9aa, (int)0x9d6c069d, (int)0xa946ee5d);
    rc[29] = _mm_set_epi32((int)0xcb1e6950, (int)0xf957332b, (int)0xa2531159, (int)0x3bf327c1);
    rc[30] = _mm_set_epi32((int)0x2cee0c75, (int)0x00da619c, (int)0xe4ed0353, (int)0x600ed0d9);
    rc[31] = _mm_set_epi32((int)0xf0b1a5a1, (int)0x96e90cab, (int)0x80bbbabc, (int)0x63a4a350);
    rc[32] = _mm_set_epi32((int)0xae3db102, (int)0x5e962988, (int)0xab0dde30, (int)0x938dca39);
    rc[33] = _mm_set_epi32((int)0x17bb8f38, (int)0xd554a40b, (int)0x8814f3a8, (int)0x2e75b442);
    rc[34] = _mm_set_epi32((int)0x34bb8a5b, (int)0x5f427fd7, (int)0xaeb6b779, (int)0x360a16f6);
    rc[35] = _mm_set_epi32((int)0x26f65241, (int)0xcbe55438, (int)0x43ce5918, (int)0xffbaafde);
    rc[36] = _mm_set_epi32((int)0x4ce99a54, (int)0xb9f3026a, (int)0xa2ca9cf7, (int)0x839ec978);
    rc[37] = _mm_set_epi32((int)0xae51a51a, (int)0x1bdff7be, (int)0x40c06e28, (int)0x22901235);
    rc[38] = _mm_set_epi32((int)0xa0c1613c, (int)0xba7ed22b, (int)0xc173bc0f, (int)0x48a659cf);
    rc[39] = _mm_set_epi32((int)0x756acc03, (int)0x02288288, (int)0x4ad6bdfd, (int)0xe9c59da1);
}
113 
114 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_tweak_constants(
115     harakactx *state,
116     const unsigned char *pk_seed, const unsigned char *sk_seed,
117     unsigned long long seed_length) {
118     int i;
119     unsigned char buf[40 * 16];
120 
121     /* Use the standard constants to generate tweaked ones. */
122     load_haraka_constants(state->rc);
123 
124     /* Constants for sk.seed */
125     if (sk_seed != NULL) {
126         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(buf, 40 * 16, sk_seed, seed_length, state);
127         /* Tweak constants with the pub_seed */
128         for (i = 0; i < 40; i++) {
129             state->rc_sseed[i] = LOAD(buf + i * 16);
130         }
131     }
132 
133     /* Constants for pk.seed */
134     PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(buf, 40 * 16, pk_seed, seed_length, state);
135 
136     /* Tweak constants with the pub_seed */
137     for (i = 0; i < 40; i++) {
138         state->rc[i] = LOAD(buf + i * 16);
139     }
140 }
141 
142 static void haraka_S_absorb(unsigned char *s,
143                             const unsigned char *m, unsigned long long mlen,
144                             unsigned char p,
145                             const harakactx *state) {
146     unsigned long long i;
147     unsigned char t[HARAKAS_RATE];
148 
149     while (mlen >= HARAKAS_RATE) {
150         // XOR block to state
151         STORE(s, XOR128(LOAD(s), LOAD(m)));
152         STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m + 16)));
153         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s, s, state);
154         mlen -= HARAKAS_RATE;
155         m += HARAKAS_RATE;
156     }
157 
158     for (i = 0; i < HARAKAS_RATE; ++i) {
159         t[i] = 0;
160     }
161     for (i = 0; i < mlen; ++i) {
162         t[i] = m[i];
163     }
164     t[i] = p;
165     t[HARAKAS_RATE - 1] |= 128;
166     STORE(s, XOR128(LOAD(s), LOAD(t)));
167     STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t + 16)));
168 }
169 
170 static void haraka_S_absorb4x(unsigned char *s,
171                               const unsigned char *m0,
172                               const unsigned char *m1,
173                               const unsigned char *m2,
174                               const unsigned char *m3,
175                               unsigned long long int mlen,
176                               unsigned char p,
177                               const harakactx *state) {
178     unsigned long long i;
179     unsigned char t0[HARAKAS_RATE];
180     unsigned char t1[HARAKAS_RATE];
181     unsigned char t2[HARAKAS_RATE];
182     unsigned char t3[HARAKAS_RATE];
183 
184     while (mlen >= HARAKAS_RATE) {
185         // XOR block to state
186         STORE(s, XOR128(LOAD(s), LOAD(m0)));
187         STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m0 + 16)));
188         STORE(s + 64, XOR128(LOAD(s + 64), LOAD(m1)));
189         STORE(s + 80, XOR128(LOAD(s + 80), LOAD(m1 + 16)));
190         STORE(s + 128, XOR128(LOAD(s + 128), LOAD(m2)));
191         STORE(s + 144, XOR128(LOAD(s + 144), LOAD(m2 + 16)));
192         STORE(s + 192, XOR128(LOAD(s + 192), LOAD(m3)));
193         STORE(s + 208, XOR128(LOAD(s + 208), LOAD(m3 + 16)));
194 
195         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(s, s, state);
196         mlen -= HARAKAS_RATE;
197         m0 += HARAKAS_RATE;
198         m1 += HARAKAS_RATE;
199         m2 += HARAKAS_RATE;
200         m3 += HARAKAS_RATE;
201     }
202 
203     for (i = 0; i < HARAKAS_RATE; ++i) {
204         t0[i] = 0;
205         t1[i] = 0;
206         t2[i] = 0;
207         t3[i] = 0;
208     }
209     for (i = 0; i < mlen; ++i) {
210         t0[i] = m0[i];
211         t1[i] = m1[i];
212         t2[i] = m2[i];
213         t3[i] = m3[i];
214     }
215 
216     t0[i] = p;
217     t1[i] = p;
218     t2[i] = p;
219     t3[i] = p;
220 
221     t0[HARAKAS_RATE - 1] |= 128;
222     t1[HARAKAS_RATE - 1] |= 128;
223     t2[HARAKAS_RATE - 1] |= 128;
224     t3[HARAKAS_RATE - 1] |= 128;
225 
226     STORE(s, XOR128(LOAD(s), LOAD(t0)));
227     STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t0 + 16)));
228     STORE(s + 64, XOR128(LOAD(s + 64), LOAD(t1)));
229     STORE(s + 80, XOR128(LOAD(s + 80), LOAD(t1 + 16)));
230     STORE(s + 128, XOR128(LOAD(s + 128), LOAD(t2)));
231     STORE(s + 144, XOR128(LOAD(s + 144), LOAD(t2 + 16)));
232     STORE(s + 192, XOR128(LOAD(s + 192), LOAD(t3)));
233     STORE(s + 208, XOR128(LOAD(s + 208), LOAD(t3 + 16)));
234 }
235 
236 static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks,
237                                    unsigned char *s, unsigned int r, const harakactx *state) {
238     while (nblocks > 0) {
239         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s, s, state);
240         STORE(h, LOAD(s));
241         STORE(h + 16, LOAD(s + 16));
242         h += r;
243         nblocks--;
244     }
245 }
246 
247 static void haraka_S_squeezeblocks4x(unsigned char *h0,
248                                      unsigned char *h1,
249                                      unsigned char *h2,
250                                      unsigned char *h3,
251                                      unsigned long long nblocks,
252                                      unsigned char *s,
253                                      unsigned int r,
254                                      const harakactx *state) {
255     while (nblocks > 0) {
256         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(s, s, state);
257         STORE(h0, LOAD(s));
258         STORE(h0 + 16, LOAD(s + 16));
259         STORE(h1, LOAD(s + 64));
260         STORE(h1 + 16, LOAD(s + 80));
261         STORE(h2, LOAD(s + 128));
262         STORE(h2 + 16, LOAD(s + 144));
263         STORE(h3, LOAD(s + 192));
264         STORE(h3 + 16, LOAD(s + 208));
265         h0 += r;
266         h1 += r;
267         h2 += r;
268         h3 += r;
269         nblocks--;
270     }
271 }
272 
/*
 * Reset an incremental Haraka-S context.
 *
 * s_inc must point to at least 65 bytes: bytes 0..63 are the sponge state
 * and byte 64 is the bookkeeping counter. All 65 bytes are set to zero.
 */
void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_init(uint8_t *s_inc) {
    /* 64 state bytes plus the trailing counter byte. */
    memset(s_inc, 0, 64 + 1);
}
281 
282 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_absorb(uint8_t *s_inc, const uint8_t *m, size_t mlen, const harakactx *state) {
283     size_t i;
284 
285     /* Recall that s_inc[64] is the non-absorbed bytes xored into the state */
286     while (mlen + s_inc[64] >= HARAKAS_RATE) {
287         for (i = 0; i < (size_t)(HARAKAS_RATE - s_inc[64]); i++) {
288             /* Take the i'th byte from message
289                xor with the s_inc[64] + i'th byte of the state */
290             s_inc[s_inc[64] + i] ^= m[i];
291         }
292         mlen -= (size_t)(HARAKAS_RATE - s_inc[64]);
293         m += HARAKAS_RATE - s_inc[64];
294         s_inc[64] = 0;
295 
296         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s_inc, s_inc, state);
297     }
298 
299     for (i = 0; i < mlen; i++) {
300         s_inc[s_inc[64] + i] ^= m[i];
301     }
302     s_inc[64] = (uint8_t)(s_inc[64] + mlen);
303 }
304 
305 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_finalize(uint8_t *s_inc) {
306     /* After haraka_S_inc_absorb, we are guaranteed that s_inc[64] < HARAKAS_RATE,
307        so we can always use one more byte for p in the current state. */
308     s_inc[s_inc[64]] ^= 0x1F;
309     s_inc[HARAKAS_RATE - 1] ^= 128;
310     s_inc[64] = 0;
311 }
312 
313 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S_inc_squeeze(uint8_t *out, size_t outlen, uint8_t *s_inc, const harakactx *state) {
314     size_t i;
315 
316     /* First consume any bytes we still have sitting around */
317     for (i = 0; i < outlen && i < s_inc[64]; i++) {
318         /* There are s_inc[64] bytes left, so r - s_inc[64] is the first
319            available byte. We consume from there, i.e., up to r. */
320         out[i] = (uint8_t)s_inc[(HARAKAS_RATE - s_inc[64] + (uint8_t)i)];
321     }
322     out += i;
323     outlen -= i;
324     s_inc[64] = (uint8_t)(s_inc[64] - i);
325 
326     /* Then squeeze the remaining necessary blocks */
327     while (outlen > 0) {
328         PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(s_inc, s_inc, state);
329 
330         for (i = 0; i < outlen && i < HARAKAS_RATE; i++) {
331             out[i] = s_inc[i];
332         }
333         out += i;
334         outlen -= i;
335         s_inc[64] = (uint8_t)(HARAKAS_RATE - i);
336     }
337 }
338 
339 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_S(unsigned char *out, unsigned long long outlen,
340         const unsigned char *in, unsigned long long inlen, const harakactx *state) {
341     unsigned long long i;
342     unsigned char s[64];
343     unsigned char d[32];
344 
345     for (i = 0; i < 64; i++) {
346         s[i] = 0;
347     }
348     haraka_S_absorb(s, in, inlen, 0x1F, state);
349 
350     haraka_S_squeezeblocks(out, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
351     out += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
352 
353     if (outlen % HARAKAS_RATE) {
354         haraka_S_squeezeblocks(d, 1, s, HARAKAS_RATE, state);
355         for (i = 0; i < outlen % HARAKAS_RATE; i++) {
356             out[i] = d[i];
357         }
358     }
359 }
360 
361 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka_Sx4(unsigned char *out0,
362         unsigned char *out1,
363         unsigned char *out2,
364         unsigned char *out3,
365         unsigned long long outlen,
366         const unsigned char *in0,
367         const unsigned char *in1,
368         const unsigned char *in2,
369         const unsigned char *in3,
370         unsigned long long inlen,
371         const harakactx *state) {
372     unsigned long long i;
373     unsigned char s[64 * 4];
374     unsigned char d0[32];
375     unsigned char d1[32];
376     unsigned char d2[32];
377     unsigned char d3[32];
378 
379     for (i = 0; i < 64 * 4; i++) {
380         s[i] = 0;
381     }
382     haraka_S_absorb4x(s, in0, in1, in2, in3, inlen, 0x1F, state);
383 
384     haraka_S_squeezeblocks4x(out0, out1, out2, out3, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
385     out0 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
386     out1 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
387     out2 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
388     out3 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
389 
390     if (outlen % HARAKAS_RATE) {
391         haraka_S_squeezeblocks4x(d0, d1, d2, d3, 1, s, HARAKAS_RATE, state);
392         for (i = 0; i < outlen % HARAKAS_RATE; i++) {
393             out0[i] = d0[i];
394             out1[i] = d1[i];
395             out2[i] = d2[i];
396             out3[i] = d3[i];
397         }
398     }
399 }
400 
401 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm(unsigned char *out, const unsigned char *in, const harakactx *state) {
402     u128 s[4], tmp;
403 
404     s[0] = LOAD(in);
405     s[1] = LOAD(in + 16);
406     s[2] = LOAD(in + 32);
407     s[3] = LOAD(in + 48);
408 
409     AES4(s[0], s[1], s[2], s[3], state->rc);
410     MIX4(s[0], s[1], s[2], s[3]);
411 
412     AES4(s[0], s[1], s[2], s[3], state->rc + 8);
413     MIX4(s[0], s[1], s[2], s[3]);
414 
415     AES4(s[0], s[1], s[2], s[3], state->rc + 16);
416     MIX4(s[0], s[1], s[2], s[3]);
417 
418     AES4(s[0], s[1], s[2], s[3], state->rc + 24);
419     MIX4(s[0], s[1], s[2], s[3]);
420 
421     AES4(s[0], s[1], s[2], s[3], state->rc + 32);
422     MIX4(s[0], s[1], s[2], s[3]);
423 
424     STORE(out, s[0]);
425     STORE(out + 16, s[1]);
426     STORE(out + 32, s[2]);
427     STORE(out + 48, s[3]);
428 }
429 
430 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512_perm_x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
431     u128 s[4][4], tmp;
432 
433     s[0][0] = LOAD(in);
434     s[0][1] = LOAD(in + 16);
435     s[0][2] = LOAD(in + 32);
436     s[0][3] = LOAD(in + 48);
437     s[1][0] = LOAD(in + 64);
438     s[1][1] = LOAD(in + 80);
439     s[1][2] = LOAD(in + 96);
440     s[1][3] = LOAD(in + 112);
441     s[2][0] = LOAD(in + 128);
442     s[2][1] = LOAD(in + 144);
443     s[2][2] = LOAD(in + 160);
444     s[2][3] = LOAD(in + 176);
445     s[3][0] = LOAD(in + 192);
446     s[3][1] = LOAD(in + 208);
447     s[3][2] = LOAD(in + 224);
448     s[3][3] = LOAD(in + 240);
449 
450     AES4_4x(s[0], s[1], s[2], s[3], state->rc);
451     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
452     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
453     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
454     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
455 
456     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
457     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
458     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
459     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
460     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
461 
462     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
463     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
464     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
465     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
466     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
467 
468     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
469     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
470     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
471     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
472     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
473 
474     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
475     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
476     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
477     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
478     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
479 
480     STORE(out, s[0][0]);
481     STORE(out + 16, s[0][1]);
482     STORE(out + 32, s[0][2]);
483     STORE(out + 48, s[0][3]);
484     STORE(out + 64, s[1][0]);
485     STORE(out + 80, s[1][1]);
486     STORE(out + 96, s[1][2]);
487     STORE(out + 112, s[1][3]);
488     STORE(out + 128, s[2][0]);
489     STORE(out + 144, s[2][1]);
490     STORE(out + 160, s[2][2]);
491     STORE(out + 176, s[2][3]);
492     STORE(out + 192, s[3][0]);
493     STORE(out + 208, s[3][1]);
494     STORE(out + 224, s[3][2]);
495     STORE(out + 240, s[3][3]);
496 }
497 
498 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512(unsigned char *out, const unsigned char *in, const harakactx *state) {
499     u128 s[4], tmp;
500 
501     s[0] = LOAD(in);
502     s[1] = LOAD(in + 16);
503     s[2] = LOAD(in + 32);
504     s[3] = LOAD(in + 48);
505 
506     AES4(s[0], s[1], s[2], s[3], state->rc);
507     MIX4(s[0], s[1], s[2], s[3]);
508 
509     AES4(s[0], s[1], s[2], s[3], state->rc + 8);
510     MIX4(s[0], s[1], s[2], s[3]);
511 
512     AES4(s[0], s[1], s[2], s[3], state->rc + 16);
513     MIX4(s[0], s[1], s[2], s[3]);
514 
515     AES4(s[0], s[1], s[2], s[3], state->rc + 24);
516     MIX4(s[0], s[1], s[2], s[3]);
517 
518     AES4(s[0], s[1], s[2], s[3], state->rc + 32);
519     MIX4(s[0], s[1], s[2], s[3]);
520 
521     s[0] = XOR128(s[0], LOAD(in));
522     s[1] = XOR128(s[1], LOAD(in + 16));
523     s[2] = XOR128(s[2], LOAD(in + 32));
524     s[3] = XOR128(s[3], LOAD(in + 48));
525 
526     // truncate and store result
527     TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
528 }
529 
530 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka512x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
531     u128 s[4][4], tmp;
532 
533     s[0][0] = LOAD(in);
534     s[0][1] = LOAD(in + 16);
535     s[0][2] = LOAD(in + 32);
536     s[0][3] = LOAD(in + 48);
537     s[1][0] = LOAD(in + 64);
538     s[1][1] = LOAD(in + 80);
539     s[1][2] = LOAD(in + 96);
540     s[1][3] = LOAD(in + 112);
541     s[2][0] = LOAD(in + 128);
542     s[2][1] = LOAD(in + 144);
543     s[2][2] = LOAD(in + 160);
544     s[2][3] = LOAD(in + 176);
545     s[3][0] = LOAD(in + 192);
546     s[3][1] = LOAD(in + 208);
547     s[3][2] = LOAD(in + 224);
548     s[3][3] = LOAD(in + 240);
549 
550     AES4_4x(s[0], s[1], s[2], s[3], state->rc);
551     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
552     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
553     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
554     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
555 
556     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
557     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
558     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
559     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
560     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
561 
562     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
563     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
564     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
565     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
566     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
567 
568     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
569     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
570     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
571     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
572     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
573 
574     AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
575     MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
576     MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
577     MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
578     MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
579 
580     s[0][0] = XOR128(s[0][0], LOAD(in));
581     s[0][1] = XOR128(s[0][1], LOAD(in + 16));
582     s[0][2] = XOR128(s[0][2], LOAD(in + 32));
583     s[0][3] = XOR128(s[0][3], LOAD(in + 48));
584     s[1][0] = XOR128(s[1][0], LOAD(in + 64));
585     s[1][1] = XOR128(s[1][1], LOAD(in + 80));
586     s[1][2] = XOR128(s[1][2], LOAD(in + 96));
587     s[1][3] = XOR128(s[1][3], LOAD(in + 112));
588     s[2][0] = XOR128(s[2][0], LOAD(in + 128));
589     s[2][1] = XOR128(s[2][1], LOAD(in + 144));
590     s[2][2] = XOR128(s[2][2], LOAD(in + 160));
591     s[2][3] = XOR128(s[2][3], LOAD(in + 176));
592     s[3][0] = XOR128(s[3][0], LOAD(in + 192));
593     s[3][1] = XOR128(s[3][1], LOAD(in + 208));
594     s[3][2] = XOR128(s[3][2], LOAD(in + 224));
595     s[3][3] = XOR128(s[3][3], LOAD(in + 240));
596 
597     TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
598     TRUNCSTORE((out + 32), s[1][0], s[1][1], s[1][2], s[1][3]);
599     TRUNCSTORE((out + 64), s[2][0], s[2][1], s[2][2], s[2][3]);
600     TRUNCSTORE((out + 96), s[3][0], s[3][1], s[3][2], s[3][3]);
601 }
602 
603 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256(unsigned char *out, const unsigned char *in, const harakactx *state) {
604     u128 s[2], tmp;
605 
606     s[0] = LOAD(in);
607     s[1] = LOAD(in + 16);
608 
609     AES2(s[0], s[1], state->rc);
610     MIX2(s[0], s[1]);
611 
612     AES2(s[0], s[1], state->rc + 4);
613     MIX2(s[0], s[1]);
614 
615     AES2(s[0], s[1], state->rc + 8);
616     MIX2(s[0], s[1]);
617 
618     AES2(s[0], s[1], state->rc + 12);
619     MIX2(s[0], s[1]);
620 
621     AES2(s[0], s[1], state->rc + 16);
622     MIX2(s[0], s[1]);
623 
624     s[0] = XOR128(s[0], LOAD(in));
625     s[1] = XOR128(s[1], LOAD(in + 16));
626 
627     STORE(out, s[0]);
628     STORE(out + 16, s[1]);
629 }
630 
631 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
632     u128 s[4][2], tmp;
633 
634     s[0][0] = LOAD(in);
635     s[0][1] = LOAD(in + 16);
636     s[1][0] = LOAD(in + 32);
637     s[1][1] = LOAD(in + 48);
638     s[2][0] = LOAD(in + 64);
639     s[2][1] = LOAD(in + 80);
640     s[3][0] = LOAD(in + 96);
641     s[3][1] = LOAD(in + 112);
642 
643     // Round 1
644     AES2_4x(s[0], s[1], s[2], s[3], state->rc);
645 
646     MIX2(s[0][0], s[0][1]);
647     MIX2(s[1][0], s[1][1]);
648     MIX2(s[2][0], s[2][1]);
649     MIX2(s[3][0], s[3][1]);
650 
651     // Round 2
652     AES2_4x(s[0], s[1], s[2], s[3], state->rc + 4);
653 
654     MIX2(s[0][0], s[0][1]);
655     MIX2(s[1][0], s[1][1]);
656     MIX2(s[2][0], s[2][1]);
657     MIX2(s[3][0], s[3][1]);
658 
659     // Round 3
660     AES2_4x(s[0], s[1], s[2], s[3], state->rc + 8);
661 
662     MIX2(s[0][0], s[0][1]);
663     MIX2(s[1][0], s[1][1]);
664     MIX2(s[2][0], s[2][1]);
665     MIX2(s[3][0], s[3][1]);
666 
667     // Round 4
668     AES2_4x(s[0], s[1], s[2], s[3], state->rc + 12);
669 
670     MIX2(s[0][0], s[0][1]);
671     MIX2(s[1][0], s[1][1]);
672     MIX2(s[2][0], s[2][1]);
673     MIX2(s[3][0], s[3][1]);
674 
675     // Round 5
676     AES2_4x(s[0], s[1], s[2], s[3], state->rc + 16);
677 
678     MIX2(s[0][0], s[0][1]);
679     MIX2(s[1][0], s[1][1]);
680     MIX2(s[2][0], s[2][1]);
681     MIX2(s[3][0], s[3][1]);
682 
683     // Feed Forward
684     s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
685     s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
686     s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
687     s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
688     s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
689     s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
690     s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
691     s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
692 
693     STORE(out, s[0][0]);
694     STORE(out + 16, s[0][1]);
695     STORE(out + 32, s[1][0]);
696     STORE(out + 48, s[1][1]);
697     STORE(out + 64, s[2][0]);
698     STORE(out + 80, s[2][1]);
699     STORE(out + 96, s[3][0]);
700     STORE(out + 112, s[3][1]);
701 }
702 
703 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256_sk(unsigned char *out, const unsigned char *in, const harakactx *state) {
704     u128 s[2], tmp;
705 
706     s[0] = LOAD(in);
707     s[1] = LOAD(in + 16);
708 
709     AES2(s[0], s[1], state->rc_sseed);
710     MIX2(s[0], s[1]);
711 
712     AES2(s[0], s[1], state->rc_sseed + 4);
713     MIX2(s[0], s[1]);
714 
715     AES2(s[0], s[1], state->rc_sseed + 8);
716     MIX2(s[0], s[1]);
717 
718     AES2(s[0], s[1], state->rc_sseed + 12);
719     MIX2(s[0], s[1]);
720 
721     AES2(s[0], s[1], state->rc_sseed + 16);
722     MIX2(s[0], s[1]);
723 
724     s[0] = XOR128(s[0], LOAD(in));
725     s[1] = XOR128(s[1], LOAD(in + 16));
726 
727     STORE(out, s[0]);
728     STORE(out + 16, s[1]);
729 }
730 
731 void PQCLEAN_SPHINCSHARAKA128SROBUST_AESNI_haraka256_skx4(unsigned char *out, const unsigned char *in, const harakactx *state) {
732     u128 s[4][2], tmp;
733 
734     s[0][0] = LOAD(in);
735     s[0][1] = LOAD(in + 16);
736     s[1][0] = LOAD(in + 32);
737     s[1][1] = LOAD(in + 48);
738     s[2][0] = LOAD(in + 64);
739     s[2][1] = LOAD(in + 80);
740     s[3][0] = LOAD(in + 96);
741     s[3][1] = LOAD(in + 112);
742 
743     // Round 1
744     AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed);
745 
746     MIX2(s[0][0], s[0][1]);
747     MIX2(s[1][0], s[1][1]);
748     MIX2(s[2][0], s[2][1]);
749     MIX2(s[3][0], s[3][1]);
750 
751     // Round 2
752     AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 4);
753 
754     MIX2(s[0][0], s[0][1]);
755     MIX2(s[1][0], s[1][1]);
756     MIX2(s[2][0], s[2][1]);
757     MIX2(s[3][0], s[3][1]);
758 
759     // Round 3
760     AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 8);
761 
762     MIX2(s[0][0], s[0][1]);
763     MIX2(s[1][0], s[1][1]);
764     MIX2(s[2][0], s[2][1]);
765     MIX2(s[3][0], s[3][1]);
766 
767     // Round 4
768     AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 12);
769 
770     MIX2(s[0][0], s[0][1]);
771     MIX2(s[1][0], s[1][1]);
772     MIX2(s[2][0], s[2][1]);
773     MIX2(s[3][0], s[3][1]);
774 
775     // Round 5
776     AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 16);
777 
778     MIX2(s[0][0], s[0][1]);
779     MIX2(s[1][0], s[1][1]);
780     MIX2(s[2][0], s[2][1]);
781     MIX2(s[3][0], s[3][1]);
782 
783     // Feed Forward
784     s[0][0] = XOR128(s[0][0], LOAD(in));
785     s[0][1] = XOR128(s[0][1], LOAD(in + 16));
786     s[1][0] = XOR128(s[1][0], LOAD(in + 32));
787     s[1][1] = XOR128(s[1][1], LOAD(in + 48));
788     s[2][0] = XOR128(s[2][0], LOAD(in + 64));
789     s[2][1] = XOR128(s[2][1], LOAD(in + 80));
790     s[3][0] = XOR128(s[3][0], LOAD(in + 96));
791     s[3][1] = XOR128(s[3][1], LOAD(in + 112));
792 
793     STORE(out, s[0][0]);
794     STORE(out + 16, s[0][1]);
795     STORE(out + 32, s[1][0]);
796     STORE(out + 48, s[1][1]);
797     STORE(out + 64, s[2][0]);
798     STORE(out + 80, s[2][1]);
799     STORE(out + 96, s[3][0]);
800     STORE(out + 112, s[3][1]);
801 }
802