/*
AES-NI (x86 intrinsics) implementation of the Haraka256 and Haraka512
permutations and of the Haraka-S sponge built on top of them.
*/
4 #include <immintrin.h>
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
10 #include "haraka.h"
11
/* Sponge rate in bytes: the haraka_S routines in this file absorb and squeeze
   this many bytes per call to the 512-bit permutation. */
#define HARAKAS_RATE 32

/* Short aliases for a 64-bit unsigned integer and a 128-bit SSE register. */
#define u64 uint64_t
#define u128 __m128i

/* Unaligned 128-bit load from / store to an arbitrary byte pointer. */
#define LOAD(src) _mm_loadu_si128((u128 *)(src))
#define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src)

/* Bitwise XOR of two 128-bit values. */
#define XOR128(a, b) _mm_xor_si128(a, b)

/* Apply two AES encryption rounds (_mm_aesenc_si128) to each of the two
   blocks s0 and s1, updating them in place. rci points at four consecutive
   round constants, consumed in the order s0, s1, s0, s1.
   Expands to several statements, so use it only as a full statement inside
   a braced block. */
#define AES2(s0, s1, rci) \
    (s0) = _mm_aesenc_si128(s0, *(rci)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
    (s0) = _mm_aesenc_si128(s0, *((rci) + 2)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 3));

/* Run AES2 on four independent two-block states (each argument is an array
   of two u128), all with the same four round constants at rci. */
#define AES2_4x(s0, s1, s2, s3, rci) \
    AES2((s0)[0], (s0)[1], rci); \
    AES2((s1)[0], (s1)[1], rci); \
    AES2((s2)[0], (s2)[1], rci); \
    AES2((s3)[0], (s3)[1], rci);

/* Apply two AES encryption rounds to each of the four blocks s0..s3,
   updating them in place. rci points at eight consecutive round constants:
   rci[0..3] key the first round of s0..s3, rci[4..7] the second round.
   Expands to several statements (see AES2). */
#define AES4(s0, s1, s2, s3, rci) \
    (s0) = _mm_aesenc_si128(s0, *(rci)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
    (s2) = _mm_aesenc_si128(s2, *((rci) + 2)); \
    (s3) = _mm_aesenc_si128(s3, *((rci) + 3)); \
    (s0) = _mm_aesenc_si128(s0, *((rci) + 4)); \
    (s1) = _mm_aesenc_si128(s1, *((rci) + 5)); \
    (s2) = _mm_aesenc_si128(s2, *((rci) + 6)); \
    (s3) = _mm_aesenc_si128(s3, *((rci) + 7));

/* Run AES4 on four independent four-block states (each argument is an array
   of four u128), all with the same eight round constants at rci. */
#define AES4_4x(s0, s1, s2, s3, rci) \
    AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \
    AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \
    AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \
    AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci);

/* Mixing step for a two-block state: s0 becomes the interleaving of the low
   32-bit words of (s0, s1), s1 the interleaving of their high 32-bit words.
   Requires a u128 variable named tmp in the calling scope (used as scratch). */
#define MIX2(s0, s1) \
    tmp = _mm_unpacklo_epi32(s0, s1); \
    (s1) = _mm_unpackhi_epi32(s0, s1); \
    (s0) = tmp;

/* Mixing step for a four-block state: permutes the 32-bit words of s0..s3
   across the four blocks through a fixed sequence of 32-bit unpack
   operations. The statement order matters because later lines read values
   written by earlier ones. Requires a u128 variable named tmp in the calling
   scope (used as scratch). */
#define MIX4(s0, s1, s2, s3) \
    tmp = _mm_unpacklo_epi32(s0, s1); \
    (s0) = _mm_unpackhi_epi32(s0, s1); \
    (s1) = _mm_unpacklo_epi32(s2, s3); \
    (s2) = _mm_unpackhi_epi32(s2, s3); \
    (s3) = _mm_unpacklo_epi32(s0, s2); \
    (s0) = _mm_unpackhi_epi32(s0, s2); \
    (s2) = _mm_unpackhi_epi32(s1, tmp); \
    (s1) = _mm_unpacklo_epi32(s1, tmp);

/* Truncate a four-block (64-byte) state to 32 bytes and store it at out:
   bytes 0..7 are the high 64 bits of s0, bytes 8..15 the high 64 bits of s1,
   bytes 16..23 the low 64 bits of s2 and bytes 24..31 the low 64 bits of s3.
   Expands to two statements. */
#define TRUNCSTORE(out, s0, s1, s2, s3) \
    _mm_storeu_si128((u128 *)(out), \
                     _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s0), _mm_castsi128_pd(s1), 3))); \
    _mm_storeu_si128((u128 *)((out) + 16), \
                     _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s2), _mm_castsi128_pd(s3), 0)));
70
/*
 * Fill rc[0..39] with the 40 standard (untweaked) Haraka round constants.
 *
 * rc: output array of 40 128-bit values; every entry is overwritten.
 */
static void load_haraka_constants(u128 rc[40]) {
    /* One row per 128-bit constant. The four 32-bit words are listed in the
       argument order of _mm_set_epi32, i.e. most significant word first. */
    static const uint32_t words[40][4] = {
        {0x0684704c, 0xe620c00a, 0xb2c5fef0, 0x75817b9d},
        {0x8b66b4e1, 0x88f3a06b, 0x640f6ba4, 0x2f08f717},
        {0x3402de2d, 0x53f28498, 0xcf029d60, 0x9f029114},
        {0x0ed6eae6, 0x2e7b4f08, 0xbbf3bcaf, 0xfd5b4f79},
        {0xcbcfb0cb, 0x4872448b, 0x79eecd1c, 0xbe397044},
        {0x7eeacdee, 0x6e9032b7, 0x8d5335ed, 0x2b8a057b},
        {0x67c28f43, 0x5e2e7cd0, 0xe2412761, 0xda4fef1b},
        {0x2924d9b0, 0xafcacc07, 0x675ffde2, 0x1fc70b3b},
        {0xab4d63f1, 0xe6867fe9, 0xecdb8fca, 0xb9d465ee},
        {0x1c30bf84, 0xd4b7cd64, 0x5b2a404f, 0xad037e33},
        {0xb2cc0bb9, 0x941723bf, 0x69028b2e, 0x8df69800},
        {0xfa0478a6, 0xde6f5572, 0x4aaa9ec8, 0x5c9d2d8a},
        {0xdfb49f2b, 0x6b772a12, 0x0efa4f2e, 0x29129fd4},
        {0x1ea10344, 0xf449a236, 0x32d611ae, 0xbb6a12ee},
        {0xaf044988, 0x4b050084, 0x5f9600c9, 0x9ca8eca6},
        {0x21025ed8, 0x9d199c4f, 0x78a2c7e3, 0x27e593ec},
        {0xbf3aaaf8, 0xa759c9b7, 0xb9282ecd, 0x82d40173},
        {0x6260700d, 0x6186b017, 0x37f2efd9, 0x10307d6b},
        {0x5aca45c2, 0x21300443, 0x81c29153, 0xf6fc9ac6},
        {0x9223973c, 0x226b68bb, 0x2caf92e8, 0x36d1943a},
        {0xd3bf9238, 0x225886eb, 0x6cbab958, 0xe51071b4},
        {0xdb863ce5, 0xaef0c677, 0x933dfddd, 0x24e1128d},
        {0xbb606268, 0xffeba09c, 0x83e48de3, 0xcb2212b1},
        {0x734bd3dc, 0xe2e4d19c, 0x2db91a4e, 0xc72bf77d},
        {0x43bb47c3, 0x61301b43, 0x4b1415c4, 0x2cb3924e},
        {0xdba775a8, 0xe707eff6, 0x03b231dd, 0x16eb6899},
        {0x6df3614b, 0x3c755977, 0x8e5e2302, 0x7eca472c},
        {0xcda75a17, 0xd6de7d77, 0x6d1be5b9, 0xb88617f9},
        {0xec6b43f0, 0x6ba8e9aa, 0x9d6c069d, 0xa946ee5d},
        {0xcb1e6950, 0xf957332b, 0xa2531159, 0x3bf327c1},
        {0x2cee0c75, 0x00da619c, 0xe4ed0353, 0x600ed0d9},
        {0xf0b1a5a1, 0x96e90cab, 0x80bbbabc, 0x63a4a350},
        {0xae3db102, 0x5e962988, 0xab0dde30, 0x938dca39},
        {0x17bb8f38, 0xd554a40b, 0x8814f3a8, 0x2e75b442},
        {0x34bb8a5b, 0x5f427fd7, 0xaeb6b779, 0x360a16f6},
        {0x26f65241, 0xcbe55438, 0x43ce5918, 0xffbaafde},
        {0x4ce99a54, 0xb9f3026a, 0xa2ca9cf7, 0x839ec978},
        {0xae51a51a, 0x1bdff7be, 0x40c06e28, 0x22901235},
        {0xa0c1613c, 0xba7ed22b, 0xc173bc0f, 0x48a659cf},
        {0x756acc03, 0x02288288, 0x4ad6bdfd, 0xe9c59da1}
    };
    int k;

    for (k = 0; k < 40; k++) {
        rc[k] = _mm_set_epi32((int)words[k][0], (int)words[k][1],
                              (int)words[k][2], (int)words[k][3]);
    }
}
113
/*
 * Derive seed-dependent ("tweaked") round constants and store them in state.
 *
 * state:       context to initialise. state->rc is always overwritten;
 *              state->rc_sseed is written only when sk_seed is non-NULL.
 * pk_seed:     public seed, seed_length bytes.
 * sk_seed:     secret seed, seed_length bytes, or NULL to skip the
 *              secret-seed constants.
 * seed_length: length in bytes of each seed.
 *
 * Both expansions are computed with haraka_S while state->rc still holds the
 * standard constants; state->rc is replaced by the pk.seed-derived constants
 * only at the very end.
 */
void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_tweak_constants(
    harakactx *state,
    const unsigned char *pk_seed, const unsigned char *sk_seed,
    unsigned long long seed_length) {
    enum { RC_COUNT = 40, RC_BYTES = 16 };
    unsigned char expanded[RC_COUNT * RC_BYTES];
    int k;

    /* Start from the standard constants; they key the expansions below. */
    load_haraka_constants(state->rc);

    if (sk_seed != NULL) {
        /* Expand sk.seed into 40 constants for the secret-key variant. */
        PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S(expanded, RC_COUNT * RC_BYTES, sk_seed, seed_length, state);
        for (k = 0; k < RC_COUNT; k++) {
            state->rc_sseed[k] = LOAD(expanded + k * RC_BYTES);
        }
    }

    /* Expand pk.seed; this must finish before state->rc is overwritten. */
    PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S(expanded, RC_COUNT * RC_BYTES, pk_seed, seed_length, state);

    /* Replace the standard constants with the pk.seed-derived ones. */
    for (k = 0; k < RC_COUNT; k++) {
        state->rc[k] = LOAD(expanded + k * RC_BYTES);
    }
}
141
haraka_S_absorb(unsigned char * s,const unsigned char * m,unsigned long long mlen,unsigned char p,const harakactx * state)142 static void haraka_S_absorb(unsigned char *s,
143 const unsigned char *m, unsigned long long mlen,
144 unsigned char p,
145 const harakactx *state) {
146 unsigned long long i;
147 unsigned char t[HARAKAS_RATE];
148
149 while (mlen >= HARAKAS_RATE) {
150 // XOR block to state
151 STORE(s, XOR128(LOAD(s), LOAD(m)));
152 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m + 16)));
153 PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(s, s, state);
154 mlen -= HARAKAS_RATE;
155 m += HARAKAS_RATE;
156 }
157
158 for (i = 0; i < HARAKAS_RATE; ++i) {
159 t[i] = 0;
160 }
161 for (i = 0; i < mlen; ++i) {
162 t[i] = m[i];
163 }
164 t[i] = p;
165 t[HARAKAS_RATE - 1] |= 128;
166 STORE(s, XOR128(LOAD(s), LOAD(t)));
167 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t + 16)));
168 }
169
haraka_S_absorb4x(unsigned char * s,const unsigned char * m0,const unsigned char * m1,const unsigned char * m2,const unsigned char * m3,unsigned long long int mlen,unsigned char p,const harakactx * state)170 static void haraka_S_absorb4x(unsigned char *s,
171 const unsigned char *m0,
172 const unsigned char *m1,
173 const unsigned char *m2,
174 const unsigned char *m3,
175 unsigned long long int mlen,
176 unsigned char p,
177 const harakactx *state) {
178 unsigned long long i;
179 unsigned char t0[HARAKAS_RATE];
180 unsigned char t1[HARAKAS_RATE];
181 unsigned char t2[HARAKAS_RATE];
182 unsigned char t3[HARAKAS_RATE];
183
184 while (mlen >= HARAKAS_RATE) {
185 // XOR block to state
186 STORE(s, XOR128(LOAD(s), LOAD(m0)));
187 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m0 + 16)));
188 STORE(s + 64, XOR128(LOAD(s + 64), LOAD(m1)));
189 STORE(s + 80, XOR128(LOAD(s + 80), LOAD(m1 + 16)));
190 STORE(s + 128, XOR128(LOAD(s + 128), LOAD(m2)));
191 STORE(s + 144, XOR128(LOAD(s + 144), LOAD(m2 + 16)));
192 STORE(s + 192, XOR128(LOAD(s + 192), LOAD(m3)));
193 STORE(s + 208, XOR128(LOAD(s + 208), LOAD(m3 + 16)));
194
195 PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm_x4(s, s, state);
196 mlen -= HARAKAS_RATE;
197 m0 += HARAKAS_RATE;
198 m1 += HARAKAS_RATE;
199 m2 += HARAKAS_RATE;
200 m3 += HARAKAS_RATE;
201 }
202
203 for (i = 0; i < HARAKAS_RATE; ++i) {
204 t0[i] = 0;
205 t1[i] = 0;
206 t2[i] = 0;
207 t3[i] = 0;
208 }
209 for (i = 0; i < mlen; ++i) {
210 t0[i] = m0[i];
211 t1[i] = m1[i];
212 t2[i] = m2[i];
213 t3[i] = m3[i];
214 }
215
216 t0[i] = p;
217 t1[i] = p;
218 t2[i] = p;
219 t3[i] = p;
220
221 t0[HARAKAS_RATE - 1] |= 128;
222 t1[HARAKAS_RATE - 1] |= 128;
223 t2[HARAKAS_RATE - 1] |= 128;
224 t3[HARAKAS_RATE - 1] |= 128;
225
226 STORE(s, XOR128(LOAD(s), LOAD(t0)));
227 STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t0 + 16)));
228 STORE(s + 64, XOR128(LOAD(s + 64), LOAD(t1)));
229 STORE(s + 80, XOR128(LOAD(s + 80), LOAD(t1 + 16)));
230 STORE(s + 128, XOR128(LOAD(s + 128), LOAD(t2)));
231 STORE(s + 144, XOR128(LOAD(s + 144), LOAD(t2 + 16)));
232 STORE(s + 192, XOR128(LOAD(s + 192), LOAD(t3)));
233 STORE(s + 208, XOR128(LOAD(s + 208), LOAD(t3 + 16)));
234 }
235
/*
 * Squeeze nblocks output blocks from a 64-byte sponge state.
 *
 * h:       output buffer; each block writes 32 bytes, then h advances by r.
 * nblocks: number of blocks to produce.
 * s:       64-byte state, permuted in place before every block.
 * r:       distance in bytes between consecutive output blocks.
 * state:   context holding the round constants used by the permutation.
 */
static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks,
                                   unsigned char *s, unsigned int r, const harakactx *state) {
    unsigned long long done;

    for (done = 0; done < nblocks; done++, h += r) {
        PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(s, s, state);
        /* Copy the first 32 bytes of the state to the output. */
        STORE(h, LOAD(s));
        STORE(h + 16, LOAD(s + 16));
    }
}
246
/*
 * Four-way parallel version of haraka_S_squeezeblocks.
 *
 * h0..h3:  output buffers, one per lane; each block writes 32 bytes to every
 *          lane, then each pointer advances by r.
 * nblocks: number of blocks to produce per lane.
 * s:       256-byte buffer of four 64-byte states (lane i at s + 64 * i),
 *          permuted in place before every block.
 * r:       distance in bytes between consecutive output blocks.
 * state:   context holding the round constants used by the permutation.
 */
static void haraka_S_squeezeblocks4x(unsigned char *h0,
                                     unsigned char *h1,
                                     unsigned char *h2,
                                     unsigned char *h3,
                                     unsigned long long nblocks,
                                     unsigned char *s,
                                     unsigned int r,
                                     const harakactx *state) {
    unsigned char *dst[4];
    unsigned long long done;
    int lane;

    dst[0] = h0;
    dst[1] = h1;
    dst[2] = h2;
    dst[3] = h3;

    for (done = 0; done < nblocks; done++) {
        PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm_x4(s, s, state);
        for (lane = 0; lane < 4; lane++) {
            /* First 32 bytes of this lane's state go to its output. */
            STORE(dst[lane], LOAD(s + 64 * lane));
            STORE(dst[lane] + 16, LOAD(s + 64 * lane + 16));
            dst[lane] += r;
        }
    }
}
272
/*
 * Reset an incremental Haraka-S context.
 *
 * s_inc: 65-byte buffer. Bytes 0..63 are the sponge state and byte 64 is the
 *        bookkeeping counter used by the other haraka_S_inc_* routines; all
 *        65 bytes are set to zero.
 */
void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_init(uint8_t *s_inc) {
    /* 64 state bytes plus the trailing counter byte. */
    memset(s_inc, 0, 64 + 1);
}
281
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_absorb(uint8_t * s_inc,const uint8_t * m,size_t mlen,const harakactx * state)282 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_absorb(uint8_t *s_inc, const uint8_t *m, size_t mlen, const harakactx *state) {
283 size_t i;
284
285 /* Recall that s_inc[64] is the non-absorbed bytes xored into the state */
286 while (mlen + s_inc[64] >= HARAKAS_RATE) {
287 for (i = 0; i < (size_t)(HARAKAS_RATE - s_inc[64]); i++) {
288 /* Take the i'th byte from message
289 xor with the s_inc[64] + i'th byte of the state */
290 s_inc[s_inc[64] + i] ^= m[i];
291 }
292 mlen -= (size_t)(HARAKAS_RATE - s_inc[64]);
293 m += HARAKAS_RATE - s_inc[64];
294 s_inc[64] = 0;
295
296 PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(s_inc, s_inc, state);
297 }
298
299 for (i = 0; i < mlen; i++) {
300 s_inc[s_inc[64] + i] ^= m[i];
301 }
302 s_inc[64] = (uint8_t)(s_inc[64] + mlen);
303 }
304
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_finalize(uint8_t * s_inc)305 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_finalize(uint8_t *s_inc) {
306 /* After haraka_S_inc_absorb, we are guaranteed that s_inc[64] < HARAKAS_RATE,
307 so we can always use one more byte for p in the current state. */
308 s_inc[s_inc[64]] ^= 0x1F;
309 s_inc[HARAKAS_RATE - 1] ^= 128;
310 s_inc[64] = 0;
311 }
312
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_squeeze(uint8_t * out,size_t outlen,uint8_t * s_inc,const harakactx * state)313 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S_inc_squeeze(uint8_t *out, size_t outlen, uint8_t *s_inc, const harakactx *state) {
314 size_t i;
315
316 /* First consume any bytes we still have sitting around */
317 for (i = 0; i < outlen && i < s_inc[64]; i++) {
318 /* There are s_inc[64] bytes left, so r - s_inc[64] is the first
319 available byte. We consume from there, i.e., up to r. */
320 out[i] = (uint8_t)s_inc[(HARAKAS_RATE - s_inc[64] + (uint8_t)i)];
321 }
322 out += i;
323 outlen -= i;
324 s_inc[64] = (uint8_t)(s_inc[64] - i);
325
326 /* Then squeeze the remaining necessary blocks */
327 while (outlen > 0) {
328 PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(s_inc, s_inc, state);
329
330 for (i = 0; i < outlen && i < HARAKAS_RATE; i++) {
331 out[i] = s_inc[i];
332 }
333 out += i;
334 outlen -= i;
335 s_inc[64] = (uint8_t)(HARAKAS_RATE - i);
336 }
337 }
338
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S(unsigned char * out,unsigned long long outlen,const unsigned char * in,unsigned long long inlen,const harakactx * state)339 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_S(unsigned char *out, unsigned long long outlen,
340 const unsigned char *in, unsigned long long inlen, const harakactx *state) {
341 unsigned long long i;
342 unsigned char s[64];
343 unsigned char d[32];
344
345 for (i = 0; i < 64; i++) {
346 s[i] = 0;
347 }
348 haraka_S_absorb(s, in, inlen, 0x1F, state);
349
350 haraka_S_squeezeblocks(out, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
351 out += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
352
353 if (outlen % HARAKAS_RATE) {
354 haraka_S_squeezeblocks(d, 1, s, HARAKAS_RATE, state);
355 for (i = 0; i < outlen % HARAKAS_RATE; i++) {
356 out[i] = d[i];
357 }
358 }
359 }
360
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_Sx4(unsigned char * out0,unsigned char * out1,unsigned char * out2,unsigned char * out3,unsigned long long outlen,const unsigned char * in0,const unsigned char * in1,const unsigned char * in2,const unsigned char * in3,unsigned long long inlen,const harakactx * state)361 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka_Sx4(unsigned char *out0,
362 unsigned char *out1,
363 unsigned char *out2,
364 unsigned char *out3,
365 unsigned long long outlen,
366 const unsigned char *in0,
367 const unsigned char *in1,
368 const unsigned char *in2,
369 const unsigned char *in3,
370 unsigned long long inlen,
371 const harakactx *state) {
372 unsigned long long i;
373 unsigned char s[64 * 4];
374 unsigned char d0[32];
375 unsigned char d1[32];
376 unsigned char d2[32];
377 unsigned char d3[32];
378
379 for (i = 0; i < 64 * 4; i++) {
380 s[i] = 0;
381 }
382 haraka_S_absorb4x(s, in0, in1, in2, in3, inlen, 0x1F, state);
383
384 haraka_S_squeezeblocks4x(out0, out1, out2, out3, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
385 out0 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
386 out1 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
387 out2 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
388 out3 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
389
390 if (outlen % HARAKAS_RATE) {
391 haraka_S_squeezeblocks4x(d0, d1, d2, d3, 1, s, HARAKAS_RATE, state);
392 for (i = 0; i < outlen % HARAKAS_RATE; i++) {
393 out0[i] = d0[i];
394 out1[i] = d1[i];
395 out2[i] = d2[i];
396 out3[i] = d3[i];
397 }
398 }
399 }
400
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(unsigned char * out,const unsigned char * in,const harakactx * state)401 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm(unsigned char *out, const unsigned char *in, const harakactx *state) {
402 u128 s[4], tmp;
403
404 s[0] = LOAD(in);
405 s[1] = LOAD(in + 16);
406 s[2] = LOAD(in + 32);
407 s[3] = LOAD(in + 48);
408
409 AES4(s[0], s[1], s[2], s[3], state->rc);
410 MIX4(s[0], s[1], s[2], s[3]);
411
412 AES4(s[0], s[1], s[2], s[3], state->rc + 8);
413 MIX4(s[0], s[1], s[2], s[3]);
414
415 AES4(s[0], s[1], s[2], s[3], state->rc + 16);
416 MIX4(s[0], s[1], s[2], s[3]);
417
418 AES4(s[0], s[1], s[2], s[3], state->rc + 24);
419 MIX4(s[0], s[1], s[2], s[3]);
420
421 AES4(s[0], s[1], s[2], s[3], state->rc + 32);
422 MIX4(s[0], s[1], s[2], s[3]);
423
424 STORE(out, s[0]);
425 STORE(out + 16, s[1]);
426 STORE(out + 32, s[2]);
427 STORE(out + 48, s[3]);
428 }
429
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm_x4(unsigned char * out,const unsigned char * in,const harakactx * state)430 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512_perm_x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
431 u128 s[4][4], tmp;
432
433 s[0][0] = LOAD(in);
434 s[0][1] = LOAD(in + 16);
435 s[0][2] = LOAD(in + 32);
436 s[0][3] = LOAD(in + 48);
437 s[1][0] = LOAD(in + 64);
438 s[1][1] = LOAD(in + 80);
439 s[1][2] = LOAD(in + 96);
440 s[1][3] = LOAD(in + 112);
441 s[2][0] = LOAD(in + 128);
442 s[2][1] = LOAD(in + 144);
443 s[2][2] = LOAD(in + 160);
444 s[2][3] = LOAD(in + 176);
445 s[3][0] = LOAD(in + 192);
446 s[3][1] = LOAD(in + 208);
447 s[3][2] = LOAD(in + 224);
448 s[3][3] = LOAD(in + 240);
449
450 AES4_4x(s[0], s[1], s[2], s[3], state->rc);
451 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
452 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
453 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
454 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
455
456 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
457 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
458 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
459 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
460 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
461
462 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
463 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
464 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
465 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
466 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
467
468 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
469 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
470 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
471 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
472 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
473
474 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
475 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
476 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
477 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
478 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
479
480 STORE(out, s[0][0]);
481 STORE(out + 16, s[0][1]);
482 STORE(out + 32, s[0][2]);
483 STORE(out + 48, s[0][3]);
484 STORE(out + 64, s[1][0]);
485 STORE(out + 80, s[1][1]);
486 STORE(out + 96, s[1][2]);
487 STORE(out + 112, s[1][3]);
488 STORE(out + 128, s[2][0]);
489 STORE(out + 144, s[2][1]);
490 STORE(out + 160, s[2][2]);
491 STORE(out + 176, s[2][3]);
492 STORE(out + 192, s[3][0]);
493 STORE(out + 208, s[3][1]);
494 STORE(out + 224, s[3][2]);
495 STORE(out + 240, s[3][3]);
496 }
497
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512(unsigned char * out,const unsigned char * in,const harakactx * state)498 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512(unsigned char *out, const unsigned char *in, const harakactx *state) {
499 u128 s[4], tmp;
500
501 s[0] = LOAD(in);
502 s[1] = LOAD(in + 16);
503 s[2] = LOAD(in + 32);
504 s[3] = LOAD(in + 48);
505
506 AES4(s[0], s[1], s[2], s[3], state->rc);
507 MIX4(s[0], s[1], s[2], s[3]);
508
509 AES4(s[0], s[1], s[2], s[3], state->rc + 8);
510 MIX4(s[0], s[1], s[2], s[3]);
511
512 AES4(s[0], s[1], s[2], s[3], state->rc + 16);
513 MIX4(s[0], s[1], s[2], s[3]);
514
515 AES4(s[0], s[1], s[2], s[3], state->rc + 24);
516 MIX4(s[0], s[1], s[2], s[3]);
517
518 AES4(s[0], s[1], s[2], s[3], state->rc + 32);
519 MIX4(s[0], s[1], s[2], s[3]);
520
521 s[0] = XOR128(s[0], LOAD(in));
522 s[1] = XOR128(s[1], LOAD(in + 16));
523 s[2] = XOR128(s[2], LOAD(in + 32));
524 s[3] = XOR128(s[3], LOAD(in + 48));
525
526 // truncate and store result
527 TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
528 }
529
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512x4(unsigned char * out,const unsigned char * in,const harakactx * state)530 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka512x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
531 u128 s[4][4], tmp;
532
533 s[0][0] = LOAD(in);
534 s[0][1] = LOAD(in + 16);
535 s[0][2] = LOAD(in + 32);
536 s[0][3] = LOAD(in + 48);
537 s[1][0] = LOAD(in + 64);
538 s[1][1] = LOAD(in + 80);
539 s[1][2] = LOAD(in + 96);
540 s[1][3] = LOAD(in + 112);
541 s[2][0] = LOAD(in + 128);
542 s[2][1] = LOAD(in + 144);
543 s[2][2] = LOAD(in + 160);
544 s[2][3] = LOAD(in + 176);
545 s[3][0] = LOAD(in + 192);
546 s[3][1] = LOAD(in + 208);
547 s[3][2] = LOAD(in + 224);
548 s[3][3] = LOAD(in + 240);
549
550 AES4_4x(s[0], s[1], s[2], s[3], state->rc);
551 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
552 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
553 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
554 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
555
556 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
557 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
558 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
559 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
560 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
561
562 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
563 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
564 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
565 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
566 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
567
568 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
569 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
570 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
571 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
572 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
573
574 AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
575 MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
576 MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
577 MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
578 MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
579
580 s[0][0] = XOR128(s[0][0], LOAD(in));
581 s[0][1] = XOR128(s[0][1], LOAD(in + 16));
582 s[0][2] = XOR128(s[0][2], LOAD(in + 32));
583 s[0][3] = XOR128(s[0][3], LOAD(in + 48));
584 s[1][0] = XOR128(s[1][0], LOAD(in + 64));
585 s[1][1] = XOR128(s[1][1], LOAD(in + 80));
586 s[1][2] = XOR128(s[1][2], LOAD(in + 96));
587 s[1][3] = XOR128(s[1][3], LOAD(in + 112));
588 s[2][0] = XOR128(s[2][0], LOAD(in + 128));
589 s[2][1] = XOR128(s[2][1], LOAD(in + 144));
590 s[2][2] = XOR128(s[2][2], LOAD(in + 160));
591 s[2][3] = XOR128(s[2][3], LOAD(in + 176));
592 s[3][0] = XOR128(s[3][0], LOAD(in + 192));
593 s[3][1] = XOR128(s[3][1], LOAD(in + 208));
594 s[3][2] = XOR128(s[3][2], LOAD(in + 224));
595 s[3][3] = XOR128(s[3][3], LOAD(in + 240));
596
597 TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
598 TRUNCSTORE((out + 32), s[1][0], s[1][1], s[1][2], s[1][3]);
599 TRUNCSTORE((out + 64), s[2][0], s[2][1], s[2][2], s[2][3]);
600 TRUNCSTORE((out + 96), s[3][0], s[3][1], s[3][2], s[3][3]);
601 }
602
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256(unsigned char * out,const unsigned char * in,const harakactx * state)603 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256(unsigned char *out, const unsigned char *in, const harakactx *state) {
604 u128 s[2], tmp;
605
606 s[0] = LOAD(in);
607 s[1] = LOAD(in + 16);
608
609 AES2(s[0], s[1], state->rc);
610 MIX2(s[0], s[1]);
611
612 AES2(s[0], s[1], state->rc + 4);
613 MIX2(s[0], s[1]);
614
615 AES2(s[0], s[1], state->rc + 8);
616 MIX2(s[0], s[1]);
617
618 AES2(s[0], s[1], state->rc + 12);
619 MIX2(s[0], s[1]);
620
621 AES2(s[0], s[1], state->rc + 16);
622 MIX2(s[0], s[1]);
623
624 s[0] = XOR128(s[0], LOAD(in));
625 s[1] = XOR128(s[1], LOAD(in + 16));
626
627 STORE(out, s[0]);
628 STORE(out + 16, s[1]);
629 }
630
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256x4(unsigned char * out,const unsigned char * in,const harakactx * state)631 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
632 u128 s[4][2], tmp;
633
634 s[0][0] = LOAD(in);
635 s[0][1] = LOAD(in + 16);
636 s[1][0] = LOAD(in + 32);
637 s[1][1] = LOAD(in + 48);
638 s[2][0] = LOAD(in + 64);
639 s[2][1] = LOAD(in + 80);
640 s[3][0] = LOAD(in + 96);
641 s[3][1] = LOAD(in + 112);
642
643 // Round 1
644 AES2_4x(s[0], s[1], s[2], s[3], state->rc);
645
646 MIX2(s[0][0], s[0][1]);
647 MIX2(s[1][0], s[1][1]);
648 MIX2(s[2][0], s[2][1]);
649 MIX2(s[3][0], s[3][1]);
650
651 // Round 2
652 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 4);
653
654 MIX2(s[0][0], s[0][1]);
655 MIX2(s[1][0], s[1][1]);
656 MIX2(s[2][0], s[2][1]);
657 MIX2(s[3][0], s[3][1]);
658
659 // Round 3
660 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 8);
661
662 MIX2(s[0][0], s[0][1]);
663 MIX2(s[1][0], s[1][1]);
664 MIX2(s[2][0], s[2][1]);
665 MIX2(s[3][0], s[3][1]);
666
667 // Round 4
668 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 12);
669
670 MIX2(s[0][0], s[0][1]);
671 MIX2(s[1][0], s[1][1]);
672 MIX2(s[2][0], s[2][1]);
673 MIX2(s[3][0], s[3][1]);
674
675 // Round 5
676 AES2_4x(s[0], s[1], s[2], s[3], state->rc + 16);
677
678 MIX2(s[0][0], s[0][1]);
679 MIX2(s[1][0], s[1][1]);
680 MIX2(s[2][0], s[2][1]);
681 MIX2(s[3][0], s[3][1]);
682
683 // Feed Forward
684 s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
685 s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
686 s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
687 s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
688 s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
689 s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
690 s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
691 s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
692
693 STORE(out, s[0][0]);
694 STORE(out + 16, s[0][1]);
695 STORE(out + 32, s[1][0]);
696 STORE(out + 48, s[1][1]);
697 STORE(out + 64, s[2][0]);
698 STORE(out + 80, s[2][1]);
699 STORE(out + 96, s[3][0]);
700 STORE(out + 112, s[3][1]);
701 }
702
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256_sk(unsigned char * out,const unsigned char * in,const harakactx * state)703 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256_sk(unsigned char *out, const unsigned char *in, const harakactx *state) {
704 u128 s[2], tmp;
705
706 s[0] = LOAD(in);
707 s[1] = LOAD(in + 16);
708
709 AES2(s[0], s[1], state->rc_sseed);
710 MIX2(s[0], s[1]);
711
712 AES2(s[0], s[1], state->rc_sseed + 4);
713 MIX2(s[0], s[1]);
714
715 AES2(s[0], s[1], state->rc_sseed + 8);
716 MIX2(s[0], s[1]);
717
718 AES2(s[0], s[1], state->rc_sseed + 12);
719 MIX2(s[0], s[1]);
720
721 AES2(s[0], s[1], state->rc_sseed + 16);
722 MIX2(s[0], s[1]);
723
724 s[0] = XOR128(s[0], LOAD(in));
725 s[1] = XOR128(s[1], LOAD(in + 16));
726
727 STORE(out, s[0]);
728 STORE(out + 16, s[1]);
729 }
730
PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256_skx4(unsigned char * out,const unsigned char * in,const harakactx * state)731 void PQCLEAN_SPHINCSHARAKA256SSIMPLE_AESNI_haraka256_skx4(unsigned char *out, const unsigned char *in, const harakactx *state) {
732 u128 s[4][2], tmp;
733
734 s[0][0] = LOAD(in);
735 s[0][1] = LOAD(in + 16);
736 s[1][0] = LOAD(in + 32);
737 s[1][1] = LOAD(in + 48);
738 s[2][0] = LOAD(in + 64);
739 s[2][1] = LOAD(in + 80);
740 s[3][0] = LOAD(in + 96);
741 s[3][1] = LOAD(in + 112);
742
743 // Round 1
744 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed);
745
746 MIX2(s[0][0], s[0][1]);
747 MIX2(s[1][0], s[1][1]);
748 MIX2(s[2][0], s[2][1]);
749 MIX2(s[3][0], s[3][1]);
750
751 // Round 2
752 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 4);
753
754 MIX2(s[0][0], s[0][1]);
755 MIX2(s[1][0], s[1][1]);
756 MIX2(s[2][0], s[2][1]);
757 MIX2(s[3][0], s[3][1]);
758
759 // Round 3
760 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 8);
761
762 MIX2(s[0][0], s[0][1]);
763 MIX2(s[1][0], s[1][1]);
764 MIX2(s[2][0], s[2][1]);
765 MIX2(s[3][0], s[3][1]);
766
767 // Round 4
768 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 12);
769
770 MIX2(s[0][0], s[0][1]);
771 MIX2(s[1][0], s[1][1]);
772 MIX2(s[2][0], s[2][1]);
773 MIX2(s[3][0], s[3][1]);
774
775 // Round 5
776 AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 16);
777
778 MIX2(s[0][0], s[0][1]);
779 MIX2(s[1][0], s[1][1]);
780 MIX2(s[2][0], s[2][1]);
781 MIX2(s[3][0], s[3][1]);
782
783 // Feed Forward
784 s[0][0] = XOR128(s[0][0], LOAD(in));
785 s[0][1] = XOR128(s[0][1], LOAD(in + 16));
786 s[1][0] = XOR128(s[1][0], LOAD(in + 32));
787 s[1][1] = XOR128(s[1][1], LOAD(in + 48));
788 s[2][0] = XOR128(s[2][0], LOAD(in + 64));
789 s[2][1] = XOR128(s[2][1], LOAD(in + 80));
790 s[3][0] = XOR128(s[3][0], LOAD(in + 96));
791 s[3][1] = XOR128(s[3][1], LOAD(in + 112));
792
793 STORE(out, s[0][0]);
794 STORE(out + 16, s[0][1]);
795 STORE(out + 32, s[1][0]);
796 STORE(out + 48, s[1][1]);
797 STORE(out + 64, s[2][0]);
798 STORE(out + 80, s[2][1]);
799 STORE(out + 96, s[3][0]);
800 STORE(out + 112, s[3][1]);
801 }
802