1 /*
2  * Constant time implementation of the Haraka hash function.
3  *
4  * The bit-sliced implementation of the AES round functions are
5  * based on the AES implementation in BearSSL written
6  * by Thomas Pornin <pornin@bolet.org>
7  */
8 
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "haraka.h"
15 
/*
 * Rate of the haraka_S sponge in bytes: the number of state bytes that are
 * XORed with input, or copied to output, per call to the 512-bit permutation.
 */
#define HARAKAS_RATE 32
17 
/*
 * Default Haraka-512 round constants: 10 rows of 8 64-bit words (640 bytes).
 *
 * tweak_constants() copies this table unmodified into
 * state->tweaked512_rc64, whose rows haraka512_perm() XORs into the state
 * through add_round_key(). The values are therefore stored in the form that
 * add_round_key() consumes directly; they do not pass through
 * interleave_constant().
 */
static const uint64_t haraka512_rc64[10][8] = {
    {0x24cf0ab9086f628b, 0xbdd6eeecc83b8382, 0xd96fb0306cdad0a7, 0xaace082ac8f95f89, 0x449d8e8870d7041f, 0x49bb2f80b2b3e2f8, 0x0569ae98d93bb258, 0x23dc9691e7d6a4b1},
    {0xd8ba10ede0fe5b6e, 0x7ecf7dbe424c7b8e, 0x6ea9949c6df62a31, 0xbf3f3c97ec9c313e, 0x241d03a196a1861e, 0xead3a51116e5a2ea, 0x77d479fcad9574e3, 0x18657a1af894b7a0},
    {0x10671e1a7f595522, 0xd9a00ff675d28c7b, 0x2f1edf0d2b9ba661, 0xb8ff58b8e3de45f9, 0xee29261da9865c02, 0xd1532aa4b50bdf43, 0x8bf858159b231bb1, 0xdf17439d22d4f599},
    {0xdd4b2f0870b918c0, 0x757a81f3b39b1bb6, 0x7a5c556898952e3f, 0x7dd70a16d915d87a, 0x3ae61971982b8301, 0xc3ab319e030412be, 0x17c0033ac094a8cb, 0x5a0630fc1a8dc4ef},
    {0x17708988c1632f73, 0xf92ddae090b44f4f, 0x11ac0285c43aa314, 0x509059941936b8ba, 0xd03e152fa2ce9b69, 0x3fbcbcb63a32998b, 0x6204696d692254f7, 0x915542ed93ec59b4},
    {0xf4ed94aa8879236e, 0xff6cb41cd38e03c0, 0x069b38602368aeab, 0x669495b820f0ddba, 0xf42013b1b8bf9e3d, 0xcf935efe6439734d, 0xbc1dcf42ca29e3f8, 0x7e6d3ed29f78ad67},
    {0xf3b0f6837ffcddaa, 0x3a76faef934ddf41, 0xcec7ae583a9c8e35, 0xe4dd18c68f0260af, 0x2c0e5df1ad398eaa, 0x478df5236ae22e8c, 0xfb944c46fe865f39, 0xaa48f82f028132ba},
    {0x231b9ae2b76aca77, 0x292a76a712db0b40, 0x5850625dc8134491, 0x73137dd469810fb5, 0x8a12a6a202a474fd, 0xd36fd9daa78bdb80, 0xb34c5e733505706f, 0xbaf1cdca818d9d96},
    {0x2e99781335e8c641, 0xbddfe5cce47d560e, 0xf74e9bf32e5e040c, 0x1d7a709d65996be9, 0x670df36a9cf66cdd, 0xd05ef84a176a2875, 0x0f888e828cb1c44e, 0x1a79e9c9727b052c},
    {0x83497348628d84de, 0x2e9387d51f22a754, 0xb000068da2f852d6, 0x378c9e1190fd6fe5, 0x870027c316de7293, 0xe51a9d4462e047bb, 0x90ecf7f8c6251195, 0x655953bfbed90a9c},
};
30 
/*
 * Decode one 32-bit unsigned integer from four bytes stored in
 * little-endian order (src[0] is the least significant byte).
 */
static inline uint32_t br_dec32le(const unsigned char *src) {
    uint32_t value = 0;
    int k;

    /* Fold the bytes in from the most significant (src[3]) down to src[0]. */
    for (k = 3; k >= 0; k--) {
        value = (value << 8) | (uint32_t)src[k];
    }
    return value;
}
37 
/*
 * Decode num consecutive little-endian 32-bit words from src into v.
 * Reads 4 * num bytes from src and writes num words to v.
 */
static void br_range_dec32le(uint32_t *v, size_t num, const unsigned char *src) {
    size_t k;

    for (k = 0; k < num; k++) {
        v[k] = br_dec32le(src + 4 * k);
    }
}
44 
/*
 * Encode a 32-bit unsigned integer into four bytes in little-endian order
 * (dst[0] receives the least significant byte).
 */
static inline void br_enc32le(unsigned char *dst, uint32_t x) {
    int k;

    for (k = 0; k < 4; k++) {
        /* The cast keeps only the low 8 bits of the shifted value. */
        dst[k] = (unsigned char)(x >> (8 * k));
    }
}
51 
52 
/*
 * Encode num 32-bit words from v into dst as consecutive little-endian
 * 4-byte groups. Writes 4 * num bytes to dst.
 */
static void br_range_enc32le(unsigned char *dst, const uint32_t *v, size_t num) {
    size_t k;

    for (k = 0; k < num; k++) {
        br_enc32le(dst + 4 * k, v[k]);
    }
}
59 
/*
 * Apply the S-box circuit to a bitsliced state of eight 64-bit words,
 * in place.
 *
 * q[0] carries the low bit and q[7] the high bit of every S-box input;
 * on return each q[i] holds the corresponding bit of the outputs. The body
 * is a fixed sequence of XOR, AND and NOT operations on whole words, with
 * no branches and no table lookups.
 */
static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
    /*
     * This S-box implementation is a straightforward translation of
     * the circuit described by Boyar and Peralta in "A new
     * combinational logic minimization technique with applications
     * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
     *
     * Note that variables x* (input) and s* (output) are numbered
     * in "reverse" order (x0 is the high bit, x7 is the low bit).
     */

    uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
    uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
    uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
    uint64_t y20, y21;
    uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
    uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
    uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
    uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
    uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
    uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
    uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
    uint64_t s0, s1, s2, s3, s4, s5, s6, s7;

    /* Load the inputs with the reversed numbering: x0 <- q[7] ... x7 <- q[0]. */
    x0 = q[7];
    x1 = q[6];
    x2 = q[5];
    x3 = q[4];
    x4 = q[3];
    x5 = q[2];
    x6 = q[1];
    x7 = q[0];

    /*
     * Top linear transformation.
     */
    y14 = x3 ^ x5;
    y13 = x0 ^ x6;
    y9 = x0 ^ x3;
    y8 = x0 ^ x5;
    t0 = x1 ^ x2;
    y1 = t0 ^ x7;
    y4 = y1 ^ x3;
    y12 = y13 ^ y14;
    y2 = y1 ^ x0;
    y5 = y1 ^ x6;
    y3 = y5 ^ y8;
    t1 = x4 ^ y12;
    y15 = t1 ^ x5;
    y20 = t1 ^ x1;
    y6 = y15 ^ x7;
    y10 = y15 ^ t0;
    y11 = y20 ^ y9;
    y7 = x7 ^ y11;
    y17 = y10 ^ y11;
    y19 = y10 ^ y8;
    y16 = t0 ^ y11;
    y21 = y13 ^ y16;
    y18 = x0 ^ y16;

    /*
     * Non-linear section.
     */
    t2 = y12 & y15;
    t3 = y3 & y6;
    t4 = t3 ^ t2;
    t5 = y4 & x7;
    t6 = t5 ^ t2;
    t7 = y13 & y16;
    t8 = y5 & y1;
    t9 = t8 ^ t7;
    t10 = y2 & y7;
    t11 = t10 ^ t7;
    t12 = y9 & y11;
    t13 = y14 & y17;
    t14 = t13 ^ t12;
    t15 = y8 & y10;
    t16 = t15 ^ t12;
    t17 = t4 ^ t14;
    t18 = t6 ^ t16;
    t19 = t9 ^ t14;
    t20 = t11 ^ t16;
    t21 = t17 ^ y20;
    t22 = t18 ^ y19;
    t23 = t19 ^ y21;
    t24 = t20 ^ y18;

    t25 = t21 ^ t22;
    t26 = t21 & t23;
    t27 = t24 ^ t26;
    t28 = t25 & t27;
    t29 = t28 ^ t22;
    t30 = t23 ^ t24;
    t31 = t22 ^ t26;
    t32 = t31 & t30;
    t33 = t32 ^ t24;
    t34 = t23 ^ t33;
    t35 = t27 ^ t33;
    t36 = t24 & t35;
    t37 = t36 ^ t34;
    t38 = t27 ^ t36;
    t39 = t29 & t38;
    t40 = t25 ^ t39;

    t41 = t40 ^ t37;
    t42 = t29 ^ t33;
    t43 = t29 ^ t40;
    t44 = t33 ^ t37;
    t45 = t42 ^ t41;
    z0 = t44 & y15;
    z1 = t37 & y6;
    z2 = t33 & x7;
    z3 = t43 & y16;
    z4 = t40 & y1;
    z5 = t29 & y7;
    z6 = t42 & y11;
    z7 = t45 & y17;
    z8 = t41 & y10;
    z9 = t44 & y12;
    z10 = t37 & y3;
    z11 = t33 & y4;
    z12 = t43 & y13;
    z13 = t40 & y5;
    z14 = t29 & y2;
    z15 = t42 & y9;
    z16 = t45 & y14;
    z17 = t41 & y8;

    /*
     * Bottom linear transformation.
     */
    t46 = z15 ^ z16;
    t47 = z10 ^ z11;
    t48 = z5 ^ z13;
    t49 = z9 ^ z10;
    t50 = z2 ^ z12;
    t51 = z2 ^ z5;
    t52 = z7 ^ z8;
    t53 = z0 ^ z3;
    t54 = z6 ^ z7;
    t55 = z16 ^ z17;
    t56 = z12 ^ t48;
    t57 = t50 ^ t53;
    t58 = z4 ^ t46;
    t59 = z3 ^ t54;
    t60 = t46 ^ t57;
    t61 = z14 ^ t57;
    t62 = t52 ^ t58;
    t63 = t49 ^ t58;
    t64 = z4 ^ t59;
    t65 = t61 ^ t62;
    t66 = z1 ^ t63;
    s0 = t59 ^ t63;
    s6 = t56 ^ ~t62;
    s7 = t48 ^ ~t60;
    t67 = t64 ^ t65;
    s3 = t53 ^ t66;
    s4 = t51 ^ t66;
    s5 = t47 ^ t65;
    s1 = t64 ^ ~s3;
    s2 = t55 ^ ~t67;

    /* Store the outputs with the same reversed numbering: q[7] <- s0 ... q[0] <- s7. */
    q[7] = s0;
    q[6] = s1;
    q[5] = s2;
    q[4] = s3;
    q[3] = s4;
    q[2] = s5;
    q[1] = s6;
    q[0] = s7;
}
233 
/*
 * Apply the S-box circuit to a bitsliced state of eight 32-bit words,
 * in place. This is the same gate sequence as
 * br_aes_ct64_bitslice_Sbox(), operating on uint32_t words.
 *
 * q[0] carries the low bit and q[7] the high bit of every S-box input;
 * on return each q[i] holds the corresponding bit of the outputs. The body
 * is a fixed sequence of XOR, AND and NOT operations on whole words, with
 * no branches and no table lookups.
 */
static void br_aes_ct_bitslice_Sbox(uint32_t *q) {
    /*
     * This S-box implementation is a straightforward translation of
     * the circuit described by Boyar and Peralta in "A new
     * combinational logic minimization technique with applications
     * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
     *
     * Note that variables x* (input) and s* (output) are numbered
     * in "reverse" order (x0 is the high bit, x7 is the low bit).
     */

    uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
    uint32_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
    uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
    uint32_t y20, y21;
    uint32_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
    uint32_t z10, z11, z12, z13, z14, z15, z16, z17;
    uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    uint32_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
    uint32_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    uint32_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
    uint32_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
    uint32_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
    uint32_t t60, t61, t62, t63, t64, t65, t66, t67;
    uint32_t s0, s1, s2, s3, s4, s5, s6, s7;

    /* Load the inputs with the reversed numbering: x0 <- q[7] ... x7 <- q[0]. */
    x0 = q[7];
    x1 = q[6];
    x2 = q[5];
    x3 = q[4];
    x4 = q[3];
    x5 = q[2];
    x6 = q[1];
    x7 = q[0];

    /*
     * Top linear transformation.
     */
    y14 = x3 ^ x5;
    y13 = x0 ^ x6;
    y9 = x0 ^ x3;
    y8 = x0 ^ x5;
    t0 = x1 ^ x2;
    y1 = t0 ^ x7;
    y4 = y1 ^ x3;
    y12 = y13 ^ y14;
    y2 = y1 ^ x0;
    y5 = y1 ^ x6;
    y3 = y5 ^ y8;
    t1 = x4 ^ y12;
    y15 = t1 ^ x5;
    y20 = t1 ^ x1;
    y6 = y15 ^ x7;
    y10 = y15 ^ t0;
    y11 = y20 ^ y9;
    y7 = x7 ^ y11;
    y17 = y10 ^ y11;
    y19 = y10 ^ y8;
    y16 = t0 ^ y11;
    y21 = y13 ^ y16;
    y18 = x0 ^ y16;

    /*
     * Non-linear section.
     */
    t2 = y12 & y15;
    t3 = y3 & y6;
    t4 = t3 ^ t2;
    t5 = y4 & x7;
    t6 = t5 ^ t2;
    t7 = y13 & y16;
    t8 = y5 & y1;
    t9 = t8 ^ t7;
    t10 = y2 & y7;
    t11 = t10 ^ t7;
    t12 = y9 & y11;
    t13 = y14 & y17;
    t14 = t13 ^ t12;
    t15 = y8 & y10;
    t16 = t15 ^ t12;
    t17 = t4 ^ t14;
    t18 = t6 ^ t16;
    t19 = t9 ^ t14;
    t20 = t11 ^ t16;
    t21 = t17 ^ y20;
    t22 = t18 ^ y19;
    t23 = t19 ^ y21;
    t24 = t20 ^ y18;

    t25 = t21 ^ t22;
    t26 = t21 & t23;
    t27 = t24 ^ t26;
    t28 = t25 & t27;
    t29 = t28 ^ t22;
    t30 = t23 ^ t24;
    t31 = t22 ^ t26;
    t32 = t31 & t30;
    t33 = t32 ^ t24;
    t34 = t23 ^ t33;
    t35 = t27 ^ t33;
    t36 = t24 & t35;
    t37 = t36 ^ t34;
    t38 = t27 ^ t36;
    t39 = t29 & t38;
    t40 = t25 ^ t39;

    t41 = t40 ^ t37;
    t42 = t29 ^ t33;
    t43 = t29 ^ t40;
    t44 = t33 ^ t37;
    t45 = t42 ^ t41;
    z0 = t44 & y15;
    z1 = t37 & y6;
    z2 = t33 & x7;
    z3 = t43 & y16;
    z4 = t40 & y1;
    z5 = t29 & y7;
    z6 = t42 & y11;
    z7 = t45 & y17;
    z8 = t41 & y10;
    z9 = t44 & y12;
    z10 = t37 & y3;
    z11 = t33 & y4;
    z12 = t43 & y13;
    z13 = t40 & y5;
    z14 = t29 & y2;
    z15 = t42 & y9;
    z16 = t45 & y14;
    z17 = t41 & y8;

    /*
     * Bottom linear transformation.
     */
    t46 = z15 ^ z16;
    t47 = z10 ^ z11;
    t48 = z5 ^ z13;
    t49 = z9 ^ z10;
    t50 = z2 ^ z12;
    t51 = z2 ^ z5;
    t52 = z7 ^ z8;
    t53 = z0 ^ z3;
    t54 = z6 ^ z7;
    t55 = z16 ^ z17;
    t56 = z12 ^ t48;
    t57 = t50 ^ t53;
    t58 = z4 ^ t46;
    t59 = z3 ^ t54;
    t60 = t46 ^ t57;
    t61 = z14 ^ t57;
    t62 = t52 ^ t58;
    t63 = t49 ^ t58;
    t64 = z4 ^ t59;
    t65 = t61 ^ t62;
    t66 = z1 ^ t63;
    s0 = t59 ^ t63;
    s6 = t56 ^ ~t62;
    s7 = t48 ^ ~t60;
    t67 = t64 ^ t65;
    s3 = t53 ^ t66;
    s4 = t51 ^ t66;
    s5 = t47 ^ t65;
    s1 = t64 ^ ~s3;
    s2 = t55 ^ ~t67;

    /* Store the outputs with the same reversed numbering: q[7] <- s0 ... q[0] <- s7. */
    q[7] = s0;
    q[6] = s1;
    q[5] = s2;
    q[4] = s3;
    q[3] = s4;
    q[2] = s5;
    q[1] = s6;
    q[0] = s7;
}
407 
/*
 * In-place bit-matrix transposition of eight 32-bit words.
 *
 * Three passes are made, with distance 1, 2 and 4. In the pass with
 * distance d, every pair (q[k], q[k + d]) whose index k has bit d clear
 * exchanges bit groups: the low-mask bits of q[k + d] move up by d into
 * q[k], and the high-mask bits of q[k] move down by d into q[k + d].
 * The value d is used both as the word distance and as the shift count.
 */
static void br_aes_ct_ortho(uint32_t *q) {
    static const uint32_t low_mask[3] = {
        0x55555555, 0x33333333, 0x0F0F0F0F
    };
    unsigned level;

    for (level = 0; level < 3; level++) {
        const unsigned dist = 1u << level;
        const uint32_t lo = low_mask[level];
        const uint32_t hi = ~lo;
        unsigned k;

        for (k = 0; k < 8; k++) {
            uint32_t a, b;

            if (k & dist) {
                /* q[k] is the upper partner of an earlier pair. */
                continue;
            }
            a = q[k];
            b = q[k + dist];
            q[k] = (a & lo) | ((b & lo) << dist);
            q[k + dist] = ((a & hi) >> dist) | (b & hi);
        }
    }
}
436 
/*
 * XOR the eight round-key words sk[0..7] into the eight state words
 * q[0..7], in place.
 */
static inline void add_round_key32(uint32_t *q, const uint32_t *sk) {
    int k;

    for (k = 0; k < 8; k++) {
        q[k] ^= sk[k];
    }
}
447 
/*
 * ShiftRows step on the bitsliced 32-bit state, applied in place to each
 * of the eight words q[0..7]. Within every word the four bytes are
 * treated separately: the lowest byte is kept, and the remaining three
 * are rotated by 2, 4 and 6 bit positions respectively.
 */
static inline void shift_rows32(uint32_t *q) {
    uint32_t *p;

    for (p = q; p != q + 8; p++) {
        const uint32_t w = *p;
        uint32_t r;

        /* Bits 0-7: unchanged. */
        r = w & 0x000000FF;
        /* Bits 8-15: rotate right by 2 within the byte. */
        r |= (w & 0x0000FC00) >> 2;
        r |= (w & 0x00000300) << 6;
        /* Bits 16-23: swap the two nibbles. */
        r |= (w & 0x00F00000) >> 4;
        r |= (w & 0x000F0000) << 4;
        /* Bits 24-31: rotate right by 6 within the byte. */
        r |= (w & 0xC0000000) >> 6;
        r |= (w & 0x3F000000) << 2;

        *p = r;
    }
}
461 
/* Rotate a 32-bit word by 16 bits, i.e. swap its two 16-bit halves. */
static inline uint32_t rotr16(uint32_t x) {
    const uint32_t upper_half = x >> 16;
    const uint32_t lower_half = x << 16;

    return lower_half | upper_half;
}
465 
/*
 * MixColumns step on the bitsliced 32-bit state q[0..7], in place.
 *
 * For every word, r[k] is q[k] rotated right by 8 bits and t[k] is
 * q[k] ^ r[k]. All sixteen values are computed from the input before any
 * output word is written, because each output depends on the original
 * value of a neighbouring word (and several also on word 7).
 */
static inline void mix_columns32(uint32_t *q) {
    uint32_t r[8];
    uint32_t t[8];
    int k;

    for (k = 0; k < 8; k++) {
        r[k] = (q[k] >> 8) | (q[k] << 24);
        t[k] = q[k] ^ r[k];
    }

    /* Words 0, 1, 3 and 4 additionally fold in t[7]. */
    q[0] = t[7] ^ r[0] ^ rotr16(t[0]);
    q[1] = t[0] ^ t[7] ^ r[1] ^ rotr16(t[1]);
    q[2] = t[1] ^ r[2] ^ rotr16(t[2]);
    q[3] = t[2] ^ t[7] ^ r[3] ^ rotr16(t[3]);
    q[4] = t[3] ^ t[7] ^ r[4] ^ rotr16(t[4]);
    q[5] = t[4] ^ r[5] ^ rotr16(t[5]);
    q[6] = t[5] ^ r[6] ^ rotr16(t[6]);
    q[7] = t[6] ^ r[7] ^ rotr16(t[7]);
}
496 
/*
 * In-place bit-matrix transposition of eight 64-bit words.
 *
 * Three passes are made, with distance 1, 2 and 4. In the pass with
 * distance d, every pair (q[k], q[k + d]) whose index k has bit d clear
 * exchanges bit groups: the low-mask bits of q[k + d] move up by d into
 * q[k], and the high-mask bits of q[k] move down by d into q[k + d].
 * The value d is used both as the word distance and as the shift count.
 */
static void br_aes_ct64_ortho(uint64_t *q) {
    static const uint64_t low_mask[3] = {
        (uint64_t)0x5555555555555555,
        (uint64_t)0x3333333333333333,
        (uint64_t)0x0F0F0F0F0F0F0F0F
    };
    unsigned level;

    for (level = 0; level < 3; level++) {
        const unsigned dist = 1u << level;
        const uint64_t lo = low_mask[level];
        const uint64_t hi = ~lo;
        unsigned k;

        for (k = 0; k < 8; k++) {
            uint64_t a, b;

            if (k & dist) {
                /* q[k] is the upper partner of an earlier pair. */
                continue;
            }
            a = q[k];
            b = q[k + dist];
            q[k] = (a & lo) | ((b & lo) << dist);
            q[k + dist] = ((a & hi) >> dist) | (b & hi);
        }
    }
}
525 
526 
/*
 * Pack four 32-bit words w[0..3] into two 64-bit words.
 *
 * Each input word is first spread so that its four bytes occupy the even
 * byte positions of a 64-bit value. *q0 then receives w[0] in the even
 * bytes and w[2] in the odd bytes; *q1 receives w[1] and w[3] likewise.
 */
static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
    uint64_t spread[4];
    int k;

    for (k = 0; k < 4; k++) {
        uint64_t v = w[k];

        /* Separate the two 16-bit halves, then the bytes within each half. */
        v = (v | (v << 16)) & (uint64_t)0x0000FFFF0000FFFF;
        v = (v | (v << 8)) & (uint64_t)0x00FF00FF00FF00FF;
        spread[k] = v;
    }

    *q0 = spread[0] | (spread[2] << 8);
    *q1 = spread[1] | (spread[3] << 8);
}
553 
554 
/*
 * Unpack two 64-bit words into four 32-bit words w[0..3]; the inverse of
 * br_aes_ct64_interleave_in().
 *
 * w[0] and w[2] are gathered from the even and odd bytes of q0, w[1] and
 * w[3] from the even and odd bytes of q1.
 */
static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    /* Shifting right by 8 brings the odd bytes into the even positions. */
    const uint64_t lanes[4] = { q0, q1, q0 >> 8, q1 >> 8 };
    int k;

    for (k = 0; k < 4; k++) {
        uint64_t v = lanes[k] & (uint64_t)0x00FF00FF00FF00FF;

        /* Close the gaps between bytes, then between 16-bit halves. */
        v = (v | (v >> 8)) & (uint64_t)0x0000FFFF0000FFFF;
        w[k] = (uint32_t)v | (uint32_t)(v >> 16);
    }
}
575 
/*
 * XOR the eight round-key words sk[0..7] into the eight state words
 * q[0..7], in place.
 */
static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    int k;

    for (k = 0; k < 8; k++) {
        q[k] ^= sk[k];
    }
}
586 
/*
 * ShiftRows step on the bitsliced 64-bit state, applied in place to each
 * of the eight words q[0..7]. Within every word the four 16-bit groups
 * are treated separately: the lowest group is kept, and the remaining
 * three are rotated by 4, 8 and 12 bit positions respectively.
 */
static inline void shift_rows(uint64_t *q) {
    uint64_t *p;

    for (p = q; p != q + 8; p++) {
        const uint64_t w = *p;
        uint64_t r;

        /* Bits 0-15: unchanged. */
        r = w & (uint64_t)0x000000000000FFFF;
        /* Bits 16-31: rotate right by 4 within the group. */
        r |= (w & (uint64_t)0x00000000FFF00000) >> 4;
        r |= (w & (uint64_t)0x00000000000F0000) << 12;
        /* Bits 32-47: swap the two bytes. */
        r |= (w & (uint64_t)0x0000FF0000000000) >> 8;
        r |= (w & (uint64_t)0x000000FF00000000) << 8;
        /* Bits 48-63: rotate right by 12 within the group. */
        r |= (w & (uint64_t)0xF000000000000000) >> 12;
        r |= (w & (uint64_t)0x0FFF000000000000) << 4;

        *p = r;
    }
}
603 
/* Rotate a 64-bit word by 32 bits, i.e. swap its two 32-bit halves. */
static inline uint64_t rotr32(uint64_t x) {
    const uint64_t upper_half = x >> 32;
    const uint64_t lower_half = x << 32;

    return lower_half | upper_half;
}
607 
/*
 * MixColumns step on the bitsliced 64-bit state q[0..7], in place.
 *
 * For every word, r[k] is q[k] rotated right by 16 bits and t[k] is
 * q[k] ^ r[k]. All sixteen values are computed from the input before any
 * output word is written, because each output depends on the original
 * value of a neighbouring word (and several also on word 7).
 */
static inline void mix_columns(uint64_t *q) {
    uint64_t r[8];
    uint64_t t[8];
    int k;

    for (k = 0; k < 8; k++) {
        r[k] = (q[k] >> 16) | (q[k] << 48);
        t[k] = q[k] ^ r[k];
    }

    /* Words 0, 1, 3 and 4 additionally fold in t[7]. */
    q[0] = t[7] ^ r[0] ^ rotr32(t[0]);
    q[1] = t[0] ^ t[7] ^ r[1] ^ rotr32(t[1]);
    q[2] = t[1] ^ r[2] ^ rotr32(t[2]);
    q[3] = t[2] ^ t[7] ^ r[3] ^ rotr32(t[3]);
    q[4] = t[3] ^ t[7] ^ r[4] ^ rotr32(t[4]);
    q[5] = t[4] ^ r[5] ^ rotr32(t[5]);
    q[6] = t[5] ^ r[6] ^ rotr32(t[6]);
    q[7] = t[6] ^ r[7] ^ rotr32(t[7]);
}
638 
/*
 * Convert 64 bytes of round-constant material at `in` into the eight
 * 64-bit words out[0..7], using the same steps haraka512_perm() applies
 * to its input: little-endian decode into sixteen 32-bit words,
 * interleave each group of four words into out[i] / out[i + 4], then
 * transpose with br_aes_ct64_ortho().
 */
static void interleave_constant(uint64_t *out, const unsigned char *in) {
    uint32_t words[16];
    size_t col;

    br_range_dec32le(words, 16, in);
    for (col = 0; col < 4; col++) {
        br_aes_ct64_interleave_in(out + col, out + col + 4, &words[4 * col]);
    }
    br_aes_ct64_ortho(out);
}
649 
/*
 * Convert 32 bytes of round-constant material at `in` into the eight
 * 32-bit words out[0..7], using the same steps the Haraka-256 functions
 * apply to their input: even-indexed words come from bytes 0..15,
 * odd-indexed words from bytes 16..31, followed by br_aes_ct_ortho().
 */
static void interleave_constant32(uint32_t *out, const unsigned char *in) {
    int col;

    for (col = 0; col < 4; col++) {
        const unsigned char *lo_half = in + 4 * col;

        out[2 * col] = br_dec32le(lo_half);
        out[2 * col + 1] = br_dec32le(lo_half + 16);
    }
    br_aes_ct_ortho(out);
}
658 
/*
 * Derive seed-dependent round constants and store them in `state`.
 *
 * state       context to fill; its tweaked512_rc64 table is first reset
 *             to the default constants
 * pk_seed     public seed, seed_length bytes; its expansion replaces
 *             tweaked256_rc32 and tweaked512_rc64
 * sk_seed     secret seed, seed_length bytes, or NULL; when non-NULL its
 *             expansion fills tweaked256_rc32_sseed
 * seed_length length in bytes of each seed
 *
 * Both haraka_S expansions run with the default 512-bit constants, because
 * tweaked512_rc64 is only overwritten in the final loop, after the last
 * haraka_S call has returned.
 */
void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_tweak_constants(
    harakactx *state,
    const unsigned char *pk_seed, const unsigned char *sk_seed,
    unsigned long long seed_length) {
    unsigned char expanded[40 * 16];
    int round;

    /* Use the standard constants to generate tweaked ones. */
    memcpy(state->tweaked512_rc64, haraka512_rc64, sizeof haraka512_rc64);

    /* Constants for sk.seed */
    if (sk_seed != NULL) {
        PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S(
            expanded, sizeof expanded, sk_seed, seed_length, state);

        /* Interleave constants: 32 bytes of expansion per round. */
        for (round = 0; round < 10; round++) {
            interleave_constant32(state->tweaked256_rc32_sseed[round],
                                  expanded + 32 * round);
        }
    }

    /* Constants for pk.seed */
    PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S(
        expanded, sizeof expanded, pk_seed, seed_length, state);
    for (round = 0; round < 10; round++) {
        /* The 256-bit table consumes 32 bytes per round, the 512-bit one 64. */
        interleave_constant32(state->tweaked256_rc32[round],
                              expanded + 32 * round);
        interleave_constant(state->tweaked512_rc64[round],
                            expanded + 64 * round);
    }
}
688 
haraka_S_absorb(unsigned char * s,const unsigned char * m,unsigned long long mlen,unsigned char p,const harakactx * state)689 static void haraka_S_absorb(unsigned char *s,
690                             const unsigned char *m, unsigned long long mlen,
691                             unsigned char p, const harakactx *state) {
692     unsigned long long i;
693     unsigned char t[HARAKAS_RATE];
694 
695     while (mlen >= HARAKAS_RATE) {
696         /* XOR block to state */
697         for (i = 0; i < HARAKAS_RATE; ++i) {
698             s[i] ^= m[i];
699         }
700         PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(s, s, state);
701         mlen -= HARAKAS_RATE;
702         m += HARAKAS_RATE;
703     }
704 
705     for (i = 0; i < HARAKAS_RATE; ++i) {
706         t[i] = 0;
707     }
708     for (i = 0; i < mlen; ++i) {
709         t[i] = m[i];
710     }
711     t[i] = p;
712     t[HARAKAS_RATE - 1] |= 128;
713     for (i = 0; i < HARAKAS_RATE; ++i) {
714         s[i] ^= t[i];
715     }
716 }
717 
haraka_S_squeezeblocks(unsigned char * h,unsigned long long nblocks,unsigned char * s,const harakactx * state)718 static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks,
719                                    unsigned char *s, const harakactx *state) {
720     while (nblocks > 0) {
721         PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(s, s, state);
722         memcpy(h, s, HARAKAS_RATE);
723         h += HARAKAS_RATE;
724         nblocks--;
725     }
726 }
727 
/*
 * Initialise an incremental sponge state: 64 state bytes followed by one
 * bookkeeping byte, s_inc[64]. All 65 bytes are cleared.
 */
void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_init(uint8_t *s_inc) {
    memset(s_inc, 0, 64 + 1);
}
736 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_absorb(uint8_t * s_inc,const uint8_t * m,size_t mlen,const harakactx * state)737 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_absorb(uint8_t *s_inc, const uint8_t *m, size_t mlen, const harakactx *state) {
738     size_t i;
739 
740     /* Recall that s_inc[64] is the non-absorbed bytes xored into the state */
741     while (mlen + s_inc[64] >= HARAKAS_RATE) {
742         for (i = 0; i < (size_t)(HARAKAS_RATE - s_inc[64]); i++) {
743             /* Take the i'th byte from message
744                xor with the s_inc[64] + i'th byte of the state */
745             s_inc[s_inc[64] + i] ^= m[i];
746         }
747         mlen -= (size_t)(HARAKAS_RATE - s_inc[64]);
748         m += HARAKAS_RATE - s_inc[64];
749         s_inc[64] = 0;
750 
751         PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(s_inc, s_inc, state);
752     }
753 
754     for (i = 0; i < mlen; i++) {
755         s_inc[s_inc[64] + i] ^= m[i];
756     }
757     s_inc[64] = (uint8_t)(mlen + s_inc[64]);
758 }
759 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_finalize(uint8_t * s_inc)760 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_finalize(uint8_t *s_inc) {
761     /* After haraka_S_inc_absorb, we are guaranteed that s_inc[64] < HARAKAS_RATE,
762        so we can always use one more byte for p in the current state. */
763     s_inc[s_inc[64]] ^= 0x1F;
764     s_inc[HARAKAS_RATE - 1] ^= 128;
765     s_inc[64] = 0;
766 }
767 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_squeeze(uint8_t * out,size_t outlen,uint8_t * s_inc,const harakactx * state)768 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S_inc_squeeze(uint8_t *out, size_t outlen, uint8_t *s_inc, const harakactx *state) {
769     uint8_t i;
770 
771     /* First consume any bytes we still have sitting around */
772     for (i = 0; i < outlen && i < s_inc[64]; i++) {
773         /* There are s_inc[64] bytes left, so r - s_inc[64] is the first
774            available byte. We consume from there, i.e., up to r. */
775         out[i] = s_inc[(HARAKAS_RATE - s_inc[64] + i)];
776     }
777     out += i;
778     outlen -= i;
779     s_inc[64] = (uint8_t)(s_inc[64] - i);
780 
781     /* Then squeeze the remaining necessary blocks */
782     while (outlen > 0) {
783         PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(s_inc, s_inc, state);
784 
785         for (i = 0; i < outlen && i < HARAKAS_RATE; i++) {
786             out[i] = s_inc[i];
787         }
788         out += i;
789         outlen -= i;
790         s_inc[64] = (uint8_t)(HARAKAS_RATE - i);
791     }
792 }
793 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S(unsigned char * out,unsigned long long outlen,const unsigned char * in,unsigned long long inlen,const harakactx * state)794 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka_S(unsigned char *out, unsigned long long outlen, const unsigned char *in, unsigned long long inlen, const harakactx *state) {
795     unsigned long long i;
796     unsigned char s[64];
797     unsigned char d[32];
798 
799     for (i = 0; i < 64; i++) {
800         s[i] = 0;
801     }
802     haraka_S_absorb(s, in, inlen, 0x1F, state);
803 
804     haraka_S_squeezeblocks(out, outlen / 32, s, state);
805     out += (outlen / 32) * 32;
806 
807     if (outlen % 32) {
808         haraka_S_squeezeblocks(d, 1, s, state);
809         for (i = 0; i < outlen % 32; i++) {
810             out[i] = d[i];
811         }
812     }
813 }
814 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(unsigned char * out,const unsigned char * in,const harakactx * state)815 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(unsigned char *out, const unsigned char *in, const harakactx *state) {
816     uint32_t w[16];
817     uint64_t q[8], tmp_q;
818     unsigned int i, j;
819 
820     br_range_dec32le(w, 16, in);
821     for (i = 0; i < 4; i++) {
822         br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2));
823     }
824     br_aes_ct64_ortho(q);
825 
826     /* AES rounds */
827     for (i = 0; i < 5; i++) {
828         for (j = 0; j < 2; j++) {
829             br_aes_ct64_bitslice_Sbox(q);
830             shift_rows(q);
831             mix_columns(q);
832             add_round_key(q, state->tweaked512_rc64[2 * i + j]);
833         }
834         /* Mix states */
835         for (j = 0; j < 8; j++) {
836             tmp_q = q[j];
837             q[j] = (tmp_q & 0x0001000100010001) << 5 |
838                    (tmp_q & 0x0002000200020002) << 12 |
839                    (tmp_q & 0x0004000400040004) >> 1 |
840                    (tmp_q & 0x0008000800080008) << 6 |
841                    (tmp_q & 0x0020002000200020) << 9 |
842                    (tmp_q & 0x0040004000400040) >> 4 |
843                    (tmp_q & 0x0080008000800080) << 3 |
844                    (tmp_q & 0x2100210021002100) >> 5 |
845                    (tmp_q & 0x0210021002100210) << 2 |
846                    (tmp_q & 0x0800080008000800) << 4 |
847                    (tmp_q & 0x1000100010001000) >> 12 |
848                    (tmp_q & 0x4000400040004000) >> 10 |
849                    (tmp_q & 0x8400840084008400) >> 3;
850         }
851     }
852 
853     br_aes_ct64_ortho(q);
854     for (i = 0; i < 4; i ++) {
855         br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
856     }
857     br_range_enc32le(out, w, 16);
858 }
859 
/*
 * Haraka-512 compression: 64 input bytes to 32 output bytes.
 *
 * The permutation result is XORed with the input (feed-forward) in a
 * local buffer, then four 8-byte pieces at offsets 8, 24, 32 and 48 are
 * copied to `out`. Because all work happens in the local buffer, `out`
 * is only written after `in` has been fully read.
 */
void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512(unsigned char *out, const unsigned char *in, const harakactx *state) {
    static const unsigned char kept_offset[4] = { 8, 24, 32, 48 };
    unsigned char buf[64];
    int k;

    PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka512_perm(buf, in, state);

    /* Feed-forward */
    for (k = 0; k < 64; k++) {
        buf[k] ^= in[k];
    }

    /* Truncated */
    for (k = 0; k < 4; k++) {
        memcpy(out + 8 * k, buf + kept_offset[k], 8);
    }
}
877 
878 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka256(unsigned char * out,const unsigned char * in,const harakactx * state)879 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka256(unsigned char *out, const unsigned char *in, const harakactx *state) {
880     uint32_t q[8], tmp_q;
881     int i, j;
882 
883     for (i = 0; i < 4; i++) {
884         q[2 * i] = br_dec32le(in + 4 * i);
885         q[2 * i + 1] = br_dec32le(in + 4 * i + 16);
886     }
887     br_aes_ct_ortho(q);
888 
889     /* AES rounds */
890     for (i = 0; i < 5; i++) {
891         for (j = 0; j < 2; j++) {
892             br_aes_ct_bitslice_Sbox(q);
893             shift_rows32(q);
894             mix_columns32(q);
895             add_round_key32(q, state->tweaked256_rc32[2 * i + j]);
896         }
897 
898         /* Mix states */
899         for (j = 0; j < 8; j++) {
900             tmp_q = q[j];
901             q[j] = (tmp_q & 0x81818181) |
902                    (tmp_q & 0x02020202) << 1 |
903                    (tmp_q & 0x04040404) << 2 |
904                    (tmp_q & 0x08080808) << 3 |
905                    (tmp_q & 0x10101010) >> 3 |
906                    (tmp_q & 0x20202020) >> 2 |
907                    (tmp_q & 0x40404040) >> 1;
908         }
909     }
910 
911     br_aes_ct_ortho(q);
912     for (i = 0; i < 4; i++) {
913         br_enc32le(out + 4 * i, q[2 * i]);
914         br_enc32le(out + 4 * i + 16, q[2 * i + 1]);
915     }
916 
917     for (i = 0; i < 32; i++) {
918         out[i] ^= in[i];
919     }
920 }
921 
PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka256_sk(unsigned char * out,const unsigned char * in,const harakactx * state)922 void PQCLEAN_SPHINCSHARAKA128FSIMPLE_CLEAN_haraka256_sk(unsigned char *out, const unsigned char *in, const harakactx *state) {
923     uint32_t q[8], tmp_q;
924     int i, j;
925 
926     for (i = 0; i < 4; i++) {
927         q[2 * i] = br_dec32le(in + 4 * i);
928         q[2 * i + 1] = br_dec32le(in + 4 * i + 16);
929     }
930     br_aes_ct_ortho(q);
931 
932     /* AES rounds */
933     for (i = 0; i < 5; i++) {
934         for (j = 0; j < 2; j++) {
935             br_aes_ct_bitslice_Sbox(q);
936             shift_rows32(q);
937             mix_columns32(q);
938             add_round_key32(q, state->tweaked256_rc32_sseed[2 * i + j]);
939         }
940 
941         /* Mix states */
942         for (j = 0; j < 8; j++) {
943             tmp_q = q[j];
944             q[j] = (tmp_q & 0x81818181) |
945                    (tmp_q & 0x02020202) << 1 |
946                    (tmp_q & 0x04040404) << 2 |
947                    (tmp_q & 0x08080808) << 3 |
948                    (tmp_q & 0x10101010) >> 3 |
949                    (tmp_q & 0x20202020) >> 2 |
950                    (tmp_q & 0x40404040) >> 1;
951         }
952     }
953 
954     br_aes_ct_ortho(q);
955     for (i = 0; i < 4; i++) {
956         br_enc32le(out + 4 * i, q[2 * i]);
957         br_enc32le(out + 4 * i + 16, q[2 * i + 1]);
958     }
959 
960     for (i = 0; i < 32; i++) {
961         out[i] ^= in[i];
962     }
963 }
964