1 /*
2  * Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the OpenSSL license (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13 
14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
15                    size_t r);
16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
17 
18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
19 
20 /*
21  * Choose some sensible defaults
22  */
23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24     !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25 # define KECCAK_2X      /* default to KECCAK_2X variant */
26 #endif
27 
28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
29 # define KECCAK_COMPLEMENTING_TRANSFORM
30 #endif
31 
32 #if defined(__x86_64__) || defined(__aarch64__) || \
33     defined(__mips64) || defined(__ia64) || \
34     (defined(__VMS) && !defined(__vax))
35 /*
36  * These are available even in ILP32 flavours, but even then they are
37  * capable of performing 64-bit operations as efficiently as in *P64.
38  * Since it's not given that we can use sizeof(void *), just shunt it.
39  */
40 # define BIT_INTERLEAVE (0)
41 #else
42 # define BIT_INTERLEAVE (sizeof(void *) < 8)
43 #endif
44 
45 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
46 
ROL64(uint64_t val,int offset)47 static uint64_t ROL64(uint64_t val, int offset)
48 {
49     if (offset == 0) {
50         return val;
51     } else if (!BIT_INTERLEAVE) {
52         return (val << offset) | (val >> (64-offset));
53     } else {
54         uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
55 
56         if (offset & 1) {
57             uint32_t tmp = hi;
58 
59             offset >>= 1;
60             hi = ROL32(lo, offset);
61             lo = ROL32(tmp, offset + 1);
62         } else {
63             offset >>= 1;
64             lo = ROL32(lo, offset);
65             hi = ROL32(hi, offset);
66         }
67 
68         return ((uint64_t)hi << 32) | lo;
69     }
70 }
71 
72 static const unsigned char rhotates[5][5] = {
73     {  0,  1, 62, 28, 27 },
74     { 36, 44,  6, 55, 20 },
75     {  3, 10, 43, 25, 39 },
76     { 41, 45, 15, 21,  8 },
77     { 18,  2, 61, 56, 14 }
78 };
79 
80 static const uint64_t iotas[] = {
81     BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
82     BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
83     BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
84     BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
85     BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
86     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
87     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
88     BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
89     BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
90     BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
91     BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
92     BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
93     BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
94     BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
95     BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
96     BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
97     BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
98     BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
99     BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
100     BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
101     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
102     BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
103     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
104     BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
105 };
106 
107 #if defined(KECCAK_REF)
108 /*
109  * This is straightforward or "maximum clarity" implementation aiming
110  * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
111  * Permutation-Based Hash and Extendible-Output Functions" as much as
112  * possible. With one caveat. Because of the way C stores matrices,
113  * references to A[x,y] in the specification are presented as A[y][x].
114  * Implementation unrolls inner x-loops so that modulo 5 operations are
115  * explicitly pre-computed.
116  */
Theta(uint64_t A[5][5])117 static void Theta(uint64_t A[5][5])
118 {
119     uint64_t C[5], D[5];
120     size_t y;
121 
122     C[0] = A[0][0];
123     C[1] = A[0][1];
124     C[2] = A[0][2];
125     C[3] = A[0][3];
126     C[4] = A[0][4];
127 
128     for (y = 1; y < 5; y++) {
129         C[0] ^= A[y][0];
130         C[1] ^= A[y][1];
131         C[2] ^= A[y][2];
132         C[3] ^= A[y][3];
133         C[4] ^= A[y][4];
134     }
135 
136     D[0] = ROL64(C[1], 1) ^ C[4];
137     D[1] = ROL64(C[2], 1) ^ C[0];
138     D[2] = ROL64(C[3], 1) ^ C[1];
139     D[3] = ROL64(C[4], 1) ^ C[2];
140     D[4] = ROL64(C[0], 1) ^ C[3];
141 
142     for (y = 0; y < 5; y++) {
143         A[y][0] ^= D[0];
144         A[y][1] ^= D[1];
145         A[y][2] ^= D[2];
146         A[y][3] ^= D[3];
147         A[y][4] ^= D[4];
148     }
149 }
150 
Rho(uint64_t A[5][5])151 static void Rho(uint64_t A[5][5])
152 {
153     size_t y;
154 
155     for (y = 0; y < 5; y++) {
156         A[y][0] = ROL64(A[y][0], rhotates[y][0]);
157         A[y][1] = ROL64(A[y][1], rhotates[y][1]);
158         A[y][2] = ROL64(A[y][2], rhotates[y][2]);
159         A[y][3] = ROL64(A[y][3], rhotates[y][3]);
160         A[y][4] = ROL64(A[y][4], rhotates[y][4]);
161     }
162 }
163 
Pi(uint64_t A[5][5])164 static void Pi(uint64_t A[5][5])
165 {
166     uint64_t T[5][5];
167 
168     /*
169      * T = A
170      * A[y][x] = T[x][(3*y+x)%5]
171      */
172     memcpy(T, A, sizeof(T));
173 
174     A[0][0] = T[0][0];
175     A[0][1] = T[1][1];
176     A[0][2] = T[2][2];
177     A[0][3] = T[3][3];
178     A[0][4] = T[4][4];
179 
180     A[1][0] = T[0][3];
181     A[1][1] = T[1][4];
182     A[1][2] = T[2][0];
183     A[1][3] = T[3][1];
184     A[1][4] = T[4][2];
185 
186     A[2][0] = T[0][1];
187     A[2][1] = T[1][2];
188     A[2][2] = T[2][3];
189     A[2][3] = T[3][4];
190     A[2][4] = T[4][0];
191 
192     A[3][0] = T[0][4];
193     A[3][1] = T[1][0];
194     A[3][2] = T[2][1];
195     A[3][3] = T[3][2];
196     A[3][4] = T[4][3];
197 
198     A[4][0] = T[0][2];
199     A[4][1] = T[1][3];
200     A[4][2] = T[2][4];
201     A[4][3] = T[3][0];
202     A[4][4] = T[4][1];
203 }
204 
Chi(uint64_t A[5][5])205 static void Chi(uint64_t A[5][5])
206 {
207     uint64_t C[5];
208     size_t y;
209 
210     for (y = 0; y < 5; y++) {
211         C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
212         C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
213         C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
214         C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
215         C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
216 
217         A[y][0] = C[0];
218         A[y][1] = C[1];
219         A[y][2] = C[2];
220         A[y][3] = C[3];
221         A[y][4] = C[4];
222     }
223 }
224 
Iota(uint64_t A[5][5],size_t i)225 static void Iota(uint64_t A[5][5], size_t i)
226 {
227     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
228     A[0][0] ^= iotas[i];
229 }
230 
KeccakF1600(uint64_t A[5][5])231 static void KeccakF1600(uint64_t A[5][5])
232 {
233     size_t i;
234 
235     for (i = 0; i < 24; i++) {
236         Theta(A);
237         Rho(A);
238         Pi(A);
239         Chi(A);
240         Iota(A, i);
241     }
242 }
243 
244 #elif defined(KECCAK_1X)
245 /*
246  * This implementation is optimization of above code featuring unroll
247  * of even y-loops, their fusion and code motion. It also minimizes
248  * temporary storage. Compiler would normally do all these things for
249  * you, purpose of manual optimization is to provide "unobscured"
250  * reference for assembly implementation [in case this approach is
251  * chosen for implementation on some platform]. In the nutshell it's
252  * equivalent of "plane-per-plane processing" approach discussed in
253  * section 2.4 of "Keccak implementation overview".
254  */
Round(uint64_t A[5][5],size_t i)255 static void Round(uint64_t A[5][5], size_t i)
256 {
257     uint64_t C[5], E[2];        /* registers */
258     uint64_t D[5], T[2][5];     /* memory    */
259 
260     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
261 
262     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
263     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
264     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
265     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
266     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
267 
268 #if defined(__arm__)
269     D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
270     D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
271     D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
272     D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
273     D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
274 
275     T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
276     T[0][1] = A[0][1] ^ E[0]; /* D[1] */
277     T[0][2] = A[0][2] ^ C[1]; /* D[2] */
278     T[0][3] = A[0][3] ^ C[2]; /* D[3] */
279     T[0][4] = A[0][4] ^ E[1]; /* D[4] */
280 
281     C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
282     C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
283     C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
284     C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
285     C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
286 #else
287     D[0] = ROL64(C[1], 1) ^ C[4];
288     D[1] = ROL64(C[2], 1) ^ C[0];
289     D[2] = ROL64(C[3], 1) ^ C[1];
290     D[3] = ROL64(C[4], 1) ^ C[2];
291     D[4] = ROL64(C[0], 1) ^ C[3];
292 
293     T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
294     T[0][1] = A[0][1] ^ D[1];
295     T[0][2] = A[0][2] ^ D[2];
296     T[0][3] = A[0][3] ^ D[3];
297     T[0][4] = A[0][4] ^ D[4];
298 
299     C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
300     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
301     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
302     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
303     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
304 #endif
305     A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
306     A[0][1] = C[1] ^ (~C[2] & C[3]);
307     A[0][2] = C[2] ^ (~C[3] & C[4]);
308     A[0][3] = C[3] ^ (~C[4] & C[0]);
309     A[0][4] = C[4] ^ (~C[0] & C[1]);
310 
311     T[1][0] = A[1][0] ^ (C[3] = D[0]);
312     T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
313     T[1][2] = A[1][2] ^ (E[0] = D[2]);
314     T[1][3] = A[1][3] ^ (E[1] = D[3]);
315     T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
316 
317     C[0] = ROL64(T[0][3],        rhotates[0][3]);
318     C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]);   /* D[4] */
319     C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]);   /* D[0] */
320     C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]);   /* D[1] */
321     C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]);   /* D[2] */
322 
323     A[1][0] = C[0] ^ (~C[1] & C[2]);
324     A[1][1] = C[1] ^ (~C[2] & C[3]);
325     A[1][2] = C[2] ^ (~C[3] & C[4]);
326     A[1][3] = C[3] ^ (~C[4] & C[0]);
327     A[1][4] = C[4] ^ (~C[0] & C[1]);
328 
329     C[0] = ROL64(T[0][1],        rhotates[0][1]);
330     C[1] = ROL64(T[1][2],        rhotates[1][2]);
331     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
332     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
333     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
334 
335     A[2][0] = C[0] ^ (~C[1] & C[2]);
336     A[2][1] = C[1] ^ (~C[2] & C[3]);
337     A[2][2] = C[2] ^ (~C[3] & C[4]);
338     A[2][3] = C[3] ^ (~C[4] & C[0]);
339     A[2][4] = C[4] ^ (~C[0] & C[1]);
340 
341     C[0] = ROL64(T[0][4],        rhotates[0][4]);
342     C[1] = ROL64(T[1][0],        rhotates[1][0]);
343     C[2] = ROL64(T[1][1],        rhotates[2][1]); /* originally A[2][1] */
344     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
345     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
346 
347     A[3][0] = C[0] ^ (~C[1] & C[2]);
348     A[3][1] = C[1] ^ (~C[2] & C[3]);
349     A[3][2] = C[2] ^ (~C[3] & C[4]);
350     A[3][3] = C[3] ^ (~C[4] & C[0]);
351     A[3][4] = C[4] ^ (~C[0] & C[1]);
352 
353     C[0] = ROL64(T[0][2],        rhotates[0][2]);
354     C[1] = ROL64(T[1][3],        rhotates[1][3]);
355     C[2] = ROL64(T[1][4],        rhotates[2][4]); /* originally A[2][4] */
356     C[3] = ROL64(T[0][0],        rhotates[3][0]); /* originally A[3][0] */
357     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
358 
359     A[4][0] = C[0] ^ (~C[1] & C[2]);
360     A[4][1] = C[1] ^ (~C[2] & C[3]);
361     A[4][2] = C[2] ^ (~C[3] & C[4]);
362     A[4][3] = C[3] ^ (~C[4] & C[0]);
363     A[4][4] = C[4] ^ (~C[0] & C[1]);
364 }
365 
KeccakF1600(uint64_t A[5][5])366 static void KeccakF1600(uint64_t A[5][5])
367 {
368     size_t i;
369 
370     for (i = 0; i < 24; i++) {
371         Round(A, i);
372     }
373 }
374 
375 #elif defined(KECCAK_1X_ALT)
376 /*
377  * This is variant of above KECCAK_1X that reduces requirement for
378  * temporary storage even further, but at cost of more updates to A[][].
379  * It's less suitable if A[][] is memory bound, but better if it's
380  * register bound.
381  */
382 
Round(uint64_t A[5][5],size_t i)383 static void Round(uint64_t A[5][5], size_t i)
384 {
385     uint64_t C[5], D[5];
386 
387     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
388 
389     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
390     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
391     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
392     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
393     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
394 
395     D[1] = C[0] ^  ROL64(C[2], 1);
396     D[2] = C[1] ^  ROL64(C[3], 1);
397     D[3] = C[2] ^= ROL64(C[4], 1);
398     D[4] = C[3] ^= ROL64(C[0], 1);
399     D[0] = C[4] ^= ROL64(C[1], 1);
400 
401     A[0][1] ^= D[1];
402     A[1][1] ^= D[1];
403     A[2][1] ^= D[1];
404     A[3][1] ^= D[1];
405     A[4][1] ^= D[1];
406 
407     A[0][2] ^= D[2];
408     A[1][2] ^= D[2];
409     A[2][2] ^= D[2];
410     A[3][2] ^= D[2];
411     A[4][2] ^= D[2];
412 
413     A[0][3] ^= C[2];
414     A[1][3] ^= C[2];
415     A[2][3] ^= C[2];
416     A[3][3] ^= C[2];
417     A[4][3] ^= C[2];
418 
419     A[0][4] ^= C[3];
420     A[1][4] ^= C[3];
421     A[2][4] ^= C[3];
422     A[3][4] ^= C[3];
423     A[4][4] ^= C[3];
424 
425     A[0][0] ^= C[4];
426     A[1][0] ^= C[4];
427     A[2][0] ^= C[4];
428     A[3][0] ^= C[4];
429     A[4][0] ^= C[4];
430 
431     C[1] = A[0][1];
432     C[2] = A[0][2];
433     C[3] = A[0][3];
434     C[4] = A[0][4];
435 
436     A[0][1] = ROL64(A[1][1], rhotates[1][1]);
437     A[0][2] = ROL64(A[2][2], rhotates[2][2]);
438     A[0][3] = ROL64(A[3][3], rhotates[3][3]);
439     A[0][4] = ROL64(A[4][4], rhotates[4][4]);
440 
441     A[1][1] = ROL64(A[1][4], rhotates[1][4]);
442     A[2][2] = ROL64(A[2][3], rhotates[2][3]);
443     A[3][3] = ROL64(A[3][2], rhotates[3][2]);
444     A[4][4] = ROL64(A[4][1], rhotates[4][1]);
445 
446     A[1][4] = ROL64(A[4][2], rhotates[4][2]);
447     A[2][3] = ROL64(A[3][4], rhotates[3][4]);
448     A[3][2] = ROL64(A[2][1], rhotates[2][1]);
449     A[4][1] = ROL64(A[1][3], rhotates[1][3]);
450 
451     A[4][2] = ROL64(A[2][4], rhotates[2][4]);
452     A[3][4] = ROL64(A[4][3], rhotates[4][3]);
453     A[2][1] = ROL64(A[1][2], rhotates[1][2]);
454     A[1][3] = ROL64(A[3][1], rhotates[3][1]);
455 
456     A[2][4] = ROL64(A[4][0], rhotates[4][0]);
457     A[4][3] = ROL64(A[3][0], rhotates[3][0]);
458     A[1][2] = ROL64(A[2][0], rhotates[2][0]);
459     A[3][1] = ROL64(A[1][0], rhotates[1][0]);
460 
461     A[1][0] = ROL64(C[3],    rhotates[0][3]);
462     A[2][0] = ROL64(C[1],    rhotates[0][1]);
463     A[3][0] = ROL64(C[4],    rhotates[0][4]);
464     A[4][0] = ROL64(C[2],    rhotates[0][2]);
465 
466     C[0] = A[0][0];
467     C[1] = A[1][0];
468     D[0] = A[0][1];
469     D[1] = A[1][1];
470 
471     A[0][0] ^= (~A[0][1] & A[0][2]);
472     A[1][0] ^= (~A[1][1] & A[1][2]);
473     A[0][1] ^= (~A[0][2] & A[0][3]);
474     A[1][1] ^= (~A[1][2] & A[1][3]);
475     A[0][2] ^= (~A[0][3] & A[0][4]);
476     A[1][2] ^= (~A[1][3] & A[1][4]);
477     A[0][3] ^= (~A[0][4] & C[0]);
478     A[1][3] ^= (~A[1][4] & C[1]);
479     A[0][4] ^= (~C[0]    & D[0]);
480     A[1][4] ^= (~C[1]    & D[1]);
481 
482     C[2] = A[2][0];
483     C[3] = A[3][0];
484     D[2] = A[2][1];
485     D[3] = A[3][1];
486 
487     A[2][0] ^= (~A[2][1] & A[2][2]);
488     A[3][0] ^= (~A[3][1] & A[3][2]);
489     A[2][1] ^= (~A[2][2] & A[2][3]);
490     A[3][1] ^= (~A[3][2] & A[3][3]);
491     A[2][2] ^= (~A[2][3] & A[2][4]);
492     A[3][2] ^= (~A[3][3] & A[3][4]);
493     A[2][3] ^= (~A[2][4] & C[2]);
494     A[3][3] ^= (~A[3][4] & C[3]);
495     A[2][4] ^= (~C[2]    & D[2]);
496     A[3][4] ^= (~C[3]    & D[3]);
497 
498     C[4] = A[4][0];
499     D[4] = A[4][1];
500 
501     A[4][0] ^= (~A[4][1] & A[4][2]);
502     A[4][1] ^= (~A[4][2] & A[4][3]);
503     A[4][2] ^= (~A[4][3] & A[4][4]);
504     A[4][3] ^= (~A[4][4] & C[4]);
505     A[4][4] ^= (~C[4]    & D[4]);
506     A[0][0] ^= iotas[i];
507 }
508 
KeccakF1600(uint64_t A[5][5])509 static void KeccakF1600(uint64_t A[5][5])
510 {
511     size_t i;
512 
513     for (i = 0; i < 24; i++) {
514         Round(A, i);
515     }
516 }
517 
518 #elif defined(KECCAK_2X)
519 /*
520  * This implementation is variant of KECCAK_1X above with outer-most
521  * round loop unrolled twice. This allows to take temporary storage
522  * out of round procedure and simplify references to it by alternating
523  * it with actual data (see round loop below). Originally it was meant
524  * rather as reference for an assembly implementation, but it seems to
525  * play best with compilers [as well as provide best instruction per
526  * processed byte ratio at minimal round unroll factor]...
527  */
Round(uint64_t R[5][5],uint64_t A[5][5],size_t i)528 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
529 {
530     uint64_t C[5], D[5];
531 
532     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
533 
534     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
535     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
536     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
537     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
538     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
539 
540     D[0] = ROL64(C[1], 1) ^ C[4];
541     D[1] = ROL64(C[2], 1) ^ C[0];
542     D[2] = ROL64(C[3], 1) ^ C[1];
543     D[3] = ROL64(C[4], 1) ^ C[2];
544     D[4] = ROL64(C[0], 1) ^ C[3];
545 
546     C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
547     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
548     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
549     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
550     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
551 
552 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
553     R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
554     R[0][1] = C[1] ^ (~C[2] | C[3]);
555     R[0][2] = C[2] ^ ( C[3] & C[4]);
556     R[0][3] = C[3] ^ ( C[4] | C[0]);
557     R[0][4] = C[4] ^ ( C[0] & C[1]);
558 #else
559     R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
560     R[0][1] = C[1] ^ (~C[2] & C[3]);
561     R[0][2] = C[2] ^ (~C[3] & C[4]);
562     R[0][3] = C[3] ^ (~C[4] & C[0]);
563     R[0][4] = C[4] ^ (~C[0] & C[1]);
564 #endif
565 
566     C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
567     C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
568     C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
569     C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
570     C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
571 
572 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
573     R[1][0] = C[0] ^ (C[1] |  C[2]);
574     R[1][1] = C[1] ^ (C[2] &  C[3]);
575     R[1][2] = C[2] ^ (C[3] | ~C[4]);
576     R[1][3] = C[3] ^ (C[4] |  C[0]);
577     R[1][4] = C[4] ^ (C[0] &  C[1]);
578 #else
579     R[1][0] = C[0] ^ (~C[1] & C[2]);
580     R[1][1] = C[1] ^ (~C[2] & C[3]);
581     R[1][2] = C[2] ^ (~C[3] & C[4]);
582     R[1][3] = C[3] ^ (~C[4] & C[0]);
583     R[1][4] = C[4] ^ (~C[0] & C[1]);
584 #endif
585 
586     C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
587     C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
588     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
589     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
590     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
591 
592 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
593     R[2][0] =  C[0] ^ ( C[1] | C[2]);
594     R[2][1] =  C[1] ^ ( C[2] & C[3]);
595     R[2][2] =  C[2] ^ (~C[3] & C[4]);
596     R[2][3] = ~C[3] ^ ( C[4] | C[0]);
597     R[2][4] =  C[4] ^ ( C[0] & C[1]);
598 #else
599     R[2][0] = C[0] ^ (~C[1] & C[2]);
600     R[2][1] = C[1] ^ (~C[2] & C[3]);
601     R[2][2] = C[2] ^ (~C[3] & C[4]);
602     R[2][3] = C[3] ^ (~C[4] & C[0]);
603     R[2][4] = C[4] ^ (~C[0] & C[1]);
604 #endif
605 
606     C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
607     C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
608     C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
609     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
610     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
611 
612 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
613     R[3][0] =  C[0] ^ ( C[1] & C[2]);
614     R[3][1] =  C[1] ^ ( C[2] | C[3]);
615     R[3][2] =  C[2] ^ (~C[3] | C[4]);
616     R[3][3] = ~C[3] ^ ( C[4] & C[0]);
617     R[3][4] =  C[4] ^ ( C[0] | C[1]);
618 #else
619     R[3][0] = C[0] ^ (~C[1] & C[2]);
620     R[3][1] = C[1] ^ (~C[2] & C[3]);
621     R[3][2] = C[2] ^ (~C[3] & C[4]);
622     R[3][3] = C[3] ^ (~C[4] & C[0]);
623     R[3][4] = C[4] ^ (~C[0] & C[1]);
624 #endif
625 
626     C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
627     C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
628     C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
629     C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
630     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
631 
632 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
633     R[4][0] =  C[0] ^ (~C[1] & C[2]);
634     R[4][1] = ~C[1] ^ ( C[2] | C[3]);
635     R[4][2] =  C[2] ^ ( C[3] & C[4]);
636     R[4][3] =  C[3] ^ ( C[4] | C[0]);
637     R[4][4] =  C[4] ^ ( C[0] & C[1]);
638 #else
639     R[4][0] = C[0] ^ (~C[1] & C[2]);
640     R[4][1] = C[1] ^ (~C[2] & C[3]);
641     R[4][2] = C[2] ^ (~C[3] & C[4]);
642     R[4][3] = C[3] ^ (~C[4] & C[0]);
643     R[4][4] = C[4] ^ (~C[0] & C[1]);
644 #endif
645 }
646 
KeccakF1600(uint64_t A[5][5])647 static void KeccakF1600(uint64_t A[5][5])
648 {
649     uint64_t T[5][5];
650     size_t i;
651 
652 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
653     A[0][1] = ~A[0][1];
654     A[0][2] = ~A[0][2];
655     A[1][3] = ~A[1][3];
656     A[2][2] = ~A[2][2];
657     A[3][2] = ~A[3][2];
658     A[4][0] = ~A[4][0];
659 #endif
660 
661     for (i = 0; i < 24; i += 2) {
662         Round(T, A, i);
663         Round(A, T, i + 1);
664     }
665 
666 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
667     A[0][1] = ~A[0][1];
668     A[0][2] = ~A[0][2];
669     A[1][3] = ~A[1][3];
670     A[2][2] = ~A[2][2];
671     A[3][2] = ~A[3][2];
672     A[4][0] = ~A[4][0];
673 #endif
674 }
675 
676 #else   /* define KECCAK_INPLACE to compile this code path */
677 /*
678  * This implementation is KECCAK_1X from above combined 4 times with
679  * a twist that allows to omit temporary storage and perform in-place
680  * processing. It's discussed in section 2.5 of "Keccak implementation
681  * overview". It's likely to be best suited for processors with large
682  * register bank... On the other hand processor with large register
683  * bank can as well use KECCAK_1X_ALT, it would be as fast but much
684  * more compact...
685  */
FourRounds(uint64_t A[5][5],size_t i)686 static void FourRounds(uint64_t A[5][5], size_t i)
687 {
688     uint64_t B[5], C[5], D[5];
689 
690     assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
691 
692     /* Round 4*n */
693     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
694     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
695     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
696     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
697     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
698 
699     D[0] = ROL64(C[1], 1) ^ C[4];
700     D[1] = ROL64(C[2], 1) ^ C[0];
701     D[2] = ROL64(C[3], 1) ^ C[1];
702     D[3] = ROL64(C[4], 1) ^ C[2];
703     D[4] = ROL64(C[0], 1) ^ C[3];
704 
705     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
706     B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
707     B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
708     B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
709     B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
710 
711     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
712     C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
713     C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
714     C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
715     C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
716 
717     B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
718     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
719     B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
720     B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
721     B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
722 
723     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
724     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
725     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
726     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
727     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
728 
729     B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
730     B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
731     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
732     B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
733     B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
734 
735     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
736     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
737     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
738     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
739     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
740 
741     B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
742     B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
743     B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
744     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
745     B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
746 
747     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
748     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
749     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
750     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
751     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
752 
753     B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
754     B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
755     B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
756     B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
757     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
758 
759     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
760     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
761     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
762     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
763     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
764 
765     /* Round 4*n+1 */
766     D[0] = ROL64(C[1], 1) ^ C[4];
767     D[1] = ROL64(C[2], 1) ^ C[0];
768     D[2] = ROL64(C[3], 1) ^ C[1];
769     D[3] = ROL64(C[4], 1) ^ C[2];
770     D[4] = ROL64(C[0], 1) ^ C[3];
771 
772     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
773     B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
774     B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
775     B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
776     B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
777 
778     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
779     C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
780     C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
781     C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
782     C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
783 
784     B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
785     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
786     B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
787     B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
788     B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
789 
790     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
791     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
792     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
793     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
794     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
795 
796     B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
797     B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
798     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
799     B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
800     B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
801 
802     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
803     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
804     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
805     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
806     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
807 
808     B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
809     B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
810     B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
811     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
812     B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
813 
814     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
815     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
816     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
817     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
818     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
819 
820     B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
821     B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
822     B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
823     B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
824     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
825 
826     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
827     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
828     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
829     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
830     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
831 
832     /* Round 4*n+2 */
833     D[0] = ROL64(C[1], 1) ^ C[4];
834     D[1] = ROL64(C[2], 1) ^ C[0];
835     D[2] = ROL64(C[3], 1) ^ C[1];
836     D[3] = ROL64(C[4], 1) ^ C[2];
837     D[4] = ROL64(C[0], 1) ^ C[3];
838 
839     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
840     B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
841     B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
842     B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
843     B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
844 
845     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
846     C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
847     C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
848     C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
849     C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
850 
851     B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
852     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
853     B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
854     B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
855     B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
856 
857     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
858     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
859     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
860     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
861     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
862 
863     B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
864     B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
865     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
866     B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
867     B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
868 
869     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
870     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
871     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
872     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
873     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
874 
875     B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
876     B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
877     B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
878     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
879     B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
880 
881     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
882     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
883     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
884     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
885     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
886 
887     B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
888     B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
889     B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
890     B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
891     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
892 
893     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
894     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
895     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
896     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
897     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
898 
899     /* Round 4*n+3 */
900     D[0] = ROL64(C[1], 1) ^ C[4];
901     D[1] = ROL64(C[2], 1) ^ C[0];
902     D[2] = ROL64(C[3], 1) ^ C[1];
903     D[3] = ROL64(C[4], 1) ^ C[2];
904     D[4] = ROL64(C[0], 1) ^ C[3];
905 
906     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
907     B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
908     B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
909     B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
910     B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
911 
912     /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
913     /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
914     /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
915     /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
916     /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
917 
918     B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
919     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
920     B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
921     B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
922     B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
923 
924     /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
925     /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
926     /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
927     /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
928     /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
929 
930     B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
931     B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
932     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
933     B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
934     B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
935 
936     /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
937     /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
938     /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
939     /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
940     /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
941 
942     B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
943     B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
944     B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
945     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
946     B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
947 
948     /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
949     /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
950     /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
951     /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
952     /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
953 
954     B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
955     B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
956     B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
957     B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
958     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
959 
960     /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
961     /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
962     /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
963     /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
964     /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
965 }
966 
KeccakF1600(uint64_t A[5][5])967 static void KeccakF1600(uint64_t A[5][5])
968 {
969     size_t i;
970 
971     for (i = 0; i < 24; i += 4) {
972         FourRounds(A, i);
973     }
974 }
975 
976 #endif
977 
BitInterleave(uint64_t Ai)978 static uint64_t BitInterleave(uint64_t Ai)
979 {
980     if (BIT_INTERLEAVE) {
981         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
982         uint32_t t0, t1;
983 
984         t0 = lo & 0x55555555;
985         t0 |= t0 >> 1;  t0 &= 0x33333333;
986         t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
987         t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
988         t0 |= t0 >> 8;  t0 &= 0x0000ffff;
989 
990         t1 = hi & 0x55555555;
991         t1 |= t1 >> 1;  t1 &= 0x33333333;
992         t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
993         t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
994         t1 |= t1 >> 8;  t1 <<= 16;
995 
996         lo &= 0xaaaaaaaa;
997         lo |= lo << 1;  lo &= 0xcccccccc;
998         lo |= lo << 2;  lo &= 0xf0f0f0f0;
999         lo |= lo << 4;  lo &= 0xff00ff00;
1000         lo |= lo << 8;  lo >>= 16;
1001 
1002         hi &= 0xaaaaaaaa;
1003         hi |= hi << 1;  hi &= 0xcccccccc;
1004         hi |= hi << 2;  hi &= 0xf0f0f0f0;
1005         hi |= hi << 4;  hi &= 0xff00ff00;
1006         hi |= hi << 8;  hi &= 0xffff0000;
1007 
1008         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1009     }
1010 
1011     return Ai;
1012 }
1013 
BitDeinterleave(uint64_t Ai)1014 static uint64_t BitDeinterleave(uint64_t Ai)
1015 {
1016     if (BIT_INTERLEAVE) {
1017         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1018         uint32_t t0, t1;
1019 
1020         t0 = lo & 0x0000ffff;
1021         t0 |= t0 << 8;  t0 &= 0x00ff00ff;
1022         t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
1023         t0 |= t0 << 2;  t0 &= 0x33333333;
1024         t0 |= t0 << 1;  t0 &= 0x55555555;
1025 
1026         t1 = hi << 16;
1027         t1 |= t1 >> 8;  t1 &= 0xff00ff00;
1028         t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
1029         t1 |= t1 >> 2;  t1 &= 0xcccccccc;
1030         t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;
1031 
1032         lo >>= 16;
1033         lo |= lo << 8;  lo &= 0x00ff00ff;
1034         lo |= lo << 4;  lo &= 0x0f0f0f0f;
1035         lo |= lo << 2;  lo &= 0x33333333;
1036         lo |= lo << 1;  lo &= 0x55555555;
1037 
1038         hi &= 0xffff0000;
1039         hi |= hi >> 8;  hi &= 0xff00ff00;
1040         hi |= hi >> 4;  hi &= 0xf0f0f0f0;
1041         hi |= hi >> 2;  hi &= 0xcccccccc;
1042         hi |= hi >> 1;  hi &= 0xaaaaaaaa;
1043 
1044         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1045     }
1046 
1047     return Ai;
1048 }
1049 
1050 /*
1051  * SHA3_absorb can be called multiple times, but at each invocation
1052  * largest multiple of |r| out of |len| bytes are processed. Then
1053  * remaining amount of bytes is returned. This is done to spare caller
1054  * trouble of calculating the largest multiple of |r|. |r| can be viewed
1055  * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1056  * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1057  * padding and intermediate sub-block buffering, byte- or bitwise, is
1058  * caller's responsibility.
1059  */
SHA3_absorb(uint64_t A[5][5],const unsigned char * inp,size_t len,size_t r)1060 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1061                    size_t r)
1062 {
1063     uint64_t *A_flat = (uint64_t *)A;
1064     size_t i, w = r / 8;
1065 
1066     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1067 
1068     while (len >= r) {
1069         for (i = 0; i < w; i++) {
1070             uint64_t Ai = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
1071                           (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1072                           (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1073                           (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1074             inp += 8;
1075 
1076             A_flat[i] ^= BitInterleave(Ai);
1077         }
1078         KeccakF1600(A);
1079         len -= r;
1080     }
1081 
1082     return len;
1083 }
1084 
1085 /*
1086  * SHA3_squeeze is called once at the end to generate |out| hash value
1087  * of |len| bytes.
1088  */
SHA3_squeeze(uint64_t A[5][5],unsigned char * out,size_t len,size_t r)1089 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1090 {
1091     uint64_t *A_flat = (uint64_t *)A;
1092     size_t i, w = r / 8;
1093 
1094     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1095 
1096     while (len != 0) {
1097         for (i = 0; i < w && len != 0; i++) {
1098             uint64_t Ai = BitDeinterleave(A_flat[i]);
1099 
1100             if (len < 8) {
1101                 for (i = 0; i < len; i++) {
1102                     *out++ = (unsigned char)Ai;
1103                     Ai >>= 8;
1104                 }
1105                 return;
1106             }
1107 
1108             out[0] = (unsigned char)(Ai);
1109             out[1] = (unsigned char)(Ai >> 8);
1110             out[2] = (unsigned char)(Ai >> 16);
1111             out[3] = (unsigned char)(Ai >> 24);
1112             out[4] = (unsigned char)(Ai >> 32);
1113             out[5] = (unsigned char)(Ai >> 40);
1114             out[6] = (unsigned char)(Ai >> 48);
1115             out[7] = (unsigned char)(Ai >> 56);
1116             out += 8;
1117             len -= 8;
1118         }
1119         if (len)
1120             KeccakF1600(A);
1121     }
1122 }
1123 #endif
1124 
1125 #ifdef SELFTEST
1126 /*
1127  * Post-padding one-shot implementations would look as following:
1128  *
1129  * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1130  * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1131  * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1132  * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1133  * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1134  * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1135  */
1136 
SHA3_sponge(const unsigned char * inp,size_t len,unsigned char * out,size_t d,size_t r)1137 void SHA3_sponge(const unsigned char *inp, size_t len,
1138                  unsigned char *out, size_t d, size_t r)
1139 {
1140     uint64_t A[5][5];
1141 
1142     memset(A, 0, sizeof(A));
1143     SHA3_absorb(A, inp, len, r);
1144     SHA3_squeeze(A, out, d, r);
1145 }
1146 
1147 # include <stdio.h>
1148 
main()1149 int main()
1150 {
1151     /*
1152      * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1153      */
1154     unsigned char test[168] = { '\xf3', '\x3' };
1155     unsigned char out[512];
1156     size_t i;
1157     static const unsigned char result[512] = {
1158         0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1159         0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1160         0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1161         0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1162         0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1163         0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1164         0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1165         0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1166         0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1167         0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1168         0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1169         0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1170         0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1171         0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1172         0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1173         0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1174         0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1175         0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1176         0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1177         0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1178         0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1179         0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1180         0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1181         0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1182         0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1183         0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1184         0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1185         0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1186         0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1187         0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1188         0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1189         0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1190         0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1191         0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1192         0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1193         0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1194         0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1195         0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1196         0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1197         0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1198         0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1199         0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1200         0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1201         0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1202         0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1203         0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1204         0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1205         0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1206         0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1207         0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1208         0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1209         0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1210         0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1211         0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1212         0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1213         0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1214         0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1215         0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1216         0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1217         0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1218         0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1219         0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1220         0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1221         0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1222     };
1223 
1224     test[167] = '\x80';
1225     SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1226 
1227     /*
1228      * Rationale behind keeping output [formatted as below] is that
1229      * one should be able to redirect it to a file, then copy-n-paste
1230      * final "output val" from official example to another file, and
1231      * compare the two with diff(1).
1232      */
1233     for (i = 0; i < sizeof(out);) {
1234         printf("%02X", out[i]);
1235         printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1236     }
1237 
1238     if (memcmp(out,result,sizeof(out))) {
1239         fprintf(stderr,"failure\n");
1240         return 1;
1241     } else {
1242         fprintf(stderr,"success\n");
1243         return 0;
1244     }
1245 }
1246 #endif
1247