xref: /freebsd/crypto/openssl/crypto/sha/keccak1600.c (revision 315ee00f)
1 /*
2  * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13 
14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
15                    size_t r);
16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
17 
18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
19 
20 /*
21  * Choose some sensible defaults
22  */
23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24     !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25 # define KECCAK_2X      /* default to KECCAK_2X variant */
26 #endif
27 
28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
29     (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
30     defined(__mips) || defined(__riscv) || defined(__s390__) || \
31     defined(__EMSCRIPTEN__)
32 /*
33  * These don't have "and with complement" instruction, so minimize amount
34  * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
35  */
36 # define KECCAK_COMPLEMENTING_TRANSFORM
37 #endif
38 
39 #if defined(__x86_64__) || defined(__aarch64__) || \
40     defined(__mips64) || defined(__ia64) || \
41     (defined(__VMS) && !defined(__vax))
42 /*
43  * These are available even in ILP32 flavours, but even then they are
44  * capable of performing 64-bit operations as efficiently as in *P64.
45  * Since it's not given that we can use sizeof(void *), just shunt it.
46  */
47 # define BIT_INTERLEAVE (0)
48 #else
49 # define BIT_INTERLEAVE (sizeof(void *) < 8)
50 #endif
51 
52 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
53 
54 static uint64_t ROL64(uint64_t val, int offset)
55 {
56     if (offset == 0) {
57         return val;
58     } else if (!BIT_INTERLEAVE) {
59         return (val << offset) | (val >> (64-offset));
60     } else {
61         uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
62 
63         if (offset & 1) {
64             uint32_t tmp = hi;
65 
66             offset >>= 1;
67             hi = ROL32(lo, offset);
68             lo = ROL32(tmp, offset + 1);
69         } else {
70             offset >>= 1;
71             lo = ROL32(lo, offset);
72             hi = ROL32(hi, offset);
73         }
74 
75         return ((uint64_t)hi << 32) | lo;
76     }
77 }
78 
79 static const unsigned char rhotates[5][5] = {
80     {  0,  1, 62, 28, 27 },
81     { 36, 44,  6, 55, 20 },
82     {  3, 10, 43, 25, 39 },
83     { 41, 45, 15, 21,  8 },
84     { 18,  2, 61, 56, 14 }
85 };
86 
87 static const uint64_t iotas[] = {
88     BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
89     BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
90     BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
91     BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
92     BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
93     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
94     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
95     BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
96     BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
97     BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
98     BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
99     BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
100     BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
101     BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
102     BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
103     BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
104     BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
105     BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
106     BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
107     BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
108     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
109     BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
110     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
111     BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
112 };
113 
114 #if defined(KECCAK_REF)
115 /*
116  * This is straightforward or "maximum clarity" implementation aiming
117  * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
118  * Permutation-Based Hash and Extendible-Output Functions" as much as
119  * possible. With one caveat. Because of the way C stores matrices,
120  * references to A[x,y] in the specification are presented as A[y][x].
121  * Implementation unrolls inner x-loops so that modulo 5 operations are
122  * explicitly pre-computed.
123  */
124 static void Theta(uint64_t A[5][5])
125 {
126     uint64_t C[5], D[5];
127     size_t y;
128 
129     C[0] = A[0][0];
130     C[1] = A[0][1];
131     C[2] = A[0][2];
132     C[3] = A[0][3];
133     C[4] = A[0][4];
134 
135     for (y = 1; y < 5; y++) {
136         C[0] ^= A[y][0];
137         C[1] ^= A[y][1];
138         C[2] ^= A[y][2];
139         C[3] ^= A[y][3];
140         C[4] ^= A[y][4];
141     }
142 
143     D[0] = ROL64(C[1], 1) ^ C[4];
144     D[1] = ROL64(C[2], 1) ^ C[0];
145     D[2] = ROL64(C[3], 1) ^ C[1];
146     D[3] = ROL64(C[4], 1) ^ C[2];
147     D[4] = ROL64(C[0], 1) ^ C[3];
148 
149     for (y = 0; y < 5; y++) {
150         A[y][0] ^= D[0];
151         A[y][1] ^= D[1];
152         A[y][2] ^= D[2];
153         A[y][3] ^= D[3];
154         A[y][4] ^= D[4];
155     }
156 }
157 
158 static void Rho(uint64_t A[5][5])
159 {
160     size_t y;
161 
162     for (y = 0; y < 5; y++) {
163         A[y][0] = ROL64(A[y][0], rhotates[y][0]);
164         A[y][1] = ROL64(A[y][1], rhotates[y][1]);
165         A[y][2] = ROL64(A[y][2], rhotates[y][2]);
166         A[y][3] = ROL64(A[y][3], rhotates[y][3]);
167         A[y][4] = ROL64(A[y][4], rhotates[y][4]);
168     }
169 }
170 
171 static void Pi(uint64_t A[5][5])
172 {
173     uint64_t T[5][5];
174 
175     /*
176      * T = A
177      * A[y][x] = T[x][(3*y+x)%5]
178      */
179     memcpy(T, A, sizeof(T));
180 
181     A[0][0] = T[0][0];
182     A[0][1] = T[1][1];
183     A[0][2] = T[2][2];
184     A[0][3] = T[3][3];
185     A[0][4] = T[4][4];
186 
187     A[1][0] = T[0][3];
188     A[1][1] = T[1][4];
189     A[1][2] = T[2][0];
190     A[1][3] = T[3][1];
191     A[1][4] = T[4][2];
192 
193     A[2][0] = T[0][1];
194     A[2][1] = T[1][2];
195     A[2][2] = T[2][3];
196     A[2][3] = T[3][4];
197     A[2][4] = T[4][0];
198 
199     A[3][0] = T[0][4];
200     A[3][1] = T[1][0];
201     A[3][2] = T[2][1];
202     A[3][3] = T[3][2];
203     A[3][4] = T[4][3];
204 
205     A[4][0] = T[0][2];
206     A[4][1] = T[1][3];
207     A[4][2] = T[2][4];
208     A[4][3] = T[3][0];
209     A[4][4] = T[4][1];
210 }
211 
212 static void Chi(uint64_t A[5][5])
213 {
214     uint64_t C[5];
215     size_t y;
216 
217     for (y = 0; y < 5; y++) {
218         C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
219         C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
220         C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
221         C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
222         C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
223 
224         A[y][0] = C[0];
225         A[y][1] = C[1];
226         A[y][2] = C[2];
227         A[y][3] = C[3];
228         A[y][4] = C[4];
229     }
230 }
231 
232 static void Iota(uint64_t A[5][5], size_t i)
233 {
234     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
235     A[0][0] ^= iotas[i];
236 }
237 
238 static void KeccakF1600(uint64_t A[5][5])
239 {
240     size_t i;
241 
242     for (i = 0; i < 24; i++) {
243         Theta(A);
244         Rho(A);
245         Pi(A);
246         Chi(A);
247         Iota(A, i);
248     }
249 }
250 
251 #elif defined(KECCAK_1X)
252 /*
253  * This implementation is optimization of above code featuring unroll
254  * of even y-loops, their fusion and code motion. It also minimizes
255  * temporary storage. Compiler would normally do all these things for
256  * you, purpose of manual optimization is to provide "unobscured"
257  * reference for assembly implementation [in case this approach is
258  * chosen for implementation on some platform]. In the nutshell it's
259  * equivalent of "plane-per-plane processing" approach discussed in
260  * section 2.4 of "Keccak implementation overview".
261  */
262 static void Round(uint64_t A[5][5], size_t i)
263 {
264     uint64_t C[5], E[2];        /* registers */
265     uint64_t D[5], T[2][5];     /* memory    */
266 
267     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
268 
269     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
270     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
271     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
272     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
273     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
274 
275 #if defined(__arm__)
276     D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
277     D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
278     D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
279     D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
280     D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
281 
282     T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
283     T[0][1] = A[0][1] ^ E[0]; /* D[1] */
284     T[0][2] = A[0][2] ^ C[1]; /* D[2] */
285     T[0][3] = A[0][3] ^ C[2]; /* D[3] */
286     T[0][4] = A[0][4] ^ E[1]; /* D[4] */
287 
288     C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
289     C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
290     C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
291     C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
292     C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
293 #else
294     D[0] = ROL64(C[1], 1) ^ C[4];
295     D[1] = ROL64(C[2], 1) ^ C[0];
296     D[2] = ROL64(C[3], 1) ^ C[1];
297     D[3] = ROL64(C[4], 1) ^ C[2];
298     D[4] = ROL64(C[0], 1) ^ C[3];
299 
300     T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
301     T[0][1] = A[0][1] ^ D[1];
302     T[0][2] = A[0][2] ^ D[2];
303     T[0][3] = A[0][3] ^ D[3];
304     T[0][4] = A[0][4] ^ D[4];
305 
306     C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
307     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
308     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
309     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
310     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
311 #endif
312     A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
313     A[0][1] = C[1] ^ (~C[2] & C[3]);
314     A[0][2] = C[2] ^ (~C[3] & C[4]);
315     A[0][3] = C[3] ^ (~C[4] & C[0]);
316     A[0][4] = C[4] ^ (~C[0] & C[1]);
317 
318     T[1][0] = A[1][0] ^ (C[3] = D[0]);
319     T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
320     T[1][2] = A[1][2] ^ (E[0] = D[2]);
321     T[1][3] = A[1][3] ^ (E[1] = D[3]);
322     T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
323 
324     C[0] = ROL64(T[0][3],        rhotates[0][3]);
325     C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]);   /* D[4] */
326     C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]);   /* D[0] */
327     C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]);   /* D[1] */
328     C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]);   /* D[2] */
329 
330     A[1][0] = C[0] ^ (~C[1] & C[2]);
331     A[1][1] = C[1] ^ (~C[2] & C[3]);
332     A[1][2] = C[2] ^ (~C[3] & C[4]);
333     A[1][3] = C[3] ^ (~C[4] & C[0]);
334     A[1][4] = C[4] ^ (~C[0] & C[1]);
335 
336     C[0] = ROL64(T[0][1],        rhotates[0][1]);
337     C[1] = ROL64(T[1][2],        rhotates[1][2]);
338     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
339     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
340     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
341 
342     A[2][0] = C[0] ^ (~C[1] & C[2]);
343     A[2][1] = C[1] ^ (~C[2] & C[3]);
344     A[2][2] = C[2] ^ (~C[3] & C[4]);
345     A[2][3] = C[3] ^ (~C[4] & C[0]);
346     A[2][4] = C[4] ^ (~C[0] & C[1]);
347 
348     C[0] = ROL64(T[0][4],        rhotates[0][4]);
349     C[1] = ROL64(T[1][0],        rhotates[1][0]);
350     C[2] = ROL64(T[1][1],        rhotates[2][1]); /* originally A[2][1] */
351     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
352     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
353 
354     A[3][0] = C[0] ^ (~C[1] & C[2]);
355     A[3][1] = C[1] ^ (~C[2] & C[3]);
356     A[3][2] = C[2] ^ (~C[3] & C[4]);
357     A[3][3] = C[3] ^ (~C[4] & C[0]);
358     A[3][4] = C[4] ^ (~C[0] & C[1]);
359 
360     C[0] = ROL64(T[0][2],        rhotates[0][2]);
361     C[1] = ROL64(T[1][3],        rhotates[1][3]);
362     C[2] = ROL64(T[1][4],        rhotates[2][4]); /* originally A[2][4] */
363     C[3] = ROL64(T[0][0],        rhotates[3][0]); /* originally A[3][0] */
364     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
365 
366     A[4][0] = C[0] ^ (~C[1] & C[2]);
367     A[4][1] = C[1] ^ (~C[2] & C[3]);
368     A[4][2] = C[2] ^ (~C[3] & C[4]);
369     A[4][3] = C[3] ^ (~C[4] & C[0]);
370     A[4][4] = C[4] ^ (~C[0] & C[1]);
371 }
372 
373 static void KeccakF1600(uint64_t A[5][5])
374 {
375     size_t i;
376 
377     for (i = 0; i < 24; i++) {
378         Round(A, i);
379     }
380 }
381 
382 #elif defined(KECCAK_1X_ALT)
383 /*
384  * This is variant of above KECCAK_1X that reduces requirement for
385  * temporary storage even further, but at cost of more updates to A[][].
386  * It's less suitable if A[][] is memory bound, but better if it's
387  * register bound.
388  */
389 
390 static void Round(uint64_t A[5][5], size_t i)
391 {
392     uint64_t C[5], D[5];
393 
394     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
395 
396     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
397     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
398     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
399     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
400     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
401 
402     D[1] = C[0] ^  ROL64(C[2], 1);
403     D[2] = C[1] ^  ROL64(C[3], 1);
404     D[3] = C[2] ^= ROL64(C[4], 1);
405     D[4] = C[3] ^= ROL64(C[0], 1);
406     D[0] = C[4] ^= ROL64(C[1], 1);
407 
408     A[0][1] ^= D[1];
409     A[1][1] ^= D[1];
410     A[2][1] ^= D[1];
411     A[3][1] ^= D[1];
412     A[4][1] ^= D[1];
413 
414     A[0][2] ^= D[2];
415     A[1][2] ^= D[2];
416     A[2][2] ^= D[2];
417     A[3][2] ^= D[2];
418     A[4][2] ^= D[2];
419 
420     A[0][3] ^= C[2];
421     A[1][3] ^= C[2];
422     A[2][3] ^= C[2];
423     A[3][3] ^= C[2];
424     A[4][3] ^= C[2];
425 
426     A[0][4] ^= C[3];
427     A[1][4] ^= C[3];
428     A[2][4] ^= C[3];
429     A[3][4] ^= C[3];
430     A[4][4] ^= C[3];
431 
432     A[0][0] ^= C[4];
433     A[1][0] ^= C[4];
434     A[2][0] ^= C[4];
435     A[3][0] ^= C[4];
436     A[4][0] ^= C[4];
437 
438     C[1] = A[0][1];
439     C[2] = A[0][2];
440     C[3] = A[0][3];
441     C[4] = A[0][4];
442 
443     A[0][1] = ROL64(A[1][1], rhotates[1][1]);
444     A[0][2] = ROL64(A[2][2], rhotates[2][2]);
445     A[0][3] = ROL64(A[3][3], rhotates[3][3]);
446     A[0][4] = ROL64(A[4][4], rhotates[4][4]);
447 
448     A[1][1] = ROL64(A[1][4], rhotates[1][4]);
449     A[2][2] = ROL64(A[2][3], rhotates[2][3]);
450     A[3][3] = ROL64(A[3][2], rhotates[3][2]);
451     A[4][4] = ROL64(A[4][1], rhotates[4][1]);
452 
453     A[1][4] = ROL64(A[4][2], rhotates[4][2]);
454     A[2][3] = ROL64(A[3][4], rhotates[3][4]);
455     A[3][2] = ROL64(A[2][1], rhotates[2][1]);
456     A[4][1] = ROL64(A[1][3], rhotates[1][3]);
457 
458     A[4][2] = ROL64(A[2][4], rhotates[2][4]);
459     A[3][4] = ROL64(A[4][3], rhotates[4][3]);
460     A[2][1] = ROL64(A[1][2], rhotates[1][2]);
461     A[1][3] = ROL64(A[3][1], rhotates[3][1]);
462 
463     A[2][4] = ROL64(A[4][0], rhotates[4][0]);
464     A[4][3] = ROL64(A[3][0], rhotates[3][0]);
465     A[1][2] = ROL64(A[2][0], rhotates[2][0]);
466     A[3][1] = ROL64(A[1][0], rhotates[1][0]);
467 
468     A[1][0] = ROL64(C[3],    rhotates[0][3]);
469     A[2][0] = ROL64(C[1],    rhotates[0][1]);
470     A[3][0] = ROL64(C[4],    rhotates[0][4]);
471     A[4][0] = ROL64(C[2],    rhotates[0][2]);
472 
473     C[0] = A[0][0];
474     C[1] = A[1][0];
475     D[0] = A[0][1];
476     D[1] = A[1][1];
477 
478     A[0][0] ^= (~A[0][1] & A[0][2]);
479     A[1][0] ^= (~A[1][1] & A[1][2]);
480     A[0][1] ^= (~A[0][2] & A[0][3]);
481     A[1][1] ^= (~A[1][2] & A[1][3]);
482     A[0][2] ^= (~A[0][3] & A[0][4]);
483     A[1][2] ^= (~A[1][3] & A[1][4]);
484     A[0][3] ^= (~A[0][4] & C[0]);
485     A[1][3] ^= (~A[1][4] & C[1]);
486     A[0][4] ^= (~C[0]    & D[0]);
487     A[1][4] ^= (~C[1]    & D[1]);
488 
489     C[2] = A[2][0];
490     C[3] = A[3][0];
491     D[2] = A[2][1];
492     D[3] = A[3][1];
493 
494     A[2][0] ^= (~A[2][1] & A[2][2]);
495     A[3][0] ^= (~A[3][1] & A[3][2]);
496     A[2][1] ^= (~A[2][2] & A[2][3]);
497     A[3][1] ^= (~A[3][2] & A[3][3]);
498     A[2][2] ^= (~A[2][3] & A[2][4]);
499     A[3][2] ^= (~A[3][3] & A[3][4]);
500     A[2][3] ^= (~A[2][4] & C[2]);
501     A[3][3] ^= (~A[3][4] & C[3]);
502     A[2][4] ^= (~C[2]    & D[2]);
503     A[3][4] ^= (~C[3]    & D[3]);
504 
505     C[4] = A[4][0];
506     D[4] = A[4][1];
507 
508     A[4][0] ^= (~A[4][1] & A[4][2]);
509     A[4][1] ^= (~A[4][2] & A[4][3]);
510     A[4][2] ^= (~A[4][3] & A[4][4]);
511     A[4][3] ^= (~A[4][4] & C[4]);
512     A[4][4] ^= (~C[4]    & D[4]);
513     A[0][0] ^= iotas[i];
514 }
515 
516 static void KeccakF1600(uint64_t A[5][5])
517 {
518     size_t i;
519 
520     for (i = 0; i < 24; i++) {
521         Round(A, i);
522     }
523 }
524 
525 #elif defined(KECCAK_2X)
526 /*
527  * This implementation is variant of KECCAK_1X above with outer-most
528  * round loop unrolled twice. This allows to take temporary storage
529  * out of round procedure and simplify references to it by alternating
530  * it with actual data (see round loop below). Originally it was meant
531  * rather as reference for an assembly implementation, but it seems to
532  * play best with compilers [as well as provide best instruction per
533  * processed byte ratio at minimal round unroll factor]...
534  */
535 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
536 {
537     uint64_t C[5], D[5];
538 
539     assert(i < (sizeof(iotas) / sizeof(iotas[0])));
540 
541     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
542     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
543     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
544     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
545     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
546 
547     D[0] = ROL64(C[1], 1) ^ C[4];
548     D[1] = ROL64(C[2], 1) ^ C[0];
549     D[2] = ROL64(C[3], 1) ^ C[1];
550     D[3] = ROL64(C[4], 1) ^ C[2];
551     D[4] = ROL64(C[0], 1) ^ C[3];
552 
553     C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
554     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
555     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
556     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
557     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
558 
559 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
560     R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
561     R[0][1] = C[1] ^ (~C[2] | C[3]);
562     R[0][2] = C[2] ^ ( C[3] & C[4]);
563     R[0][3] = C[3] ^ ( C[4] | C[0]);
564     R[0][4] = C[4] ^ ( C[0] & C[1]);
565 #else
566     R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
567     R[0][1] = C[1] ^ (~C[2] & C[3]);
568     R[0][2] = C[2] ^ (~C[3] & C[4]);
569     R[0][3] = C[3] ^ (~C[4] & C[0]);
570     R[0][4] = C[4] ^ (~C[0] & C[1]);
571 #endif
572 
573     C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
574     C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
575     C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
576     C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
577     C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
578 
579 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
580     R[1][0] = C[0] ^ (C[1] |  C[2]);
581     R[1][1] = C[1] ^ (C[2] &  C[3]);
582     R[1][2] = C[2] ^ (C[3] | ~C[4]);
583     R[1][3] = C[3] ^ (C[4] |  C[0]);
584     R[1][4] = C[4] ^ (C[0] &  C[1]);
585 #else
586     R[1][0] = C[0] ^ (~C[1] & C[2]);
587     R[1][1] = C[1] ^ (~C[2] & C[3]);
588     R[1][2] = C[2] ^ (~C[3] & C[4]);
589     R[1][3] = C[3] ^ (~C[4] & C[0]);
590     R[1][4] = C[4] ^ (~C[0] & C[1]);
591 #endif
592 
593     C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
594     C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
595     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
596     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
597     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
598 
599 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
600     R[2][0] =  C[0] ^ ( C[1] | C[2]);
601     R[2][1] =  C[1] ^ ( C[2] & C[3]);
602     R[2][2] =  C[2] ^ (~C[3] & C[4]);
603     R[2][3] = ~C[3] ^ ( C[4] | C[0]);
604     R[2][4] =  C[4] ^ ( C[0] & C[1]);
605 #else
606     R[2][0] = C[0] ^ (~C[1] & C[2]);
607     R[2][1] = C[1] ^ (~C[2] & C[3]);
608     R[2][2] = C[2] ^ (~C[3] & C[4]);
609     R[2][3] = C[3] ^ (~C[4] & C[0]);
610     R[2][4] = C[4] ^ (~C[0] & C[1]);
611 #endif
612 
613     C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
614     C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
615     C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
616     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
617     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
618 
619 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
620     R[3][0] =  C[0] ^ ( C[1] & C[2]);
621     R[3][1] =  C[1] ^ ( C[2] | C[3]);
622     R[3][2] =  C[2] ^ (~C[3] | C[4]);
623     R[3][3] = ~C[3] ^ ( C[4] & C[0]);
624     R[3][4] =  C[4] ^ ( C[0] | C[1]);
625 #else
626     R[3][0] = C[0] ^ (~C[1] & C[2]);
627     R[3][1] = C[1] ^ (~C[2] & C[3]);
628     R[3][2] = C[2] ^ (~C[3] & C[4]);
629     R[3][3] = C[3] ^ (~C[4] & C[0]);
630     R[3][4] = C[4] ^ (~C[0] & C[1]);
631 #endif
632 
633     C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
634     C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
635     C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
636     C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
637     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
638 
639 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
640     R[4][0] =  C[0] ^ (~C[1] & C[2]);
641     R[4][1] = ~C[1] ^ ( C[2] | C[3]);
642     R[4][2] =  C[2] ^ ( C[3] & C[4]);
643     R[4][3] =  C[3] ^ ( C[4] | C[0]);
644     R[4][4] =  C[4] ^ ( C[0] & C[1]);
645 #else
646     R[4][0] = C[0] ^ (~C[1] & C[2]);
647     R[4][1] = C[1] ^ (~C[2] & C[3]);
648     R[4][2] = C[2] ^ (~C[3] & C[4]);
649     R[4][3] = C[3] ^ (~C[4] & C[0]);
650     R[4][4] = C[4] ^ (~C[0] & C[1]);
651 #endif
652 }
653 
654 static void KeccakF1600(uint64_t A[5][5])
655 {
656     uint64_t T[5][5];
657     size_t i;
658 
659 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
660     A[0][1] = ~A[0][1];
661     A[0][2] = ~A[0][2];
662     A[1][3] = ~A[1][3];
663     A[2][2] = ~A[2][2];
664     A[3][2] = ~A[3][2];
665     A[4][0] = ~A[4][0];
666 #endif
667 
668     for (i = 0; i < 24; i += 2) {
669         Round(T, A, i);
670         Round(A, T, i + 1);
671     }
672 
673 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
674     A[0][1] = ~A[0][1];
675     A[0][2] = ~A[0][2];
676     A[1][3] = ~A[1][3];
677     A[2][2] = ~A[2][2];
678     A[3][2] = ~A[3][2];
679     A[4][0] = ~A[4][0];
680 #endif
681 }
682 
683 #else   /* define KECCAK_INPLACE to compile this code path */
684 /*
685  * This implementation is KECCAK_1X from above combined 4 times with
686  * a twist that allows to omit temporary storage and perform in-place
687  * processing. It's discussed in section 2.5 of "Keccak implementation
688  * overview". It's likely to be best suited for processors with large
689  * register bank... On the other hand processor with large register
690  * bank can as well use KECCAK_1X_ALT, it would be as fast but much
691  * more compact...
692  */
693 static void FourRounds(uint64_t A[5][5], size_t i)
694 {
695     uint64_t B[5], C[5], D[5];
696 
697     assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
698 
699     /* Round 4*n */
700     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
701     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
702     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
703     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
704     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
705 
706     D[0] = ROL64(C[1], 1) ^ C[4];
707     D[1] = ROL64(C[2], 1) ^ C[0];
708     D[2] = ROL64(C[3], 1) ^ C[1];
709     D[3] = ROL64(C[4], 1) ^ C[2];
710     D[4] = ROL64(C[0], 1) ^ C[3];
711 
712     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
713     B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
714     B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
715     B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
716     B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
717 
718     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
719     C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
720     C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
721     C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
722     C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
723 
724     B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
725     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
726     B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
727     B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
728     B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
729 
730     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
731     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
732     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
733     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
734     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
735 
736     B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
737     B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
738     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
739     B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
740     B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
741 
742     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
743     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
744     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
745     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
746     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
747 
748     B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
749     B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
750     B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
751     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
752     B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
753 
754     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
755     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
756     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
757     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
758     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
759 
760     B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
761     B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
762     B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
763     B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
764     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
765 
766     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
767     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
768     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
769     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
770     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
771 
772     /* Round 4*n+1 */
773     D[0] = ROL64(C[1], 1) ^ C[4];
774     D[1] = ROL64(C[2], 1) ^ C[0];
775     D[2] = ROL64(C[3], 1) ^ C[1];
776     D[3] = ROL64(C[4], 1) ^ C[2];
777     D[4] = ROL64(C[0], 1) ^ C[3];
778 
779     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
780     B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
781     B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
782     B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
783     B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
784 
785     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
786     C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
787     C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
788     C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
789     C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
790 
791     B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
792     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
793     B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
794     B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
795     B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
796 
797     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
798     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
799     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
800     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
801     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
802 
803     B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
804     B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
805     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
806     B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
807     B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
808 
809     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
810     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
811     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
812     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
813     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
814 
815     B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
816     B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
817     B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
818     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
819     B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
820 
821     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
822     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
823     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
824     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
825     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
826 
827     B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
828     B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
829     B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
830     B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
831     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
832 
833     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
834     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
835     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
836     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
837     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
838 
839     /* Round 4*n+2 */
840     D[0] = ROL64(C[1], 1) ^ C[4];
841     D[1] = ROL64(C[2], 1) ^ C[0];
842     D[2] = ROL64(C[3], 1) ^ C[1];
843     D[3] = ROL64(C[4], 1) ^ C[2];
844     D[4] = ROL64(C[0], 1) ^ C[3];
845 
846     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
847     B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
848     B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
849     B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
850     B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
851 
852     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
853     C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
854     C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
855     C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
856     C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
857 
858     B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
859     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
860     B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
861     B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
862     B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
863 
864     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
865     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
866     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
867     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
868     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
869 
870     B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
871     B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
872     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
873     B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
874     B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
875 
876     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
877     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
878     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
879     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
880     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
881 
882     B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
883     B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
884     B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
885     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
886     B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
887 
888     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
889     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
890     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
891     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
892     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
893 
894     B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
895     B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
896     B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
897     B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
898     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
899 
900     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
901     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
902     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
903     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
904     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
905 
906     /* Round 4*n+3 */
907     D[0] = ROL64(C[1], 1) ^ C[4];
908     D[1] = ROL64(C[2], 1) ^ C[0];
909     D[2] = ROL64(C[3], 1) ^ C[1];
910     D[3] = ROL64(C[4], 1) ^ C[2];
911     D[4] = ROL64(C[0], 1) ^ C[3];
912 
913     B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
914     B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
915     B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
916     B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
917     B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
918 
919     /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
920     /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
921     /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
922     /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
923     /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
924 
925     B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
926     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
927     B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
928     B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
929     B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
930 
931     /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
932     /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
933     /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
934     /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
935     /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
936 
937     B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
938     B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
939     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
940     B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
941     B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
942 
943     /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
944     /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
945     /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
946     /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
947     /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
948 
949     B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
950     B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
951     B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
952     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
953     B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
954 
955     /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
956     /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
957     /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
958     /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
959     /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
960 
961     B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
962     B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
963     B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
964     B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
965     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
966 
967     /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
968     /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
969     /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
970     /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
971     /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
972 }
973 
974 static void KeccakF1600(uint64_t A[5][5])
975 {
976     size_t i;
977 
978     for (i = 0; i < 24; i += 4) {
979         FourRounds(A, i);
980     }
981 }
982 
983 #endif
984 
985 static uint64_t BitInterleave(uint64_t Ai)
986 {
987     if (BIT_INTERLEAVE) {
988         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
989         uint32_t t0, t1;
990 
991         t0 = lo & 0x55555555;
992         t0 |= t0 >> 1;  t0 &= 0x33333333;
993         t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
994         t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
995         t0 |= t0 >> 8;  t0 &= 0x0000ffff;
996 
997         t1 = hi & 0x55555555;
998         t1 |= t1 >> 1;  t1 &= 0x33333333;
999         t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
1000         t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
1001         t1 |= t1 >> 8;  t1 <<= 16;
1002 
1003         lo &= 0xaaaaaaaa;
1004         lo |= lo << 1;  lo &= 0xcccccccc;
1005         lo |= lo << 2;  lo &= 0xf0f0f0f0;
1006         lo |= lo << 4;  lo &= 0xff00ff00;
1007         lo |= lo << 8;  lo >>= 16;
1008 
1009         hi &= 0xaaaaaaaa;
1010         hi |= hi << 1;  hi &= 0xcccccccc;
1011         hi |= hi << 2;  hi &= 0xf0f0f0f0;
1012         hi |= hi << 4;  hi &= 0xff00ff00;
1013         hi |= hi << 8;  hi &= 0xffff0000;
1014 
1015         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1016     }
1017 
1018     return Ai;
1019 }
1020 
1021 static uint64_t BitDeinterleave(uint64_t Ai)
1022 {
1023     if (BIT_INTERLEAVE) {
1024         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1025         uint32_t t0, t1;
1026 
1027         t0 = lo & 0x0000ffff;
1028         t0 |= t0 << 8;  t0 &= 0x00ff00ff;
1029         t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
1030         t0 |= t0 << 2;  t0 &= 0x33333333;
1031         t0 |= t0 << 1;  t0 &= 0x55555555;
1032 
1033         t1 = hi << 16;
1034         t1 |= t1 >> 8;  t1 &= 0xff00ff00;
1035         t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
1036         t1 |= t1 >> 2;  t1 &= 0xcccccccc;
1037         t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;
1038 
1039         lo >>= 16;
1040         lo |= lo << 8;  lo &= 0x00ff00ff;
1041         lo |= lo << 4;  lo &= 0x0f0f0f0f;
1042         lo |= lo << 2;  lo &= 0x33333333;
1043         lo |= lo << 1;  lo &= 0x55555555;
1044 
1045         hi &= 0xffff0000;
1046         hi |= hi >> 8;  hi &= 0xff00ff00;
1047         hi |= hi >> 4;  hi &= 0xf0f0f0f0;
1048         hi |= hi >> 2;  hi &= 0xcccccccc;
1049         hi |= hi >> 1;  hi &= 0xaaaaaaaa;
1050 
1051         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1052     }
1053 
1054     return Ai;
1055 }
1056 
1057 /*
1058  * SHA3_absorb can be called multiple times, but at each invocation
1059  * largest multiple of |r| out of |len| bytes are processed. Then
1060  * remaining amount of bytes is returned. This is done to spare caller
1061  * trouble of calculating the largest multiple of |r|. |r| can be viewed
1062  * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1063  * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1064  * padding and intermediate sub-block buffering, byte- or bitwise, is
1065  * caller's responsibility.
1066  */
1067 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1068                    size_t r)
1069 {
1070     uint64_t *A_flat = (uint64_t *)A;
1071     size_t i, w = r / 8;
1072 
1073     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1074 
1075     while (len >= r) {
1076         for (i = 0; i < w; i++) {
1077             uint64_t Ai = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
1078                           (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1079                           (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1080                           (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1081             inp += 8;
1082 
1083             A_flat[i] ^= BitInterleave(Ai);
1084         }
1085         KeccakF1600(A);
1086         len -= r;
1087     }
1088 
1089     return len;
1090 }
1091 
1092 /*
1093  * sha3_squeeze is called once at the end to generate |out| hash value
1094  * of |len| bytes.
1095  */
1096 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1097 {
1098     uint64_t *A_flat = (uint64_t *)A;
1099     size_t i, w = r / 8;
1100 
1101     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1102 
1103     while (len != 0) {
1104         for (i = 0; i < w && len != 0; i++) {
1105             uint64_t Ai = BitDeinterleave(A_flat[i]);
1106 
1107             if (len < 8) {
1108                 for (i = 0; i < len; i++) {
1109                     *out++ = (unsigned char)Ai;
1110                     Ai >>= 8;
1111                 }
1112                 return;
1113             }
1114 
1115             out[0] = (unsigned char)(Ai);
1116             out[1] = (unsigned char)(Ai >> 8);
1117             out[2] = (unsigned char)(Ai >> 16);
1118             out[3] = (unsigned char)(Ai >> 24);
1119             out[4] = (unsigned char)(Ai >> 32);
1120             out[5] = (unsigned char)(Ai >> 40);
1121             out[6] = (unsigned char)(Ai >> 48);
1122             out[7] = (unsigned char)(Ai >> 56);
1123             out += 8;
1124             len -= 8;
1125         }
1126         if (len)
1127             KeccakF1600(A);
1128     }
1129 }
1130 #endif
1131 
1132 #ifdef SELFTEST
1133 /*
1134  * Post-padding one-shot implementations would look as following:
1135  *
1136  * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1137  * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1138  * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1139  * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1140  * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1141  * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1142  */
1143 
1144 void SHA3_sponge(const unsigned char *inp, size_t len,
1145                  unsigned char *out, size_t d, size_t r)
1146 {
1147     uint64_t A[5][5];
1148 
1149     memset(A, 0, sizeof(A));
1150     SHA3_absorb(A, inp, len, r);
1151     SHA3_squeeze(A, out, d, r);
1152 }
1153 
1154 # include <stdio.h>
1155 
1156 int main()
1157 {
1158     /*
1159      * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1160      */
1161     unsigned char test[168] = { '\xf3', '\x3' };
1162     unsigned char out[512];
1163     size_t i;
1164     static const unsigned char result[512] = {
1165         0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1166         0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1167         0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1168         0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1169         0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1170         0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1171         0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1172         0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1173         0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1174         0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1175         0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1176         0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1177         0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1178         0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1179         0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1180         0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1181         0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1182         0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1183         0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1184         0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1185         0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1186         0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1187         0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1188         0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1189         0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1190         0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1191         0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1192         0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1193         0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1194         0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1195         0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1196         0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1197         0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1198         0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1199         0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1200         0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1201         0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1202         0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1203         0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1204         0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1205         0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1206         0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1207         0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1208         0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1209         0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1210         0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1211         0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1212         0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1213         0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1214         0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1215         0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1216         0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1217         0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1218         0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1219         0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1220         0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1221         0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1222         0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1223         0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1224         0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1225         0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1226         0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1227         0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1228         0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1229     };
1230 
1231     test[167] = '\x80';
1232     SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1233 
1234     /*
1235      * Rationale behind keeping output [formatted as below] is that
1236      * one should be able to redirect it to a file, then copy-n-paste
1237      * final "output val" from official example to another file, and
1238      * compare the two with diff(1).
1239      */
1240     for (i = 0; i < sizeof(out);) {
1241         printf("%02X", out[i]);
1242         printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1243     }
1244 
1245     if (memcmp(out,result,sizeof(out))) {
1246         fprintf(stderr,"failure\n");
1247         return 1;
1248     } else {
1249         fprintf(stderr,"success\n");
1250         return 0;
1251     }
1252 }
1253 #endif
1254