1 /* 2 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. 3 * 4 * Licensed under the Apache License 2.0 (the "License"). You may not use 5 * this file except in compliance with the License. You can obtain a copy 6 * in the file LICENSE in the source distribution or at 7 * https://www.openssl.org/source/license.html 8 */ 9 10 #include <openssl/e_os2.h> 11 #include <string.h> 12 #include <assert.h> 13 14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, 15 size_t r); 16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r); 17 18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST) 19 20 /* 21 * Choose some sensible defaults 22 */ 23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \ 24 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE) 25 # define KECCAK_2X /* default to KECCAK_2X variant */ 26 #endif 27 28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ 29 (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \ 30 defined(__mips) || defined(__riscv) || defined(__s390__) || \ 31 defined(__EMSCRIPTEN__) 32 /* 33 * These don't have "and with complement" instruction, so minimize amount 34 * of "not"-s. Implemented only in the [default] KECCAK_2X variant. 35 */ 36 # define KECCAK_COMPLEMENTING_TRANSFORM 37 #endif 38 39 #if defined(__x86_64__) || defined(__aarch64__) || \ 40 defined(__mips64) || defined(__ia64) || \ 41 (defined(__VMS) && !defined(__vax)) 42 /* 43 * These are available even in ILP32 flavours, but even then they are 44 * capable of performing 64-bit operations as efficiently as in *P64. 45 * Since it's not given that we can use sizeof(void *), just shunt it. 46 */ 47 # define BIT_INTERLEAVE (0) 48 #else 49 # define BIT_INTERLEAVE (sizeof(void *) < 8) 50 #endif 51 52 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31))) 53 54 static uint64_t ROL64(uint64_t val, int offset) 55 { 56 if (offset == 0) { 57 return val; 58 } else if (!BIT_INTERLEAVE) { 59 return (val << offset) | (val >> (64-offset)); 60 } else { 61 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val; 62 63 if (offset & 1) { 64 uint32_t tmp = hi; 65 66 offset >>= 1; 67 hi = ROL32(lo, offset); 68 lo = ROL32(tmp, offset + 1); 69 } else { 70 offset >>= 1; 71 lo = ROL32(lo, offset); 72 hi = ROL32(hi, offset); 73 } 74 75 return ((uint64_t)hi << 32) | lo; 76 } 77 } 78 79 static const unsigned char rhotates[5][5] = { 80 { 0, 1, 62, 28, 27 }, 81 { 36, 44, 6, 55, 20 }, 82 { 3, 10, 43, 25, 39 }, 83 { 41, 45, 15, 21, 8 }, 84 { 18, 2, 61, 56, 14 } 85 }; 86 87 static const uint64_t iotas[] = { 88 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL, 89 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL, 90 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL, 91 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL, 92 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL, 93 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL, 94 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL, 95 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL, 96 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL, 97 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL, 98 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL, 99 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL, 100 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL, 101 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL, 102 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL, 103 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL, 104 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL, 105 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL, 106 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL, 107 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL, 108 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL, 109 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL, 110 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL, 111 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL 112 }; 113 114 #if defined(KECCAK_REF) 115 /* 116 * This is straightforward or "maximum clarity" implementation aiming 117 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard: 118 * Permutation-Based Hash and Extendible-Output Functions" as much as 119 * possible. With one caveat. Because of the way C stores matrices, 120 * references to A[x,y] in the specification are presented as A[y][x]. 121 * Implementation unrolls inner x-loops so that modulo 5 operations are 122 * explicitly pre-computed. 123 */ 124 static void Theta(uint64_t A[5][5]) 125 { 126 uint64_t C[5], D[5]; 127 size_t y; 128 129 C[0] = A[0][0]; 130 C[1] = A[0][1]; 131 C[2] = A[0][2]; 132 C[3] = A[0][3]; 133 C[4] = A[0][4]; 134 135 for (y = 1; y < 5; y++) { 136 C[0] ^= A[y][0]; 137 C[1] ^= A[y][1]; 138 C[2] ^= A[y][2]; 139 C[3] ^= A[y][3]; 140 C[4] ^= A[y][4]; 141 } 142 143 D[0] = ROL64(C[1], 1) ^ C[4]; 144 D[1] = ROL64(C[2], 1) ^ C[0]; 145 D[2] = ROL64(C[3], 1) ^ C[1]; 146 D[3] = ROL64(C[4], 1) ^ C[2]; 147 D[4] = ROL64(C[0], 1) ^ C[3]; 148 149 for (y = 0; y < 5; y++) { 150 A[y][0] ^= D[0]; 151 A[y][1] ^= D[1]; 152 A[y][2] ^= D[2]; 153 A[y][3] ^= D[3]; 154 A[y][4] ^= D[4]; 155 } 156 } 157 158 static void Rho(uint64_t A[5][5]) 159 { 160 size_t y; 161 162 for (y = 0; y < 5; y++) { 163 A[y][0] = ROL64(A[y][0], rhotates[y][0]); 164 A[y][1] = ROL64(A[y][1], rhotates[y][1]); 165 A[y][2] = ROL64(A[y][2], rhotates[y][2]); 166 A[y][3] = ROL64(A[y][3], rhotates[y][3]); 167 A[y][4] = ROL64(A[y][4], rhotates[y][4]); 168 } 169 } 170 171 static void Pi(uint64_t A[5][5]) 172 { 173 uint64_t T[5][5]; 174 175 /* 176 * T = A 177 * A[y][x] = T[x][(3*y+x)%5] 178 */ 179 memcpy(T, A, sizeof(T)); 180 181 A[0][0] = T[0][0]; 182 A[0][1] = T[1][1]; 183 A[0][2] = T[2][2]; 184 A[0][3] = T[3][3]; 185 A[0][4] = T[4][4]; 186 187 A[1][0] = T[0][3]; 188 A[1][1] = T[1][4]; 189 A[1][2] = T[2][0]; 190 A[1][3] = T[3][1]; 191 A[1][4] = T[4][2]; 192 193 A[2][0] = T[0][1]; 194 A[2][1] = T[1][2]; 195 A[2][2] = T[2][3]; 196 A[2][3] = T[3][4]; 197 A[2][4] = T[4][0]; 198 199 A[3][0] = T[0][4]; 200 A[3][1] = T[1][0]; 201 A[3][2] = T[2][1]; 202 A[3][3] = T[3][2]; 203 A[3][4] = T[4][3]; 204 205 A[4][0] = T[0][2]; 206 A[4][1] = T[1][3]; 207 A[4][2] = T[2][4]; 208 A[4][3] = T[3][0]; 209 A[4][4] = T[4][1]; 210 } 211 212 static void Chi(uint64_t A[5][5]) 213 { 214 uint64_t C[5]; 215 size_t y; 216 217 for (y = 0; y < 5; y++) { 218 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]); 219 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]); 220 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]); 221 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]); 222 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]); 223 224 A[y][0] = C[0]; 225 A[y][1] = C[1]; 226 A[y][2] = C[2]; 227 A[y][3] = C[3]; 228 A[y][4] = C[4]; 229 } 230 } 231 232 static void Iota(uint64_t A[5][5], size_t i) 233 { 234 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 235 A[0][0] ^= iotas[i]; 236 } 237 238 static void KeccakF1600(uint64_t A[5][5]) 239 { 240 size_t i; 241 242 for (i = 0; i < 24; i++) { 243 Theta(A); 244 Rho(A); 245 Pi(A); 246 Chi(A); 247 Iota(A, i); 248 } 249 } 250 251 #elif defined(KECCAK_1X) 252 /* 253 * This implementation is optimization of above code featuring unroll 254 * of even y-loops, their fusion and code motion. It also minimizes 255 * temporary storage. Compiler would normally do all these things for 256 * you, purpose of manual optimization is to provide "unobscured" 257 * reference for assembly implementation [in case this approach is 258 * chosen for implementation on some platform]. In the nutshell it's 259 * equivalent of "plane-per-plane processing" approach discussed in 260 * section 2.4 of "Keccak implementation overview". 261 */ 262 static void Round(uint64_t A[5][5], size_t i) 263 { 264 uint64_t C[5], E[2]; /* registers */ 265 uint64_t D[5], T[2][5]; /* memory */ 266 267 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 268 269 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 270 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 271 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 272 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 273 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 274 275 #if defined(__arm__) 276 D[1] = E[0] = ROL64(C[2], 1) ^ C[0]; 277 D[4] = E[1] = ROL64(C[0], 1) ^ C[3]; 278 D[0] = C[0] = ROL64(C[1], 1) ^ C[4]; 279 D[2] = C[1] = ROL64(C[3], 1) ^ C[1]; 280 D[3] = C[2] = ROL64(C[4], 1) ^ C[2]; 281 282 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */ 283 T[0][1] = A[0][1] ^ E[0]; /* D[1] */ 284 T[0][2] = A[0][2] ^ C[1]; /* D[2] */ 285 T[0][3] = A[0][3] ^ C[2]; /* D[3] */ 286 T[0][4] = A[0][4] ^ E[1]; /* D[4] */ 287 288 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ 289 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ 290 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ 291 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ 292 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ 293 #else 294 D[0] = ROL64(C[1], 1) ^ C[4]; 295 D[1] = ROL64(C[2], 1) ^ C[0]; 296 D[2] = ROL64(C[3], 1) ^ C[1]; 297 D[3] = ROL64(C[4], 1) ^ C[2]; 298 D[4] = ROL64(C[0], 1) ^ C[3]; 299 300 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */ 301 T[0][1] = A[0][1] ^ D[1]; 302 T[0][2] = A[0][2] ^ D[2]; 303 T[0][3] = A[0][3] ^ D[3]; 304 T[0][4] = A[0][4] ^ D[4]; 305 306 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 307 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 308 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 309 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 310 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 311 #endif 312 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 313 A[0][1] = C[1] ^ (~C[2] & C[3]); 314 A[0][2] = C[2] ^ (~C[3] & C[4]); 315 A[0][3] = C[3] ^ (~C[4] & C[0]); 316 A[0][4] = C[4] ^ (~C[0] & C[1]); 317 318 T[1][0] = A[1][0] ^ (C[3] = D[0]); 319 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */ 320 T[1][2] = A[1][2] ^ (E[0] = D[2]); 321 T[1][3] = A[1][3] ^ (E[1] = D[3]); 322 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */ 323 324 C[0] = ROL64(T[0][3], rhotates[0][3]); 325 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */ 326 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */ 327 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */ 328 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */ 329 330 A[1][0] = C[0] ^ (~C[1] & C[2]); 331 A[1][1] = C[1] ^ (~C[2] & C[3]); 332 A[1][2] = C[2] ^ (~C[3] & C[4]); 333 A[1][3] = C[3] ^ (~C[4] & C[0]); 334 A[1][4] = C[4] ^ (~C[0] & C[1]); 335 336 C[0] = ROL64(T[0][1], rhotates[0][1]); 337 C[1] = ROL64(T[1][2], rhotates[1][2]); 338 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 339 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 340 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 341 342 A[2][0] = C[0] ^ (~C[1] & C[2]); 343 A[2][1] = C[1] ^ (~C[2] & C[3]); 344 A[2][2] = C[2] ^ (~C[3] & C[4]); 345 A[2][3] = C[3] ^ (~C[4] & C[0]); 346 A[2][4] = C[4] ^ (~C[0] & C[1]); 347 348 C[0] = ROL64(T[0][4], rhotates[0][4]); 349 C[1] = ROL64(T[1][0], rhotates[1][0]); 350 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */ 351 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 352 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 353 354 A[3][0] = C[0] ^ (~C[1] & C[2]); 355 A[3][1] = C[1] ^ (~C[2] & C[3]); 356 A[3][2] = C[2] ^ (~C[3] & C[4]); 357 A[3][3] = C[3] ^ (~C[4] & C[0]); 358 A[3][4] = C[4] ^ (~C[0] & C[1]); 359 360 C[0] = ROL64(T[0][2], rhotates[0][2]); 361 C[1] = ROL64(T[1][3], rhotates[1][3]); 362 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */ 363 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */ 364 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 365 366 A[4][0] = C[0] ^ (~C[1] & C[2]); 367 A[4][1] = C[1] ^ (~C[2] & C[3]); 368 A[4][2] = C[2] ^ (~C[3] & C[4]); 369 A[4][3] = C[3] ^ (~C[4] & C[0]); 370 A[4][4] = C[4] ^ (~C[0] & C[1]); 371 } 372 373 static void KeccakF1600(uint64_t A[5][5]) 374 { 375 size_t i; 376 377 for (i = 0; i < 24; i++) { 378 Round(A, i); 379 } 380 } 381 382 #elif defined(KECCAK_1X_ALT) 383 /* 384 * This is variant of above KECCAK_1X that reduces requirement for 385 * temporary storage even further, but at cost of more updates to A[][]. 386 * It's less suitable if A[][] is memory bound, but better if it's 387 * register bound. 388 */ 389 390 static void Round(uint64_t A[5][5], size_t i) 391 { 392 uint64_t C[5], D[5]; 393 394 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 395 396 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 397 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 398 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 399 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 400 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 401 402 D[1] = C[0] ^ ROL64(C[2], 1); 403 D[2] = C[1] ^ ROL64(C[3], 1); 404 D[3] = C[2] ^= ROL64(C[4], 1); 405 D[4] = C[3] ^= ROL64(C[0], 1); 406 D[0] = C[4] ^= ROL64(C[1], 1); 407 408 A[0][1] ^= D[1]; 409 A[1][1] ^= D[1]; 410 A[2][1] ^= D[1]; 411 A[3][1] ^= D[1]; 412 A[4][1] ^= D[1]; 413 414 A[0][2] ^= D[2]; 415 A[1][2] ^= D[2]; 416 A[2][2] ^= D[2]; 417 A[3][2] ^= D[2]; 418 A[4][2] ^= D[2]; 419 420 A[0][3] ^= C[2]; 421 A[1][3] ^= C[2]; 422 A[2][3] ^= C[2]; 423 A[3][3] ^= C[2]; 424 A[4][3] ^= C[2]; 425 426 A[0][4] ^= C[3]; 427 A[1][4] ^= C[3]; 428 A[2][4] ^= C[3]; 429 A[3][4] ^= C[3]; 430 A[4][4] ^= C[3]; 431 432 A[0][0] ^= C[4]; 433 A[1][0] ^= C[4]; 434 A[2][0] ^= C[4]; 435 A[3][0] ^= C[4]; 436 A[4][0] ^= C[4]; 437 438 C[1] = A[0][1]; 439 C[2] = A[0][2]; 440 C[3] = A[0][3]; 441 C[4] = A[0][4]; 442 443 A[0][1] = ROL64(A[1][1], rhotates[1][1]); 444 A[0][2] = ROL64(A[2][2], rhotates[2][2]); 445 A[0][3] = ROL64(A[3][3], rhotates[3][3]); 446 A[0][4] = ROL64(A[4][4], rhotates[4][4]); 447 448 A[1][1] = ROL64(A[1][4], rhotates[1][4]); 449 A[2][2] = ROL64(A[2][3], rhotates[2][3]); 450 A[3][3] = ROL64(A[3][2], rhotates[3][2]); 451 A[4][4] = ROL64(A[4][1], rhotates[4][1]); 452 453 A[1][4] = ROL64(A[4][2], rhotates[4][2]); 454 A[2][3] = ROL64(A[3][4], rhotates[3][4]); 455 A[3][2] = ROL64(A[2][1], rhotates[2][1]); 456 A[4][1] = ROL64(A[1][3], rhotates[1][3]); 457 458 A[4][2] = ROL64(A[2][4], rhotates[2][4]); 459 A[3][4] = ROL64(A[4][3], rhotates[4][3]); 460 A[2][1] = ROL64(A[1][2], rhotates[1][2]); 461 A[1][3] = ROL64(A[3][1], rhotates[3][1]); 462 463 A[2][4] = ROL64(A[4][0], rhotates[4][0]); 464 A[4][3] = ROL64(A[3][0], rhotates[3][0]); 465 A[1][2] = ROL64(A[2][0], rhotates[2][0]); 466 A[3][1] = ROL64(A[1][0], rhotates[1][0]); 467 468 A[1][0] = ROL64(C[3], rhotates[0][3]); 469 A[2][0] = ROL64(C[1], rhotates[0][1]); 470 A[3][0] = ROL64(C[4], rhotates[0][4]); 471 A[4][0] = ROL64(C[2], rhotates[0][2]); 472 473 C[0] = A[0][0]; 474 C[1] = A[1][0]; 475 D[0] = A[0][1]; 476 D[1] = A[1][1]; 477 478 A[0][0] ^= (~A[0][1] & A[0][2]); 479 A[1][0] ^= (~A[1][1] & A[1][2]); 480 A[0][1] ^= (~A[0][2] & A[0][3]); 481 A[1][1] ^= (~A[1][2] & A[1][3]); 482 A[0][2] ^= (~A[0][3] & A[0][4]); 483 A[1][2] ^= (~A[1][3] & A[1][4]); 484 A[0][3] ^= (~A[0][4] & C[0]); 485 A[1][3] ^= (~A[1][4] & C[1]); 486 A[0][4] ^= (~C[0] & D[0]); 487 A[1][4] ^= (~C[1] & D[1]); 488 489 C[2] = A[2][0]; 490 C[3] = A[3][0]; 491 D[2] = A[2][1]; 492 D[3] = A[3][1]; 493 494 A[2][0] ^= (~A[2][1] & A[2][2]); 495 A[3][0] ^= (~A[3][1] & A[3][2]); 496 A[2][1] ^= (~A[2][2] & A[2][3]); 497 A[3][1] ^= (~A[3][2] & A[3][3]); 498 A[2][2] ^= (~A[2][3] & A[2][4]); 499 A[3][2] ^= (~A[3][3] & A[3][4]); 500 A[2][3] ^= (~A[2][4] & C[2]); 501 A[3][3] ^= (~A[3][4] & C[3]); 502 A[2][4] ^= (~C[2] & D[2]); 503 A[3][4] ^= (~C[3] & D[3]); 504 505 C[4] = A[4][0]; 506 D[4] = A[4][1]; 507 508 A[4][0] ^= (~A[4][1] & A[4][2]); 509 A[4][1] ^= (~A[4][2] & A[4][3]); 510 A[4][2] ^= (~A[4][3] & A[4][4]); 511 A[4][3] ^= (~A[4][4] & C[4]); 512 A[4][4] ^= (~C[4] & D[4]); 513 A[0][0] ^= iotas[i]; 514 } 515 516 static void KeccakF1600(uint64_t A[5][5]) 517 { 518 size_t i; 519 520 for (i = 0; i < 24; i++) { 521 Round(A, i); 522 } 523 } 524 525 #elif defined(KECCAK_2X) 526 /* 527 * This implementation is variant of KECCAK_1X above with outer-most 528 * round loop unrolled twice. This allows to take temporary storage 529 * out of round procedure and simplify references to it by alternating 530 * it with actual data (see round loop below). Originally it was meant 531 * rather as reference for an assembly implementation, but it seems to 532 * play best with compilers [as well as provide best instruction per 533 * processed byte ratio at minimal round unroll factor]... 534 */ 535 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i) 536 { 537 uint64_t C[5], D[5]; 538 539 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 540 541 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 542 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 543 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 544 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 545 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 546 547 D[0] = ROL64(C[1], 1) ^ C[4]; 548 D[1] = ROL64(C[2], 1) ^ C[0]; 549 D[2] = ROL64(C[3], 1) ^ C[1]; 550 D[3] = ROL64(C[4], 1) ^ C[2]; 551 D[4] = ROL64(C[0], 1) ^ C[3]; 552 553 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 554 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 555 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 556 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 557 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 558 559 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 560 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]; 561 R[0][1] = C[1] ^ (~C[2] | C[3]); 562 R[0][2] = C[2] ^ ( C[3] & C[4]); 563 R[0][3] = C[3] ^ ( C[4] | C[0]); 564 R[0][4] = C[4] ^ ( C[0] & C[1]); 565 #else 566 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 567 R[0][1] = C[1] ^ (~C[2] & C[3]); 568 R[0][2] = C[2] ^ (~C[3] & C[4]); 569 R[0][3] = C[3] ^ (~C[4] & C[0]); 570 R[0][4] = C[4] ^ (~C[0] & C[1]); 571 #endif 572 573 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 574 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 575 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 576 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 577 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 578 579 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 580 R[1][0] = C[0] ^ (C[1] | C[2]); 581 R[1][1] = C[1] ^ (C[2] & C[3]); 582 R[1][2] = C[2] ^ (C[3] | ~C[4]); 583 R[1][3] = C[3] ^ (C[4] | C[0]); 584 R[1][4] = C[4] ^ (C[0] & C[1]); 585 #else 586 R[1][0] = C[0] ^ (~C[1] & C[2]); 587 R[1][1] = C[1] ^ (~C[2] & C[3]); 588 R[1][2] = C[2] ^ (~C[3] & C[4]); 589 R[1][3] = C[3] ^ (~C[4] & C[0]); 590 R[1][4] = C[4] ^ (~C[0] & C[1]); 591 #endif 592 593 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 594 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 595 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 596 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 597 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 598 599 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 600 R[2][0] = C[0] ^ ( C[1] | C[2]); 601 R[2][1] = C[1] ^ ( C[2] & C[3]); 602 R[2][2] = C[2] ^ (~C[3] & C[4]); 603 R[2][3] = ~C[3] ^ ( C[4] | C[0]); 604 R[2][4] = C[4] ^ ( C[0] & C[1]); 605 #else 606 R[2][0] = C[0] ^ (~C[1] & C[2]); 607 R[2][1] = C[1] ^ (~C[2] & C[3]); 608 R[2][2] = C[2] ^ (~C[3] & C[4]); 609 R[2][3] = C[3] ^ (~C[4] & C[0]); 610 R[2][4] = C[4] ^ (~C[0] & C[1]); 611 #endif 612 613 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 614 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 615 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 616 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 617 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 618 619 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 620 R[3][0] = C[0] ^ ( C[1] & C[2]); 621 R[3][1] = C[1] ^ ( C[2] | C[3]); 622 R[3][2] = C[2] ^ (~C[3] | C[4]); 623 R[3][3] = ~C[3] ^ ( C[4] & C[0]); 624 R[3][4] = C[4] ^ ( C[0] | C[1]); 625 #else 626 R[3][0] = C[0] ^ (~C[1] & C[2]); 627 R[3][1] = C[1] ^ (~C[2] & C[3]); 628 R[3][2] = C[2] ^ (~C[3] & C[4]); 629 R[3][3] = C[3] ^ (~C[4] & C[0]); 630 R[3][4] = C[4] ^ (~C[0] & C[1]); 631 #endif 632 633 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 634 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 635 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 636 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 637 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 638 639 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 640 R[4][0] = C[0] ^ (~C[1] & C[2]); 641 R[4][1] = ~C[1] ^ ( C[2] | C[3]); 642 R[4][2] = C[2] ^ ( C[3] & C[4]); 643 R[4][3] = C[3] ^ ( C[4] | C[0]); 644 R[4][4] = C[4] ^ ( C[0] & C[1]); 645 #else 646 R[4][0] = C[0] ^ (~C[1] & C[2]); 647 R[4][1] = C[1] ^ (~C[2] & C[3]); 648 R[4][2] = C[2] ^ (~C[3] & C[4]); 649 R[4][3] = C[3] ^ (~C[4] & C[0]); 650 R[4][4] = C[4] ^ (~C[0] & C[1]); 651 #endif 652 } 653 654 static void KeccakF1600(uint64_t A[5][5]) 655 { 656 uint64_t T[5][5]; 657 size_t i; 658 659 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 660 A[0][1] = ~A[0][1]; 661 A[0][2] = ~A[0][2]; 662 A[1][3] = ~A[1][3]; 663 A[2][2] = ~A[2][2]; 664 A[3][2] = ~A[3][2]; 665 A[4][0] = ~A[4][0]; 666 #endif 667 668 for (i = 0; i < 24; i += 2) { 669 Round(T, A, i); 670 Round(A, T, i + 1); 671 } 672 673 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 674 A[0][1] = ~A[0][1]; 675 A[0][2] = ~A[0][2]; 676 A[1][3] = ~A[1][3]; 677 A[2][2] = ~A[2][2]; 678 A[3][2] = ~A[3][2]; 679 A[4][0] = ~A[4][0]; 680 #endif 681 } 682 683 #else /* define KECCAK_INPLACE to compile this code path */ 684 /* 685 * This implementation is KECCAK_1X from above combined 4 times with 686 * a twist that allows to omit temporary storage and perform in-place 687 * processing. It's discussed in section 2.5 of "Keccak implementation 688 * overview". It's likely to be best suited for processors with large 689 * register bank... On the other hand processor with large register 690 * bank can as well use KECCAK_1X_ALT, it would be as fast but much 691 * more compact... 692 */ 693 static void FourRounds(uint64_t A[5][5], size_t i) 694 { 695 uint64_t B[5], C[5], D[5]; 696 697 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4)); 698 699 /* Round 4*n */ 700 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 701 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 702 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 703 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 704 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 705 706 D[0] = ROL64(C[1], 1) ^ C[4]; 707 D[1] = ROL64(C[2], 1) ^ C[0]; 708 D[2] = ROL64(C[3], 1) ^ C[1]; 709 D[3] = ROL64(C[4], 1) ^ C[2]; 710 D[4] = ROL64(C[0], 1) ^ C[3]; 711 712 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 713 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 714 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 715 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 716 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 717 718 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i]; 719 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]); 720 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]); 721 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]); 722 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]); 723 724 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 725 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 726 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 727 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 728 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 729 730 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 731 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); 732 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); 733 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 734 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 735 736 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 737 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 738 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 739 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 740 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 741 742 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 743 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 744 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); 745 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 746 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); 747 748 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 749 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 750 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 751 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 752 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 753 754 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 755 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); 756 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 757 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); 758 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 759 760 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 761 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 762 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 763 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 764 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 765 766 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 767 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 768 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 769 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); 770 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); 771 772 /* Round 4*n+1 */ 773 D[0] = ROL64(C[1], 1) ^ C[4]; 774 D[1] = ROL64(C[2], 1) ^ C[0]; 775 D[2] = ROL64(C[3], 1) ^ C[1]; 776 D[3] = ROL64(C[4], 1) ^ C[2]; 777 D[4] = ROL64(C[0], 1) ^ C[3]; 778 779 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 780 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]); 781 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]); 782 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]); 783 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]); 784 785 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1]; 786 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]); 787 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]); 788 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]); 789 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]); 790 791 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]); 792 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 793 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]); 794 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]); 795 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]); 796 797 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 798 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); 799 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 800 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); 801 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 802 803 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]); 804 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]); 805 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 806 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]); 807 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]); 808 809 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 810 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); 811 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); 812 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 813 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 814 815 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]); 816 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]); 817 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]); 818 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 819 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]); 820 821 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 822 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 823 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 824 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); 825 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); 826 827 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]); 828 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]); 829 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]); 830 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]); 831 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 832 833 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 834 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 835 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); 836 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 837 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); 838 839 /* Round 4*n+2 */ 840 D[0] = ROL64(C[1], 1) ^ C[4]; 841 D[1] = ROL64(C[2], 1) ^ C[0]; 842 D[2] = ROL64(C[3], 1) ^ C[1]; 843 D[3] = ROL64(C[4], 1) ^ C[2]; 844 D[4] = ROL64(C[0], 1) ^ C[3]; 845 846 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 847 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]); 848 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]); 849 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]); 850 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]); 851 852 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2]; 853 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]); 854 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]); 855 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]); 856 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]); 857 858 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]); 859 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 860 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]); 861 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]); 862 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]); 863 864 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 865 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 866 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); 867 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); 868 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 869 870 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]); 871 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]); 872 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 873 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]); 874 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]); 875 876 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 877 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); 878 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 879 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 880 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); 881 882 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]); 883 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]); 884 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]); 885 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 886 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]); 887 888 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 889 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); 890 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 891 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 892 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); 893 894 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]); 895 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]); 896 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]); 897 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]); 898 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 899 900 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 901 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 902 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); 903 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); 904 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 905 906 /* Round 4*n+3 */ 907 D[0] = ROL64(C[1], 1) ^ C[4]; 908 D[1] = ROL64(C[2], 1) ^ C[0]; 909 D[2] = ROL64(C[3], 1) ^ C[1]; 910 D[3] = ROL64(C[4], 1) ^ C[2]; 911 D[4] = ROL64(C[0], 1) ^ C[3]; 912 913 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 914 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]); 915 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]); 916 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]); 917 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]); 918 919 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3]; 920 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]); 921 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]); 922 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]); 923 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]); 924 925 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]); 926 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 927 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]); 928 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]); 929 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]); 930 931 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]); 932 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]); 933 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]); 934 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]); 935 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]); 936 937 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]); 938 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]); 939 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 940 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]); 941 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]); 942 943 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]); 944 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]); 945 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]); 946 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]); 947 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]); 948 949 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]); 950 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]); 951 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]); 952 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 953 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]); 954 955 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]); 956 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]); 957 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]); 958 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]); 959 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]); 960 961 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]); 962 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]); 963 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]); 964 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]); 965 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 966 967 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]); 968 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]); 969 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]); 970 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]); 971 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]); 972 } 973 974 static void KeccakF1600(uint64_t A[5][5]) 975 { 976 size_t i; 977 978 for (i = 0; i < 24; i += 4) { 979 FourRounds(A, i); 980 } 981 } 982 983 #endif 984 985 static uint64_t BitInterleave(uint64_t Ai) 986 { 987 if (BIT_INTERLEAVE) { 988 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai; 989 uint32_t t0, t1; 990 991 t0 = lo & 0x55555555; 992 t0 |= t0 >> 1; t0 &= 0x33333333; 993 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f; 994 t0 |= t0 >> 4; t0 &= 0x00ff00ff; 995 t0 |= t0 >> 8; t0 &= 0x0000ffff; 996 997 t1 = hi & 0x55555555; 998 t1 |= t1 >> 1; t1 &= 0x33333333; 999 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f; 1000 t1 |= t1 >> 4; t1 &= 0x00ff00ff; 1001 t1 |= t1 >> 8; t1 <<= 16; 1002 1003 lo &= 0xaaaaaaaa; 1004 lo |= lo << 1; lo &= 0xcccccccc; 1005 lo |= lo << 2; lo &= 0xf0f0f0f0; 1006 lo |= lo << 4; lo &= 0xff00ff00; 1007 lo |= lo << 8; lo >>= 16; 1008 1009 hi &= 0xaaaaaaaa; 1010 hi |= hi << 1; hi &= 0xcccccccc; 1011 hi |= hi << 2; hi &= 0xf0f0f0f0; 1012 hi |= hi << 4; hi &= 0xff00ff00; 1013 hi |= hi << 8; hi &= 0xffff0000; 1014 1015 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0); 1016 } 1017 1018 return Ai; 1019 } 1020 1021 static uint64_t BitDeinterleave(uint64_t Ai) 1022 { 1023 if (BIT_INTERLEAVE) { 1024 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai; 1025 uint32_t t0, t1; 1026 1027 t0 = lo & 0x0000ffff; 1028 t0 |= t0 << 8; t0 &= 0x00ff00ff; 1029 t0 |= t0 << 4; t0 &= 0x0f0f0f0f; 1030 t0 |= t0 << 2; t0 &= 0x33333333; 1031 t0 |= t0 << 1; t0 &= 0x55555555; 1032 1033 t1 = hi << 16; 1034 t1 |= t1 >> 8; t1 &= 0xff00ff00; 1035 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0; 1036 t1 |= t1 >> 2; t1 &= 0xcccccccc; 1037 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa; 1038 1039 lo >>= 16; 1040 lo |= lo << 8; lo &= 0x00ff00ff; 1041 lo |= lo << 4; lo &= 0x0f0f0f0f; 1042 lo |= lo << 2; lo &= 0x33333333; 1043 lo |= lo << 1; lo &= 0x55555555; 1044 1045 hi &= 0xffff0000; 1046 hi |= hi >> 8; hi &= 0xff00ff00; 1047 hi |= hi >> 4; hi &= 0xf0f0f0f0; 1048 hi |= hi >> 2; hi &= 0xcccccccc; 1049 hi |= hi >> 1; hi &= 0xaaaaaaaa; 1050 1051 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0); 1052 } 1053 1054 return Ai; 1055 } 1056 1057 /* 1058 * SHA3_absorb can be called multiple times, but at each invocation 1059 * largest multiple of |r| out of |len| bytes are processed. Then 1060 * remaining amount of bytes is returned. This is done to spare caller 1061 * trouble of calculating the largest multiple of |r|. |r| can be viewed 1062 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104, 1063 * 72, but can also be (1600 - 448)/8 = 144. All this means that message 1064 * padding and intermediate sub-block buffering, byte- or bitwise, is 1065 * caller's responsibility. 1066 */ 1067 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, 1068 size_t r) 1069 { 1070 uint64_t *A_flat = (uint64_t *)A; 1071 size_t i, w = r / 8; 1072 1073 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); 1074 1075 while (len >= r) { 1076 for (i = 0; i < w; i++) { 1077 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | 1078 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | 1079 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | 1080 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56; 1081 inp += 8; 1082 1083 A_flat[i] ^= BitInterleave(Ai); 1084 } 1085 KeccakF1600(A); 1086 len -= r; 1087 } 1088 1089 return len; 1090 } 1091 1092 /* 1093 * sha3_squeeze is called once at the end to generate |out| hash value 1094 * of |len| bytes. 1095 */ 1096 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r) 1097 { 1098 uint64_t *A_flat = (uint64_t *)A; 1099 size_t i, w = r / 8; 1100 1101 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); 1102 1103 while (len != 0) { 1104 for (i = 0; i < w && len != 0; i++) { 1105 uint64_t Ai = BitDeinterleave(A_flat[i]); 1106 1107 if (len < 8) { 1108 for (i = 0; i < len; i++) { 1109 *out++ = (unsigned char)Ai; 1110 Ai >>= 8; 1111 } 1112 return; 1113 } 1114 1115 out[0] = (unsigned char)(Ai); 1116 out[1] = (unsigned char)(Ai >> 8); 1117 out[2] = (unsigned char)(Ai >> 16); 1118 out[3] = (unsigned char)(Ai >> 24); 1119 out[4] = (unsigned char)(Ai >> 32); 1120 out[5] = (unsigned char)(Ai >> 40); 1121 out[6] = (unsigned char)(Ai >> 48); 1122 out[7] = (unsigned char)(Ai >> 56); 1123 out += 8; 1124 len -= 8; 1125 } 1126 if (len) 1127 KeccakF1600(A); 1128 } 1129 } 1130 #endif 1131 1132 #ifdef SELFTEST 1133 /* 1134 * Post-padding one-shot implementations would look as following: 1135 * 1136 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8); 1137 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8); 1138 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8); 1139 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8); 1140 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8); 1141 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8); 1142 */ 1143 1144 void SHA3_sponge(const unsigned char *inp, size_t len, 1145 unsigned char *out, size_t d, size_t r) 1146 { 1147 uint64_t A[5][5]; 1148 1149 memset(A, 0, sizeof(A)); 1150 SHA3_absorb(A, inp, len, r); 1151 SHA3_squeeze(A, out, d, r); 1152 } 1153 1154 # include <stdio.h> 1155 1156 int main() 1157 { 1158 /* 1159 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing 1160 */ 1161 unsigned char test[168] = { '\xf3', '\x3' }; 1162 unsigned char out[512]; 1163 size_t i; 1164 static const unsigned char result[512] = { 1165 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B, 1166 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF, 1167 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98, 1168 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA, 1169 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB, 1170 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C, 1171 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C, 1172 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D, 1173 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD, 1174 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F, 1175 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC, 1176 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C, 1177 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6, 1178 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE, 1179 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3, 1180 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64, 1181 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6, 1182 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5, 1183 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38, 1184 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05, 1185 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98, 1186 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2, 1187 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50, 1188 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75, 1189 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0, 1190 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04, 1191 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8, 1192 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03, 1193 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9, 1194 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B, 1195 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B, 1196 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73, 1197 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6, 1198 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C, 1199 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D, 1200 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20, 1201 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D, 1202 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F, 1203 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A, 1204 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15, 1205 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D, 1206 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6, 1207 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2, 1208 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D, 1209 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30, 1210 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C, 1211 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E, 1212 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28, 1213 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A, 1214 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25, 1215 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28, 1216 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C, 1217 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41, 1218 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B, 1219 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C, 1220 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C, 1221 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA, 1222 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F, 1223 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A, 1224 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60, 1225 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6, 1226 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50, 1227 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF, 1228 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9 1229 }; 1230 1231 test[167] = '\x80'; 1232 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test)); 1233 1234 /* 1235 * Rationale behind keeping output [formatted as below] is that 1236 * one should be able to redirect it to a file, then copy-n-paste 1237 * final "output val" from official example to another file, and 1238 * compare the two with diff(1). 1239 */ 1240 for (i = 0; i < sizeof(out);) { 1241 printf("%02X", out[i]); 1242 printf(++i % 16 && i != sizeof(out) ? " " : "\n"); 1243 } 1244 1245 if (memcmp(out,result,sizeof(out))) { 1246 fprintf(stderr,"failure\n"); 1247 return 1; 1248 } else { 1249 fprintf(stderr,"success\n"); 1250 return 0; 1251 } 1252 } 1253 #endif 1254