1 /* 2 * Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved. 3 * 4 * Licensed under the OpenSSL license (the "License"). You may not use 5 * this file except in compliance with the License. You can obtain a copy 6 * in the file LICENSE in the source distribution or at 7 * https://www.openssl.org/source/license.html 8 */ 9 10 #include <openssl/e_os2.h> 11 #include <string.h> 12 #include <assert.h> 13 14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, 15 size_t r); 16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r); 17 18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST) 19 20 /* 21 * Choose some sensible defaults 22 */ 23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \ 24 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE) 25 # define KECCAK_2X /* default to KECCAK_2X variant */ 26 #endif 27 28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) 29 # define KECCAK_COMPLEMENTING_TRANSFORM 30 #endif 31 32 #if defined(__x86_64__) || defined(__aarch64__) || \ 33 defined(__mips64) || defined(__ia64) || \ 34 (defined(__VMS) && !defined(__vax)) 35 /* 36 * These are available even in ILP32 flavours, but even then they are 37 * capable of performing 64-bit operations as efficiently as in *P64. 38 * Since it's not given that we can use sizeof(void *), just shunt it. 39 */ 40 # define BIT_INTERLEAVE (0) 41 #else 42 # define BIT_INTERLEAVE (sizeof(void *) < 8) 43 #endif 44 45 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31))) 46 47 static uint64_t ROL64(uint64_t val, int offset) 48 { 49 if (offset == 0) { 50 return val; 51 } else if (!BIT_INTERLEAVE) { 52 return (val << offset) | (val >> (64-offset)); 53 } else { 54 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val; 55 56 if (offset & 1) { 57 uint32_t tmp = hi; 58 59 offset >>= 1; 60 hi = ROL32(lo, offset); 61 lo = ROL32(tmp, offset + 1); 62 } else { 63 offset >>= 1; 64 lo = ROL32(lo, offset); 65 hi = ROL32(hi, offset); 66 } 67 68 return ((uint64_t)hi << 32) | lo; 69 } 70 } 71 72 static const unsigned char rhotates[5][5] = { 73 { 0, 1, 62, 28, 27 }, 74 { 36, 44, 6, 55, 20 }, 75 { 3, 10, 43, 25, 39 }, 76 { 41, 45, 15, 21, 8 }, 77 { 18, 2, 61, 56, 14 } 78 }; 79 80 static const uint64_t iotas[] = { 81 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL, 82 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL, 83 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL, 84 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL, 85 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL, 86 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL, 87 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL, 88 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL, 89 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL, 90 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL, 91 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL, 92 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL, 93 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL, 94 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL, 95 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL, 96 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL, 97 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL, 98 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL, 99 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL, 100 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL, 101 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL, 102 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL, 103 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL, 104 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL 105 }; 106 107 #if defined(KECCAK_REF) 108 /* 109 * This is straightforward or "maximum clarity" implementation aiming 110 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard: 111 * Permutation-Based Hash and Extendible-Output Functions" as much as 112 * possible. With one caveat. Because of the way C stores matrices, 113 * references to A[x,y] in the specification are presented as A[y][x]. 114 * Implementation unrolls inner x-loops so that modulo 5 operations are 115 * explicitly pre-computed. 116 */ 117 static void Theta(uint64_t A[5][5]) 118 { 119 uint64_t C[5], D[5]; 120 size_t y; 121 122 C[0] = A[0][0]; 123 C[1] = A[0][1]; 124 C[2] = A[0][2]; 125 C[3] = A[0][3]; 126 C[4] = A[0][4]; 127 128 for (y = 1; y < 5; y++) { 129 C[0] ^= A[y][0]; 130 C[1] ^= A[y][1]; 131 C[2] ^= A[y][2]; 132 C[3] ^= A[y][3]; 133 C[4] ^= A[y][4]; 134 } 135 136 D[0] = ROL64(C[1], 1) ^ C[4]; 137 D[1] = ROL64(C[2], 1) ^ C[0]; 138 D[2] = ROL64(C[3], 1) ^ C[1]; 139 D[3] = ROL64(C[4], 1) ^ C[2]; 140 D[4] = ROL64(C[0], 1) ^ C[3]; 141 142 for (y = 0; y < 5; y++) { 143 A[y][0] ^= D[0]; 144 A[y][1] ^= D[1]; 145 A[y][2] ^= D[2]; 146 A[y][3] ^= D[3]; 147 A[y][4] ^= D[4]; 148 } 149 } 150 151 static void Rho(uint64_t A[5][5]) 152 { 153 size_t y; 154 155 for (y = 0; y < 5; y++) { 156 A[y][0] = ROL64(A[y][0], rhotates[y][0]); 157 A[y][1] = ROL64(A[y][1], rhotates[y][1]); 158 A[y][2] = ROL64(A[y][2], rhotates[y][2]); 159 A[y][3] = ROL64(A[y][3], rhotates[y][3]); 160 A[y][4] = ROL64(A[y][4], rhotates[y][4]); 161 } 162 } 163 164 static void Pi(uint64_t A[5][5]) 165 { 166 uint64_t T[5][5]; 167 168 /* 169 * T = A 170 * A[y][x] = T[x][(3*y+x)%5] 171 */ 172 memcpy(T, A, sizeof(T)); 173 174 A[0][0] = T[0][0]; 175 A[0][1] = T[1][1]; 176 A[0][2] = T[2][2]; 177 A[0][3] = T[3][3]; 178 A[0][4] = T[4][4]; 179 180 A[1][0] = T[0][3]; 181 A[1][1] = T[1][4]; 182 A[1][2] = T[2][0]; 183 A[1][3] = T[3][1]; 184 A[1][4] = T[4][2]; 185 186 A[2][0] = T[0][1]; 187 A[2][1] = T[1][2]; 188 A[2][2] = T[2][3]; 189 A[2][3] = T[3][4]; 190 A[2][4] = T[4][0]; 191 192 A[3][0] = T[0][4]; 193 A[3][1] = T[1][0]; 194 A[3][2] = T[2][1]; 195 A[3][3] = T[3][2]; 196 A[3][4] = T[4][3]; 197 198 A[4][0] = T[0][2]; 199 A[4][1] = T[1][3]; 200 A[4][2] = T[2][4]; 201 A[4][3] = T[3][0]; 202 A[4][4] = T[4][1]; 203 } 204 205 static void Chi(uint64_t A[5][5]) 206 { 207 uint64_t C[5]; 208 size_t y; 209 210 for (y = 0; y < 5; y++) { 211 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]); 212 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]); 213 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]); 214 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]); 215 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]); 216 217 A[y][0] = C[0]; 218 A[y][1] = C[1]; 219 A[y][2] = C[2]; 220 A[y][3] = C[3]; 221 A[y][4] = C[4]; 222 } 223 } 224 225 static void Iota(uint64_t A[5][5], size_t i) 226 { 227 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 228 A[0][0] ^= iotas[i]; 229 } 230 231 static void KeccakF1600(uint64_t A[5][5]) 232 { 233 size_t i; 234 235 for (i = 0; i < 24; i++) { 236 Theta(A); 237 Rho(A); 238 Pi(A); 239 Chi(A); 240 Iota(A, i); 241 } 242 } 243 244 #elif defined(KECCAK_1X) 245 /* 246 * This implementation is optimization of above code featuring unroll 247 * of even y-loops, their fusion and code motion. It also minimizes 248 * temporary storage. Compiler would normally do all these things for 249 * you, purpose of manual optimization is to provide "unobscured" 250 * reference for assembly implementation [in case this approach is 251 * chosen for implementation on some platform]. In the nutshell it's 252 * equivalent of "plane-per-plane processing" approach discussed in 253 * section 2.4 of "Keccak implementation overview". 254 */ 255 static void Round(uint64_t A[5][5], size_t i) 256 { 257 uint64_t C[5], E[2]; /* registers */ 258 uint64_t D[5], T[2][5]; /* memory */ 259 260 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 261 262 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 263 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 264 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 265 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 266 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 267 268 #if defined(__arm__) 269 D[1] = E[0] = ROL64(C[2], 1) ^ C[0]; 270 D[4] = E[1] = ROL64(C[0], 1) ^ C[3]; 271 D[0] = C[0] = ROL64(C[1], 1) ^ C[4]; 272 D[2] = C[1] = ROL64(C[3], 1) ^ C[1]; 273 D[3] = C[2] = ROL64(C[4], 1) ^ C[2]; 274 275 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */ 276 T[0][1] = A[0][1] ^ E[0]; /* D[1] */ 277 T[0][2] = A[0][2] ^ C[1]; /* D[2] */ 278 T[0][3] = A[0][3] ^ C[2]; /* D[3] */ 279 T[0][4] = A[0][4] ^ E[1]; /* D[4] */ 280 281 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ 282 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ 283 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ 284 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ 285 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ 286 #else 287 D[0] = ROL64(C[1], 1) ^ C[4]; 288 D[1] = ROL64(C[2], 1) ^ C[0]; 289 D[2] = ROL64(C[3], 1) ^ C[1]; 290 D[3] = ROL64(C[4], 1) ^ C[2]; 291 D[4] = ROL64(C[0], 1) ^ C[3]; 292 293 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */ 294 T[0][1] = A[0][1] ^ D[1]; 295 T[0][2] = A[0][2] ^ D[2]; 296 T[0][3] = A[0][3] ^ D[3]; 297 T[0][4] = A[0][4] ^ D[4]; 298 299 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 300 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 301 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 302 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 303 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 304 #endif 305 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 306 A[0][1] = C[1] ^ (~C[2] & C[3]); 307 A[0][2] = C[2] ^ (~C[3] & C[4]); 308 A[0][3] = C[3] ^ (~C[4] & C[0]); 309 A[0][4] = C[4] ^ (~C[0] & C[1]); 310 311 T[1][0] = A[1][0] ^ (C[3] = D[0]); 312 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */ 313 T[1][2] = A[1][2] ^ (E[0] = D[2]); 314 T[1][3] = A[1][3] ^ (E[1] = D[3]); 315 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */ 316 317 C[0] = ROL64(T[0][3], rhotates[0][3]); 318 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */ 319 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */ 320 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */ 321 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */ 322 323 A[1][0] = C[0] ^ (~C[1] & C[2]); 324 A[1][1] = C[1] ^ (~C[2] & C[3]); 325 A[1][2] = C[2] ^ (~C[3] & C[4]); 326 A[1][3] = C[3] ^ (~C[4] & C[0]); 327 A[1][4] = C[4] ^ (~C[0] & C[1]); 328 329 C[0] = ROL64(T[0][1], rhotates[0][1]); 330 C[1] = ROL64(T[1][2], rhotates[1][2]); 331 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 332 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 333 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 334 335 A[2][0] = C[0] ^ (~C[1] & C[2]); 336 A[2][1] = C[1] ^ (~C[2] & C[3]); 337 A[2][2] = C[2] ^ (~C[3] & C[4]); 338 A[2][3] = C[3] ^ (~C[4] & C[0]); 339 A[2][4] = C[4] ^ (~C[0] & C[1]); 340 341 C[0] = ROL64(T[0][4], rhotates[0][4]); 342 C[1] = ROL64(T[1][0], rhotates[1][0]); 343 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */ 344 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 345 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 346 347 A[3][0] = C[0] ^ (~C[1] & C[2]); 348 A[3][1] = C[1] ^ (~C[2] & C[3]); 349 A[3][2] = C[2] ^ (~C[3] & C[4]); 350 A[3][3] = C[3] ^ (~C[4] & C[0]); 351 A[3][4] = C[4] ^ (~C[0] & C[1]); 352 353 C[0] = ROL64(T[0][2], rhotates[0][2]); 354 C[1] = ROL64(T[1][3], rhotates[1][3]); 355 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */ 356 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */ 357 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 358 359 A[4][0] = C[0] ^ (~C[1] & C[2]); 360 A[4][1] = C[1] ^ (~C[2] & C[3]); 361 A[4][2] = C[2] ^ (~C[3] & C[4]); 362 A[4][3] = C[3] ^ (~C[4] & C[0]); 363 A[4][4] = C[4] ^ (~C[0] & C[1]); 364 } 365 366 static void KeccakF1600(uint64_t A[5][5]) 367 { 368 size_t i; 369 370 for (i = 0; i < 24; i++) { 371 Round(A, i); 372 } 373 } 374 375 #elif defined(KECCAK_1X_ALT) 376 /* 377 * This is variant of above KECCAK_1X that reduces requirement for 378 * temporary storage even further, but at cost of more updates to A[][]. 379 * It's less suitable if A[][] is memory bound, but better if it's 380 * register bound. 381 */ 382 383 static void Round(uint64_t A[5][5], size_t i) 384 { 385 uint64_t C[5], D[5]; 386 387 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 388 389 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 390 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 391 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 392 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 393 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 394 395 D[1] = C[0] ^ ROL64(C[2], 1); 396 D[2] = C[1] ^ ROL64(C[3], 1); 397 D[3] = C[2] ^= ROL64(C[4], 1); 398 D[4] = C[3] ^= ROL64(C[0], 1); 399 D[0] = C[4] ^= ROL64(C[1], 1); 400 401 A[0][1] ^= D[1]; 402 A[1][1] ^= D[1]; 403 A[2][1] ^= D[1]; 404 A[3][1] ^= D[1]; 405 A[4][1] ^= D[1]; 406 407 A[0][2] ^= D[2]; 408 A[1][2] ^= D[2]; 409 A[2][2] ^= D[2]; 410 A[3][2] ^= D[2]; 411 A[4][2] ^= D[2]; 412 413 A[0][3] ^= C[2]; 414 A[1][3] ^= C[2]; 415 A[2][3] ^= C[2]; 416 A[3][3] ^= C[2]; 417 A[4][3] ^= C[2]; 418 419 A[0][4] ^= C[3]; 420 A[1][4] ^= C[3]; 421 A[2][4] ^= C[3]; 422 A[3][4] ^= C[3]; 423 A[4][4] ^= C[3]; 424 425 A[0][0] ^= C[4]; 426 A[1][0] ^= C[4]; 427 A[2][0] ^= C[4]; 428 A[3][0] ^= C[4]; 429 A[4][0] ^= C[4]; 430 431 C[1] = A[0][1]; 432 C[2] = A[0][2]; 433 C[3] = A[0][3]; 434 C[4] = A[0][4]; 435 436 A[0][1] = ROL64(A[1][1], rhotates[1][1]); 437 A[0][2] = ROL64(A[2][2], rhotates[2][2]); 438 A[0][3] = ROL64(A[3][3], rhotates[3][3]); 439 A[0][4] = ROL64(A[4][4], rhotates[4][4]); 440 441 A[1][1] = ROL64(A[1][4], rhotates[1][4]); 442 A[2][2] = ROL64(A[2][3], rhotates[2][3]); 443 A[3][3] = ROL64(A[3][2], rhotates[3][2]); 444 A[4][4] = ROL64(A[4][1], rhotates[4][1]); 445 446 A[1][4] = ROL64(A[4][2], rhotates[4][2]); 447 A[2][3] = ROL64(A[3][4], rhotates[3][4]); 448 A[3][2] = ROL64(A[2][1], rhotates[2][1]); 449 A[4][1] = ROL64(A[1][3], rhotates[1][3]); 450 451 A[4][2] = ROL64(A[2][4], rhotates[2][4]); 452 A[3][4] = ROL64(A[4][3], rhotates[4][3]); 453 A[2][1] = ROL64(A[1][2], rhotates[1][2]); 454 A[1][3] = ROL64(A[3][1], rhotates[3][1]); 455 456 A[2][4] = ROL64(A[4][0], rhotates[4][0]); 457 A[4][3] = ROL64(A[3][0], rhotates[3][0]); 458 A[1][2] = ROL64(A[2][0], rhotates[2][0]); 459 A[3][1] = ROL64(A[1][0], rhotates[1][0]); 460 461 A[1][0] = ROL64(C[3], rhotates[0][3]); 462 A[2][0] = ROL64(C[1], rhotates[0][1]); 463 A[3][0] = ROL64(C[4], rhotates[0][4]); 464 A[4][0] = ROL64(C[2], rhotates[0][2]); 465 466 C[0] = A[0][0]; 467 C[1] = A[1][0]; 468 D[0] = A[0][1]; 469 D[1] = A[1][1]; 470 471 A[0][0] ^= (~A[0][1] & A[0][2]); 472 A[1][0] ^= (~A[1][1] & A[1][2]); 473 A[0][1] ^= (~A[0][2] & A[0][3]); 474 A[1][1] ^= (~A[1][2] & A[1][3]); 475 A[0][2] ^= (~A[0][3] & A[0][4]); 476 A[1][2] ^= (~A[1][3] & A[1][4]); 477 A[0][3] ^= (~A[0][4] & C[0]); 478 A[1][3] ^= (~A[1][4] & C[1]); 479 A[0][4] ^= (~C[0] & D[0]); 480 A[1][4] ^= (~C[1] & D[1]); 481 482 C[2] = A[2][0]; 483 C[3] = A[3][0]; 484 D[2] = A[2][1]; 485 D[3] = A[3][1]; 486 487 A[2][0] ^= (~A[2][1] & A[2][2]); 488 A[3][0] ^= (~A[3][1] & A[3][2]); 489 A[2][1] ^= (~A[2][2] & A[2][3]); 490 A[3][1] ^= (~A[3][2] & A[3][3]); 491 A[2][2] ^= (~A[2][3] & A[2][4]); 492 A[3][2] ^= (~A[3][3] & A[3][4]); 493 A[2][3] ^= (~A[2][4] & C[2]); 494 A[3][3] ^= (~A[3][4] & C[3]); 495 A[2][4] ^= (~C[2] & D[2]); 496 A[3][4] ^= (~C[3] & D[3]); 497 498 C[4] = A[4][0]; 499 D[4] = A[4][1]; 500 501 A[4][0] ^= (~A[4][1] & A[4][2]); 502 A[4][1] ^= (~A[4][2] & A[4][3]); 503 A[4][2] ^= (~A[4][3] & A[4][4]); 504 A[4][3] ^= (~A[4][4] & C[4]); 505 A[4][4] ^= (~C[4] & D[4]); 506 A[0][0] ^= iotas[i]; 507 } 508 509 static void KeccakF1600(uint64_t A[5][5]) 510 { 511 size_t i; 512 513 for (i = 0; i < 24; i++) { 514 Round(A, i); 515 } 516 } 517 518 #elif defined(KECCAK_2X) 519 /* 520 * This implementation is variant of KECCAK_1X above with outer-most 521 * round loop unrolled twice. This allows to take temporary storage 522 * out of round procedure and simplify references to it by alternating 523 * it with actual data (see round loop below). Originally it was meant 524 * rather as reference for an assembly implementation, but it seems to 525 * play best with compilers [as well as provide best instruction per 526 * processed byte ratio at minimal round unroll factor]... 527 */ 528 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i) 529 { 530 uint64_t C[5], D[5]; 531 532 assert(i < (sizeof(iotas) / sizeof(iotas[0]))); 533 534 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 535 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 536 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 537 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 538 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 539 540 D[0] = ROL64(C[1], 1) ^ C[4]; 541 D[1] = ROL64(C[2], 1) ^ C[0]; 542 D[2] = ROL64(C[3], 1) ^ C[1]; 543 D[3] = ROL64(C[4], 1) ^ C[2]; 544 D[4] = ROL64(C[0], 1) ^ C[3]; 545 546 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 547 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 548 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 549 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 550 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 551 552 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 553 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]; 554 R[0][1] = C[1] ^ (~C[2] | C[3]); 555 R[0][2] = C[2] ^ ( C[3] & C[4]); 556 R[0][3] = C[3] ^ ( C[4] | C[0]); 557 R[0][4] = C[4] ^ ( C[0] & C[1]); 558 #else 559 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 560 R[0][1] = C[1] ^ (~C[2] & C[3]); 561 R[0][2] = C[2] ^ (~C[3] & C[4]); 562 R[0][3] = C[3] ^ (~C[4] & C[0]); 563 R[0][4] = C[4] ^ (~C[0] & C[1]); 564 #endif 565 566 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 567 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 568 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 569 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 570 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 571 572 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 573 R[1][0] = C[0] ^ (C[1] | C[2]); 574 R[1][1] = C[1] ^ (C[2] & C[3]); 575 R[1][2] = C[2] ^ (C[3] | ~C[4]); 576 R[1][3] = C[3] ^ (C[4] | C[0]); 577 R[1][4] = C[4] ^ (C[0] & C[1]); 578 #else 579 R[1][0] = C[0] ^ (~C[1] & C[2]); 580 R[1][1] = C[1] ^ (~C[2] & C[3]); 581 R[1][2] = C[2] ^ (~C[3] & C[4]); 582 R[1][3] = C[3] ^ (~C[4] & C[0]); 583 R[1][4] = C[4] ^ (~C[0] & C[1]); 584 #endif 585 586 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 587 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 588 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 589 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 590 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 591 592 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 593 R[2][0] = C[0] ^ ( C[1] | C[2]); 594 R[2][1] = C[1] ^ ( C[2] & C[3]); 595 R[2][2] = C[2] ^ (~C[3] & C[4]); 596 R[2][3] = ~C[3] ^ ( C[4] | C[0]); 597 R[2][4] = C[4] ^ ( C[0] & C[1]); 598 #else 599 R[2][0] = C[0] ^ (~C[1] & C[2]); 600 R[2][1] = C[1] ^ (~C[2] & C[3]); 601 R[2][2] = C[2] ^ (~C[3] & C[4]); 602 R[2][3] = C[3] ^ (~C[4] & C[0]); 603 R[2][4] = C[4] ^ (~C[0] & C[1]); 604 #endif 605 606 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 607 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 608 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 609 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 610 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 611 612 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 613 R[3][0] = C[0] ^ ( C[1] & C[2]); 614 R[3][1] = C[1] ^ ( C[2] | C[3]); 615 R[3][2] = C[2] ^ (~C[3] | C[4]); 616 R[3][3] = ~C[3] ^ ( C[4] & C[0]); 617 R[3][4] = C[4] ^ ( C[0] | C[1]); 618 #else 619 R[3][0] = C[0] ^ (~C[1] & C[2]); 620 R[3][1] = C[1] ^ (~C[2] & C[3]); 621 R[3][2] = C[2] ^ (~C[3] & C[4]); 622 R[3][3] = C[3] ^ (~C[4] & C[0]); 623 R[3][4] = C[4] ^ (~C[0] & C[1]); 624 #endif 625 626 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 627 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 628 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 629 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 630 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 631 632 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 633 R[4][0] = C[0] ^ (~C[1] & C[2]); 634 R[4][1] = ~C[1] ^ ( C[2] | C[3]); 635 R[4][2] = C[2] ^ ( C[3] & C[4]); 636 R[4][3] = C[3] ^ ( C[4] | C[0]); 637 R[4][4] = C[4] ^ ( C[0] & C[1]); 638 #else 639 R[4][0] = C[0] ^ (~C[1] & C[2]); 640 R[4][1] = C[1] ^ (~C[2] & C[3]); 641 R[4][2] = C[2] ^ (~C[3] & C[4]); 642 R[4][3] = C[3] ^ (~C[4] & C[0]); 643 R[4][4] = C[4] ^ (~C[0] & C[1]); 644 #endif 645 } 646 647 static void KeccakF1600(uint64_t A[5][5]) 648 { 649 uint64_t T[5][5]; 650 size_t i; 651 652 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 653 A[0][1] = ~A[0][1]; 654 A[0][2] = ~A[0][2]; 655 A[1][3] = ~A[1][3]; 656 A[2][2] = ~A[2][2]; 657 A[3][2] = ~A[3][2]; 658 A[4][0] = ~A[4][0]; 659 #endif 660 661 for (i = 0; i < 24; i += 2) { 662 Round(T, A, i); 663 Round(A, T, i + 1); 664 } 665 666 #ifdef KECCAK_COMPLEMENTING_TRANSFORM 667 A[0][1] = ~A[0][1]; 668 A[0][2] = ~A[0][2]; 669 A[1][3] = ~A[1][3]; 670 A[2][2] = ~A[2][2]; 671 A[3][2] = ~A[3][2]; 672 A[4][0] = ~A[4][0]; 673 #endif 674 } 675 676 #else /* define KECCAK_INPLACE to compile this code path */ 677 /* 678 * This implementation is KECCAK_1X from above combined 4 times with 679 * a twist that allows to omit temporary storage and perform in-place 680 * processing. It's discussed in section 2.5 of "Keccak implementation 681 * overview". It's likely to be best suited for processors with large 682 * register bank... On the other hand processor with large register 683 * bank can as well use KECCAK_1X_ALT, it would be as fast but much 684 * more compact... 685 */ 686 static void FourRounds(uint64_t A[5][5], size_t i) 687 { 688 uint64_t B[5], C[5], D[5]; 689 690 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4)); 691 692 /* Round 4*n */ 693 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; 694 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; 695 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; 696 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; 697 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; 698 699 D[0] = ROL64(C[1], 1) ^ C[4]; 700 D[1] = ROL64(C[2], 1) ^ C[0]; 701 D[2] = ROL64(C[3], 1) ^ C[1]; 702 D[3] = ROL64(C[4], 1) ^ C[2]; 703 D[4] = ROL64(C[0], 1) ^ C[3]; 704 705 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 706 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); 707 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); 708 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); 709 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); 710 711 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i]; 712 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]); 713 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]); 714 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]); 715 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]); 716 717 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 718 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 719 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 720 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 721 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 722 723 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 724 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); 725 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); 726 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 727 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 728 729 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 730 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 731 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 732 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 733 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 734 735 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 736 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 737 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); 738 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 739 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); 740 741 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 742 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 743 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 744 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 745 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 746 747 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 748 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); 749 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 750 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); 751 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 752 753 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 754 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 755 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 756 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 757 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 758 759 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 760 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 761 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 762 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); 763 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); 764 765 /* Round 4*n+1 */ 766 D[0] = ROL64(C[1], 1) ^ C[4]; 767 D[1] = ROL64(C[2], 1) ^ C[0]; 768 D[2] = ROL64(C[3], 1) ^ C[1]; 769 D[3] = ROL64(C[4], 1) ^ C[2]; 770 D[4] = ROL64(C[0], 1) ^ C[3]; 771 772 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 773 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]); 774 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]); 775 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]); 776 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]); 777 778 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1]; 779 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]); 780 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]); 781 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]); 782 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]); 783 784 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]); 785 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 786 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]); 787 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]); 788 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]); 789 790 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 791 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); 792 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 793 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); 794 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 795 796 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]); 797 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]); 798 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 799 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]); 800 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]); 801 802 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 803 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); 804 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); 805 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 806 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 807 808 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]); 809 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]); 810 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]); 811 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 812 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]); 813 814 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 815 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 816 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 817 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); 818 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); 819 820 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]); 821 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]); 822 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]); 823 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]); 824 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 825 826 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 827 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 828 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); 829 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 830 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); 831 832 /* Round 4*n+2 */ 833 D[0] = ROL64(C[1], 1) ^ C[4]; 834 D[1] = ROL64(C[2], 1) ^ C[0]; 835 D[2] = ROL64(C[3], 1) ^ C[1]; 836 D[3] = ROL64(C[4], 1) ^ C[2]; 837 D[4] = ROL64(C[0], 1) ^ C[3]; 838 839 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 840 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]); 841 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]); 842 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]); 843 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]); 844 845 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2]; 846 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]); 847 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]); 848 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]); 849 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]); 850 851 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]); 852 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 853 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]); 854 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]); 855 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]); 856 857 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); 858 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); 859 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); 860 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); 861 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); 862 863 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]); 864 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]); 865 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 866 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]); 867 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]); 868 869 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); 870 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); 871 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); 872 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); 873 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); 874 875 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]); 876 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]); 877 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]); 878 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 879 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]); 880 881 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); 882 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); 883 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); 884 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); 885 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); 886 887 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]); 888 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]); 889 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]); 890 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]); 891 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 892 893 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); 894 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); 895 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); 896 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); 897 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); 898 899 /* Round 4*n+3 */ 900 D[0] = ROL64(C[1], 1) ^ C[4]; 901 D[1] = ROL64(C[2], 1) ^ C[0]; 902 D[2] = ROL64(C[3], 1) ^ C[1]; 903 D[3] = ROL64(C[4], 1) ^ C[2]; 904 D[4] = ROL64(C[0], 1) ^ C[3]; 905 906 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ 907 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]); 908 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]); 909 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]); 910 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]); 911 912 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3]; 913 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]); 914 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]); 915 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]); 916 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]); 917 918 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]); 919 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 920 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]); 921 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]); 922 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]); 923 924 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]); 925 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]); 926 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]); 927 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]); 928 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]); 929 930 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]); 931 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]); 932 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 933 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]); 934 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]); 935 936 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]); 937 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]); 938 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]); 939 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]); 940 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]); 941 942 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]); 943 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]); 944 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]); 945 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 946 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]); 947 948 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]); 949 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]); 950 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]); 951 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]); 952 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]); 953 954 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]); 955 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]); 956 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]); 957 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]); 958 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 959 960 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]); 961 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]); 962 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]); 963 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]); 964 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]); 965 } 966 967 static void KeccakF1600(uint64_t A[5][5]) 968 { 969 size_t i; 970 971 for (i = 0; i < 24; i += 4) { 972 FourRounds(A, i); 973 } 974 } 975 976 #endif 977 978 static uint64_t BitInterleave(uint64_t Ai) 979 { 980 if (BIT_INTERLEAVE) { 981 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai; 982 uint32_t t0, t1; 983 984 t0 = lo & 0x55555555; 985 t0 |= t0 >> 1; t0 &= 0x33333333; 986 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f; 987 t0 |= t0 >> 4; t0 &= 0x00ff00ff; 988 t0 |= t0 >> 8; t0 &= 0x0000ffff; 989 990 t1 = hi & 0x55555555; 991 t1 |= t1 >> 1; t1 &= 0x33333333; 992 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f; 993 t1 |= t1 >> 4; t1 &= 0x00ff00ff; 994 t1 |= t1 >> 8; t1 <<= 16; 995 996 lo &= 0xaaaaaaaa; 997 lo |= lo << 1; lo &= 0xcccccccc; 998 lo |= lo << 2; lo &= 0xf0f0f0f0; 999 lo |= lo << 4; lo &= 0xff00ff00; 1000 lo |= lo << 8; lo >>= 16; 1001 1002 hi &= 0xaaaaaaaa; 1003 hi |= hi << 1; hi &= 0xcccccccc; 1004 hi |= hi << 2; hi &= 0xf0f0f0f0; 1005 hi |= hi << 4; hi &= 0xff00ff00; 1006 hi |= hi << 8; hi &= 0xffff0000; 1007 1008 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0); 1009 } 1010 1011 return Ai; 1012 } 1013 1014 static uint64_t BitDeinterleave(uint64_t Ai) 1015 { 1016 if (BIT_INTERLEAVE) { 1017 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai; 1018 uint32_t t0, t1; 1019 1020 t0 = lo & 0x0000ffff; 1021 t0 |= t0 << 8; t0 &= 0x00ff00ff; 1022 t0 |= t0 << 4; t0 &= 0x0f0f0f0f; 1023 t0 |= t0 << 2; t0 &= 0x33333333; 1024 t0 |= t0 << 1; t0 &= 0x55555555; 1025 1026 t1 = hi << 16; 1027 t1 |= t1 >> 8; t1 &= 0xff00ff00; 1028 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0; 1029 t1 |= t1 >> 2; t1 &= 0xcccccccc; 1030 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa; 1031 1032 lo >>= 16; 1033 lo |= lo << 8; lo &= 0x00ff00ff; 1034 lo |= lo << 4; lo &= 0x0f0f0f0f; 1035 lo |= lo << 2; lo &= 0x33333333; 1036 lo |= lo << 1; lo &= 0x55555555; 1037 1038 hi &= 0xffff0000; 1039 hi |= hi >> 8; hi &= 0xff00ff00; 1040 hi |= hi >> 4; hi &= 0xf0f0f0f0; 1041 hi |= hi >> 2; hi &= 0xcccccccc; 1042 hi |= hi >> 1; hi &= 0xaaaaaaaa; 1043 1044 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0); 1045 } 1046 1047 return Ai; 1048 } 1049 1050 /* 1051 * SHA3_absorb can be called multiple times, but at each invocation 1052 * largest multiple of |r| out of |len| bytes are processed. Then 1053 * remaining amount of bytes is returned. This is done to spare caller 1054 * trouble of calculating the largest multiple of |r|. |r| can be viewed 1055 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104, 1056 * 72, but can also be (1600 - 448)/8 = 144. All this means that message 1057 * padding and intermediate sub-block buffering, byte- or bitwise, is 1058 * caller's responsibility. 1059 */ 1060 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, 1061 size_t r) 1062 { 1063 uint64_t *A_flat = (uint64_t *)A; 1064 size_t i, w = r / 8; 1065 1066 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); 1067 1068 while (len >= r) { 1069 for (i = 0; i < w; i++) { 1070 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | 1071 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | 1072 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | 1073 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56; 1074 inp += 8; 1075 1076 A_flat[i] ^= BitInterleave(Ai); 1077 } 1078 KeccakF1600(A); 1079 len -= r; 1080 } 1081 1082 return len; 1083 } 1084 1085 /* 1086 * SHA3_squeeze is called once at the end to generate |out| hash value 1087 * of |len| bytes. 1088 */ 1089 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r) 1090 { 1091 uint64_t *A_flat = (uint64_t *)A; 1092 size_t i, w = r / 8; 1093 1094 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); 1095 1096 while (len != 0) { 1097 for (i = 0; i < w && len != 0; i++) { 1098 uint64_t Ai = BitDeinterleave(A_flat[i]); 1099 1100 if (len < 8) { 1101 for (i = 0; i < len; i++) { 1102 *out++ = (unsigned char)Ai; 1103 Ai >>= 8; 1104 } 1105 return; 1106 } 1107 1108 out[0] = (unsigned char)(Ai); 1109 out[1] = (unsigned char)(Ai >> 8); 1110 out[2] = (unsigned char)(Ai >> 16); 1111 out[3] = (unsigned char)(Ai >> 24); 1112 out[4] = (unsigned char)(Ai >> 32); 1113 out[5] = (unsigned char)(Ai >> 40); 1114 out[6] = (unsigned char)(Ai >> 48); 1115 out[7] = (unsigned char)(Ai >> 56); 1116 out += 8; 1117 len -= 8; 1118 } 1119 if (len) 1120 KeccakF1600(A); 1121 } 1122 } 1123 #endif 1124 1125 #ifdef SELFTEST 1126 /* 1127 * Post-padding one-shot implementations would look as following: 1128 * 1129 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8); 1130 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8); 1131 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8); 1132 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8); 1133 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8); 1134 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8); 1135 */ 1136 1137 void SHA3_sponge(const unsigned char *inp, size_t len, 1138 unsigned char *out, size_t d, size_t r) 1139 { 1140 uint64_t A[5][5]; 1141 1142 memset(A, 0, sizeof(A)); 1143 SHA3_absorb(A, inp, len, r); 1144 SHA3_squeeze(A, out, d, r); 1145 } 1146 1147 # include <stdio.h> 1148 1149 int main() 1150 { 1151 /* 1152 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing 1153 */ 1154 unsigned char test[168] = { '\xf3', '\x3' }; 1155 unsigned char out[512]; 1156 size_t i; 1157 static const unsigned char result[512] = { 1158 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B, 1159 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF, 1160 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98, 1161 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA, 1162 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB, 1163 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C, 1164 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C, 1165 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D, 1166 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD, 1167 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F, 1168 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC, 1169 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C, 1170 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6, 1171 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE, 1172 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3, 1173 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64, 1174 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6, 1175 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5, 1176 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38, 1177 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05, 1178 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98, 1179 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2, 1180 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50, 1181 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75, 1182 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0, 1183 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04, 1184 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8, 1185 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03, 1186 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9, 1187 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B, 1188 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B, 1189 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73, 1190 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6, 1191 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C, 1192 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D, 1193 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20, 1194 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D, 1195 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F, 1196 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A, 1197 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15, 1198 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D, 1199 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6, 1200 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2, 1201 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D, 1202 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30, 1203 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C, 1204 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E, 1205 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28, 1206 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A, 1207 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25, 1208 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28, 1209 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C, 1210 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41, 1211 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B, 1212 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C, 1213 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C, 1214 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA, 1215 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F, 1216 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A, 1217 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60, 1218 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6, 1219 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50, 1220 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF, 1221 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9 1222 }; 1223 1224 test[167] = '\x80'; 1225 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test)); 1226 1227 /* 1228 * Rationale behind keeping output [formatted as below] is that 1229 * one should be able to redirect it to a file, then copy-n-paste 1230 * final "output val" from official example to another file, and 1231 * compare the two with diff(1). 1232 */ 1233 for (i = 0; i < sizeof(out);) { 1234 printf("%02X", out[i]); 1235 printf(++i % 16 && i != sizeof(out) ? " " : "\n"); 1236 } 1237 1238 if (memcmp(out,result,sizeof(out))) { 1239 fprintf(stderr,"failure\n"); 1240 return 1; 1241 } else { 1242 fprintf(stderr,"success\n"); 1243 return 0; 1244 } 1245 } 1246 #endif 1247