/* $OpenBSD: gcm128.c,v 1.26 2023/08/10 07:18:43 jsing Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)

/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
 * to as "Shoup's" in the GCM specification; OpenSSL does not cover the
 * whole spectrum of possible table-driven implementations. Why? In the
 * non-"Shoup's" case the memory access pattern is segmented in such a
 * way that cache-timing information can trivially reveal a fair portion
 * of the intermediate hash value. Since the ciphertext is always
 * available to an attacker, this lets them attempt to deduce the secret
 * parameter H and, if successful, tamper with messages [which is
 * entirely trivial in CTR mode]. In the "Shoup's" case it is not as
 * easy, but there is no reason to believe that it is resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual key plus
 * 1KB shared. On the plus side it should be about twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it is believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc() would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
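 *
 * How the 4-bit tables are used (a summary of the code below, not a
 * normative description): for every 4-bit value i, Htable[i] holds i*H
 * in GF(2^128) (in GCM's reflected bit order), and gcm_gmult_4bit()
 * consumes Xi one nibble at a time, from the last byte towards the
 * first, shifting the 128-bit accumulator right by 4 bits per step and
 * folding the bits shifted out back in via rem_4bit[], a table of
 * precomputed reductions modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1.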
 */
#if	TABLE_BITS==8

static void
gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 256; i <<= 1) {
		u128 *Hi = Htable + i, H0 = *Hi;
		for (j = 1; j < i; ++j) {
			Hi[j].hi = H0.hi ^ Htable[j].hi;
			Hi[j].lo = H0.lo ^ Htable[j].lo;
		}
	}
}

static void
gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi + 15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi == xi)
			break;

		n = *(--xi);

		rem = (size_t)Z.lo & 0xff;
		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
		Z.hi = (Z.hi >> 8);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_8bit[rem];
#else
		Z.hi ^= (u64)rem_8bit[rem] << 32;
#endif
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)	gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void
gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 16; i <<= 1) {
		u128 *Hi = Htable + i;
		int j;
		for (V = *Hi, j = 1; j < i; ++j) {
			Hi[j].hi = V.hi ^ Htable[j].hi;
			Hi[j].lo = V.lo ^ Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
	V = Htable[4];
	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
	V = Htable[8];
	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
	Htable[10].hi = V.hi ^ Htable[2].hi,
	    Htable[10].lo = V.lo ^ Htable[2].lo;
	Htable[11].hi = V.hi ^ Htable[3].hi,
	    Htable[11].lo = V.lo ^ Htable[3].lo;
	Htable[12].hi = V.hi ^ Htable[4].hi,
	    Htable[12].lo = V.lo ^ Htable[4].lo;
	Htable[13].hi = V.hi ^ Htable[5].hi,
	    Htable[13].lo = V.lo ^ Htable[5].lo;
	Htable[14].hi = V.hi ^ Htable[6].hi,
	    Htable[14].lo = V.lo ^ Htable[6].lo;
	Htable[15].hi = V.hi ^ Htable[7].hi,
	    Htable[15].lo = V.lo ^ Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
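	 * (On little-endian hosts the two 64-bit halves of each entry are
	 * swapped; on big-endian hosts the halves are swapped and the two
	 * 32-bit words within each half are exchanged as well, as done
	 * below.)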
	 */
	{
		int j;
#if BYTE_ORDER == LITTLE_ENDIAN
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
#else /* BIG_ENDIAN */
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo << 32|V.lo >> 32;
			Htable[j].lo = V.hi << 32|V.hi >> 32;
		}
#endif
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void
gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo = ((const u8 *)Xi)[15];
	nhi = nlo >> 4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt < 0)
			break;

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo >> 4;
		nlo &= 0xf;

		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void
gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt = 15;
		nlo = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt < 0)
				break;

			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
	/*
	 * An extra 256+16 bytes per key plus 512 bytes of shared tables
	 * [should] give ~50% improvement... One could have PACK()-ed
	 * the rem_8bit even here, but the priority is to minimize
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8 Hshl4[16];	/* Htable shifted left by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by
	 * approximately the same amount of time as it makes each loop spin
	 * faster. In other words, single-block performance is approximately
	 * the same as for the straightforward "4-bit" implementation, and
	 * from there on it only gets faster...
	 */
	for (cnt = 0; cnt < 16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
		Hshr4[cnt].hi = (Z.hi >> 4);
		Hshl4[cnt] = (u8)(Z.lo << 4);
	}

	do {
		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo & 0xff;

			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
			Z.hi = (Z.hi >> 8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
		}

		nlo = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo & 0xf;

		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#endif

		Xi[0] = htobe64(Z.hi);
		Xi[1] = htobe64(Z.lo);
	} while (inp += 16, len -= 16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#endif

#define GCM_MUL(ctx,Xi)	gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" intended to mitigate cache
 * thrashing: the idea is to hash data while it is still in L1 cache
 * after the encryption pass... */
#define GHASH_CHUNK	(3*1024)
#endif

#else	/* TABLE_BITS */

static void
gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V, Z = { 0,0 };
	long X;
	int i, j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j = 0; j < 16/sizeof(long); ++j) {
#if BYTE_ORDER == LITTLE_ENDIAN
#if SIZE_MAX == 0xffffffffffffffff
#ifdef BSWAP8
		X = (long)(BSWAP8(xi[j]));
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
#endif
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)GETU32(p);
#endif
#else /* BIG_ENDIAN */
		X = xi[j];
#endif

		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
			u64 M = (u64)(X >> (8*sizeof(long) - 1));
			Z.hi ^= V.hi & M;
			Z.lo ^= V.lo & M;

			REDUCE1BIT(V);
		}
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)	gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if defined(GHASH_ASM) && \
    (defined(__i386) || defined(__i386__) || \
     defined(__x86_64) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#include "x86_arch.h"
#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	(defined(__i386) || defined(__i386__) || \
     defined(__x86_64) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# elif	defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if	__ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void
CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx, 0, sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	(*block)(ctx->H.c, ctx->H.c, key);

	/* H is stored in host byte order */
	ctx->H.u[0] = be64toh(ctx->H.u[0]);
	ctx->H.u[1] = be64toh(ctx->H.u[1]);

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if	defined(GHASH_ASM_X86)		/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
#endif
}
LCRYPTO_ALIAS(CRYPTO_gcm128_init);

void
CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len == 12) {
		memcpy(ctx->Yi.c, iv, 12);
		ctx->Yi.c[15] = 1;
		ctr = 1;
	} else {
		size_t i;
		u64 len0 = len;

		while (len >= 16) {
			for (i = 0; i < 16; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i = 0; i < len; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
		}
		len0 <<= 3;
		ctx->Yi.u[1] ^= htobe64(len0);

		GCM_MUL(ctx, Yi);

		ctr = be32toh(ctx->Yi.d[3]);
	}

	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
	++ctr;
	ctx->Yi.d[3] = htobe32(ctr);
}
LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);

int
CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	if (ctx->len.u[1])
		return -2;

	alen += len;
	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len & (size_t)-16))) {
		GHASH(ctx, aad, i);
		aad += i;
		len -= i;
	}
#else
	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx, Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i = 0; i < len; ++i)
			ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_aad);

int
CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
					    ctx->EKi.c[n];
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				size_t j = i;

				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
				GHASH(ctx, out - j, j);
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i)
					ctx->Xi.t[i] ^=
					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					ctx->Xi.c[n] ^= out[n] = in[n] ^
					    ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);

int
CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					u8 c = *(in++);
					*(out++) = c ^ ctx->EKi.c[n];
					ctx->Xi.c[n] ^= c;
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				GHASH(ctx, in, GHASH_CHUNK);
				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				GHASH(ctx, in, i);
				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i) {
					size_t c = in_t[i];
					out_t[i] = c ^ ctx->EKi.t[i];
					ctx->Xi.t[i] ^= c;
				}
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					u8 c = in[n];
					ctx->Xi.c[n] ^= c;
					out[n] = c ^ ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		u8 c;
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		c = in[i];
		out[i] = c ^ ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);

int
CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		GHASH(ctx, out, GHASH_CHUNK);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		in += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx, out, i);
		out += i;
#else
		while (j--) {
			for (i = 0; i < 16; ++i)
				ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx, Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);

int
CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c ^ ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		GHASH(ctx, in, GHASH_CHUNK);
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

#if defined(GHASH)
		GHASH(ctx, in, i);
#else
		while (j--) {
			size_t k;
			for (k = 0; k < 16; ++k)
				ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx, Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		out += i;
		in += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);

int
CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    size_t len)
{
	u64 alen = ctx->len.u[0] << 3;
	u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= htobe64(alen);
	ctx->Xi.u[1] ^= htobe64(clen);
	GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len <= sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c, tag, len);
	else
		return -1;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_finish);

void
CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c,
	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_tag);

GCM128_CONTEXT *
CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret, key, block);

	return ret;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_new);

void
CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	freezero(ctx, sizeof(*ctx));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_release);
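
/*
 * Example (a sketch, not part of the library): typical use of the
 * CRYPTO_gcm128 API above with an AES key schedule.  AES_set_encrypt_key()
 * and AES_encrypt() come from <openssl/aes.h>; AES_encrypt() matches the
 * block128_f signature, hence the customary cast.  The helper name
 * gcm_seal_example() is hypothetical.
 *
 *	#include <openssl/aes.h>
 *	#include <openssl/modes.h>
 *
 *	int
 *	gcm_seal_example(const unsigned char key[16],
 *	    const unsigned char *iv, size_t iv_len,
 *	    const unsigned char *aad, size_t aad_len,
 *	    const unsigned char *in, unsigned char *out, size_t len,
 *	    unsigned char tag[16])
 *	{
 *		AES_KEY aes;
 *		GCM128_CONTEXT *gcm;
 *
 *		if (AES_set_encrypt_key(key, 128, &aes) != 0)
 *			return -1;
 *		gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
 *		if (gcm == NULL)
 *			return -1;
 *		CRYPTO_gcm128_setiv(gcm, iv, iv_len);
 *		if (CRYPTO_gcm128_aad(gcm, aad, aad_len) != 0 ||
 *		    CRYPTO_gcm128_encrypt(gcm, in, out, len) != 0) {
 *			CRYPTO_gcm128_release(gcm);
 *			return -1;
 *		}
 *		CRYPTO_gcm128_tag(gcm, tag, 16);
 *		CRYPTO_gcm128_release(gcm);
 *		return 0;
 *	}
 */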