/* $OpenBSD: gcm128.c,v 1.22 2018/01/24 23:03:37 kettenis Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)

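/*
 * Added commentary on REDUCE1BIT: GCM works in GF(2^128) with the
 * reduction polynomial x^128 + x^7 + x^2 + x + 1, in a bit-reflected
 * representation (the lowest-order coefficient lives in the most
 * significant bit of the first byte).  In that representation a
 * one-bit right shift of the 128-bit value V multiplies it by x, and
 * the coefficient shifted out at V.lo&1 is folded back in by XORing
 * the constant 0xe1000000..00 - the reflected image of the reduction
 * polynomial - into the top bits.  Schematically:
 *
 *	V*x = (V>>1) ^ (V&1 ? 0xe1...0 : 0)
 */
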
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification.
 * In other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it's
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, it's possible to attempt to deduce
 * the secret parameter H and, if successful, to tamper with messages
 * [which is nothing but trivial in CTR mode]. In the "Shoup's" case
 * this is not as easy, but there is no reason to believe the approach
 * is resistant to cache-timing attacks either. As for the "8-bit"
 * implementation, it consumes 16 (sixteen) times more memory, 4KB per
 * individual key + 1KB shared. On the pro side, it should be twice as
 * fast as the "4-bit" version: for gcc-generated x86[_64] code the
 * "8-bit" version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it's believed to provide a better security/performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc() would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */

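/*
 * Added commentary, for reference: GHASH authenticates a sequence of
 * 16-byte blocks B_1..B_n (the AAD, then the ciphertext, then a final
 * block carrying the bit lengths) by iterating
 *
 *	X_0 = 0,	X_i = (X_{i-1} ^ B_i) * H	in GF(2^128),
 *
 * where H is the hash key.  Everything below is one of several ways to
 * evaluate the "* H" step: gcm_gmult_*() multiplies the accumulator Xi
 * by H in place, while gcm_ghash_*() fuses the XOR of input blocks
 * with the multiplication for better throughput.
 */
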
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

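/*
 * Added commentary: the first loop above stores the single-bit
 * multiples - Htable[128] = H, and each halving of the index is one
 * more multiplication by x via REDUCE1BIT.  Because multiplication in
 * GF(2^128) is linear over XOR, every remaining entry follows as
 * Htable[i|j] = Htable[i] ^ Htable[j] for disjoint bits i and j, which
 * is what the second loop fills in.  The net effect is Htable[n] = n*H
 * for every 8-bit value n, so gcm_gmult_8bit() below can consume Xi
 * one byte at a time.
 */
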
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_8bit[rem];
#else
		Z.hi ^= (u64)rem_8bit[rem]<<32;
#endif
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi>>32);	PUTU32(p,v);
	v = (u32)(Z.hi);	PUTU32(p+4,v);
	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
	v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4

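/*
 * Added commentary: the 4-bit ("Shoup's") variant keeps only 16 table
 * entries, Htable[n] = n*H for each 4-bit n, built the same way as the
 * 8-bit table - Htable[8] = H, each halving of the index is one more
 * multiplication by x, and the remaining entries follow by
 * XOR-linearity.  A 16-byte block is then consumed as 32 nibbles (the
 * high and low half of each byte), at 256 bytes of table per key
 * instead of 4KB.
 */
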
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;
#if BYTE_ORDER == LITTLE_ENDIAN
	for (j=0;j<16;++j) {
		V = Htable[j];
		Htable[j].hi = V.lo;
		Htable[j].lo = V.hi;
	}
#else /* BIG_ENDIAN */
	for (j=0;j<16;++j) {
		V = Htable[j];
		Htable[j].hi = V.lo<<32|V.lo>>32;
		Htable[j].lo = V.hi<<32|V.hi>>32;
	}
#endif
	}
#endif
}

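/*
 * Added commentary: rem_4bit below plays the same role per nibble that
 * rem_8bit plays per byte in the 8-bit path - it precomputes the
 * reduced contribution of the four coefficients that a 4-bit right
 * shift of the accumulator pushes past x^127, so reduction costs one
 * table lookup per nibble.  PACK() places the 16-bit entries in the
 * top bits of a size_t so that on 64-bit machines the XOR lands
 * directly on the high end of Z.hi.
 */
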
#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem]<<32;
#endif
		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)	break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem]<<32;
#endif
		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi>>32);	PUTU32(p,v);
	v = (u32)(Z.hi);	PUTU32(p+4,v);
	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
	v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}

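/*
 * Added commentary: gcm_gmult_4bit() above is Horner evaluation over
 * nibbles.  The 32 nibbles of Xi are consumed from byte 15 down to
 * byte 0, low half of each byte before the high half, and each step
 * computes
 *
 *	Z = (Z * x^4 mod P) ^ Htable[n]
 *
 * where the "* x^4 mod P" is the 4-bit shift of Z plus the rem_4bit
 * lookup on the four bits shifted out.
 */
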
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt  = 15;
		nlo  = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem  = (size_t)Z.lo&0xf;
			Z.lo = (Z.hi<<60)|(Z.lo>>4);
			Z.hi = (Z.hi>>4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem]<<32;
#endif
			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt<0)	break;

			nlo  = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi  = nlo>>4;
			nlo &= 0xf;

			rem  = (size_t)Z.lo&0xf;
			Z.lo = (Z.hi<<60)|(Z.lo>>4);
			Z.hi = (Z.hi>>4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem]<<32;
#endif
			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
		/*
		 * An extra 256+16 bytes per key plus 512 bytes of shared
		 * tables [should] give a ~50% improvement... One could have
		 * PACK()-ed the rem_8bit even here, but the priority is to
		 * minimize the cache footprint...
		 */
		u128 Hshr4[16];	/* Htable shifted right by 4 bits */
		u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
		static const unsigned short rem_8bit[256] = {
			0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
			0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
			0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
			0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
			0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
			0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
			0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
			0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
			0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
			0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
			0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
			0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
			0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
			0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
			0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
			0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
			0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
			0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
			0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
			0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
			0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
			0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
			0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
			0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
			0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
			0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
			0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
			0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
			0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
			0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
			0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
			0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
		/*
		 * This pre-processing phase slows the procedure down by
		 * approximately the same amount of time as it makes each loop
		 * spin faster. In other words single-block performance is
		 * approximately the same as for the straightforward "4-bit"
		 * implementation, and from there it only gets faster...
		 */
		for (cnt=0; cnt<16; ++cnt) {
			Z.hi = Htable[cnt].hi;
			Z.lo = Htable[cnt].lo;
			Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
			Hshr4[cnt].hi = (Z.hi>>4);
			Hshl4[cnt]    = (u8)(Z.lo<<4);
		}

		do {
			for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
				nlo  = ((const u8 *)Xi)[cnt];
				nlo ^= inp[cnt];
				nhi  = nlo>>4;
				nlo &= 0xf;

				Z.hi ^= Htable[nlo].hi;
				Z.lo ^= Htable[nlo].lo;

				rem = (size_t)Z.lo&0xff;

				Z.lo = (Z.hi<<56)|(Z.lo>>8);
				Z.hi = (Z.hi>>8);

				Z.hi ^= Hshr4[nhi].hi;
				Z.lo ^= Hshr4[nhi].lo;
				Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
			}

			nlo  = ((const u8 *)Xi)[0];
			nlo ^= inp[0];
			nhi  = nlo>>4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo&0xf;

			Z.lo = (Z.hi<<60)|(Z.lo>>4);
			Z.hi = (Z.hi>>4);

			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;
			Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
#else /* BIG_ENDIAN */
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
#endif
	} while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the
 * cache-thrashing effect. In other words the idea is to hash data
 * while it's still in the L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif

#else /* TABLE_BITS */

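/*
 * Added commentary: with TABLE_BITS==1 there are no tables at all.
 * gcm_gmult_1bit() below is the schoolbook bit-by-bit multiplication:
 * for each of the 128 bits of Xi it conditionally accumulates V into Z
 * through a sign-extended mask, then advances V = V*x with REDUCE1BIT.
 * Smallest footprint, slowest option - which is why the note at the
 * top rules the value 1 out on performance grounds.
 */
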
static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int i,j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
#if BYTE_ORDER == LITTLE_ENDIAN
#if SIZE_MAX == 0xffffffffffffffff
#ifdef BSWAP8
		X = (long)(BSWAP8(xi[j]));
#else
		const u8 *p = (const u8 *)(xi+j);
		X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
#else
		const u8 *p = (const u8 *)(xi+j);
		X = (long)GETU32(p);
#endif
#else /* BIG_ENDIAN */
		X = xi[j];
#endif

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	Xi[0] = BSWAP8(Z.hi);
	Xi[1] = BSWAP8(Z.lo);
#else
	u8 *p = (u8 *)Xi;
	u32 v;
	v = (u32)(Z.hi>>32);	PUTU32(p,v);
	v = (u32)(Z.hi);	PUTU32(p+4,v);
	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
	v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
#else /* BIG_ENDIAN */
	Xi[0] = Z.hi;
	Xi[1] = Z.lo;
#endif
}
#define GCM_MUL(ctx,Xi)	gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if defined(GHASH_ASM) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#include "x86_arch.h"
#endif

#if TABLE_BITS==4 && defined(GHASH_ASM)
# if (defined(__i386)		|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef  GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

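/*
 * Added commentary: CRYPTO_gcm128_init() derives the hash key as
 * H = E_K(0^128) by running the block cipher once over the zeroed
 * context (the table-driven code wants H in host byte order, hence the
 * byte swap below), precomputes Htable, and - where assembler support
 * was detected at build time - binds ctx->gmult/ctx->ghash to the
 * fastest implementation the CPU supports.
 */
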
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

#if BYTE_ORDER == LITTLE_ENDIAN
	/* H is stored in host byte order */
#ifdef BSWAP8
	ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
	ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
	u8 *p = ctx->H.c;
	u64 hi,lo;
	hi = (u64)GETU32(p)  <<32|GETU32(p+4);
	lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
	ctx->H.u[0] = hi;
	ctx->H.u[1] = lo;
#endif
#endif

#if TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if defined(GHASH_ASM_X86)	/* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}

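/*
 * Added commentary: CRYPTO_gcm128_setiv() computes the pre-counter
 * block exactly as in NIST SP 800-38D: for the recommended 96-bit IV
 * it is simply IV || 0^31 || 1, for any other length it is GHASH over
 * the zero-padded IV followed by a block carrying the IV bit length.
 * EK0 = E_K of that block is saved for the final tag computation, and
 * the 32-bit counter in the last four bytes of Yi starts one past it.
 */
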
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
		ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
		ctx->Yi.c[8]  ^= (u8)(len0>>56);
		ctx->Yi.c[9]  ^= (u8)(len0>>48);
		ctx->Yi.c[10] ^= (u8)(len0>>40);
		ctx->Yi.c[11] ^= (u8)(len0>>32);
		ctx->Yi.c[12] ^= (u8)(len0>>24);
		ctx->Yi.c[13] ^= (u8)(len0>>16);
		ctx->Yi.c[14] ^= (u8)(len0>>8);
		ctx->Yi.c[15] ^= (u8)(len0);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.u[1]  ^= len0;
#endif

		GCM_MUL(ctx,Yi);

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
#else /* BIG_ENDIAN */
		ctr = ctx->Yi.d[3];
#endif
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctx->Yi.d[3] = BSWAP4(ctr);
#else
	PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
	ctx->Yi.d[3] = ctr;
#endif
}

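/*
 * Added commentary: AAD is absorbed straight into the GHASH
 * accumulator.  CRYPTO_gcm128_aad() may be called repeatedly with
 * partial data (ctx->ares tracks the residue of an incomplete block),
 * but only before any payload has been en/decrypted - it returns -2
 * once ctx->len.u[1] is non-zero - and it returns -1 if the total AAD
 * exceeds 2^61 bytes (2^64 bits).
 */
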
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
	    const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}

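/*
 * Added commentary: encryption is CTR mode with the 32-bit big-endian
 * counter in the last four bytes of Yi, with GHASH run over the
 * produced ciphertext.  The SP 800-38D single-message limit follows
 * from that 32-bit counter: at most 2^36 - 32 bytes (2^39 - 256 bits)
 * of plaintext, which is what the mlen check below enforces.
 */
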
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
	    const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c+12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in  += 16;
				j   -= 16;
			}
			GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			size_t j=i;

			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				out += 16;
				in  += 16;
				len -= 16;
			}
			GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				    out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

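/*
 * Added commentary: decryption mirrors encryption with one ordering
 * difference that matters - the *ciphertext* is what GHASH must see,
 * so here each block is hashed (or folded into Xi) before or as it is
 * decrypted, whereas the encrypt path hashes its output afterwards.
 */
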
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
	    const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c+12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;

			GHASH(ctx,in,GHASH_CHUNK);
			while (j) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in  += 16;
				j   -= 16;
			}
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
			GHASH(ctx,in,i);
			while (len>=16) {
				size_t *out_t=(size_t *)out;
				const size_t *in_t=(const size_t *)in;

				(*block)(ctx->Yi.c,ctx->EKi.c,key);
				++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
				ctx->Yi.d[3] = ctr;
#endif
				for (i=0; i<16/sizeof(size_t); ++i)
					out_t[i] = in_t[i]^ctx->EKi.t[i];
				out += 16;
				in  += 16;
				len -= 16;
			}
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
			/* hash the ciphertext word, then decrypt it */
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in_t[i];
				out_t[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
			ctx->Yi.d[3] = ctr;
#endif
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

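/*
 * Added commentary: the *_ctr32 variants below take a ctr128_f bulk
 * CTR kernel (e.g. a hardware-accelerated routine) that encrypts a
 * whole run of counter blocks per call; per the ctr128_f convention
 * only the low 32 bits of the counter block are incremented, which
 * matches GCM's counter layout exactly.  The GHASH side is unchanged.
 */
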
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
	    const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c+12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
	    const u8 *inp,size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
	ctr = BSWAP4(ctx->Yi.d[3]);
#else
	ctr = GETU32(ctx->Yi.c+12);
#endif
#else /* BIG_ENDIAN */
	ctr = ctx->Yi.d[3];
#endif

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
#else /* BIG_ENDIAN */
		ctx->Yi.d[3] = ctr;
#endif
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    size_t len)
{
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

#if BYTE_ORDER == LITTLE_ENDIAN
#ifdef BSWAP8
	alen = BSWAP8(alen);
	clen = BSWAP8(clen);
#else
	{
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
	}
#endif
#endif

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	freezero(ctx, sizeof(*ctx));
}
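
/*
 * Added usage sketch, illustrative only: one-shot AES-128-GCM
 * encryption with the API above, assuming <openssl/aes.h> for the
 * block cipher.  The GCM128_EXAMPLE guard is hypothetical - it is not
 * defined anywhere, so this code is never compiled into the library.
 * For verification, a caller would instead pass the received tag to
 * CRYPTO_gcm128_finish() and treat any non-zero return as failure.
 */
#if defined(GCM128_EXAMPLE)
#include <openssl/aes.h>

static int
gcm128_example(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aad_len,
    const unsigned char *pt, unsigned char *ct, size_t pt_len,
    unsigned char tag[16])
{
	AES_KEY ks;
	GCM128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &ks) != 0)
		return -1;
	/* AES_encrypt matches block128_f up to the key-pointer type. */
	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx, iv, 12);	/* 96-bit IV: fast path */
	if (CRYPTO_gcm128_aad(&ctx, aad, aad_len) != 0)
		return -1;
	if (CRYPTO_gcm128_encrypt(&ctx, pt, ct, pt_len) != 0)
		return -1;
	CRYPTO_gcm128_tag(&ctx, tag, 16);
	return 0;
}
#endif /* GCM128_EXAMPLE */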