/* $OpenBSD: gcm128.c,v 1.12 2015/02/10 09:46:30 miod Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#undef	PUTU32
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo  = (V.hi<<63)|(V.lo>>1); \
			V.hi  = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo  = (V.hi<<63)|(V.lo>>1); \
			V.hi  = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)
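
/*
 * Illustration, not part of the original file: REDUCE1BIT multiplies
 * the field element V by x in GF(2^128) with the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1. In GCM's bit-reflected representation
 * that multiplication is a right shift; whenever the shifted-out bit
 * was set, the polynomial (top byte 0xe1) is folded back into the
 * high word. A minimal single-step sketch, guarded by the
 * hypothetical macro GCM128_REDUCE_DEMO:
 */
#ifdef GCM128_REDUCE_DEMO
static void gcm128_reduce1bit_demo(void)
{
	u128 V;

	V.hi = 0;
	V.lo = 1;	/* low bit set, so the reduction fires */
	REDUCE1BIT(V);
	/* V.hi is now U64(0xe100000000000000) and V.lo is 0 */
}
#endif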

/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words, OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it's
 * possible to attempt to deduce the secret parameter H, and, if
 * successful, to tamper with messages [which is nothing but trivial in
 * CTR mode]. In the "Shoup's" case it's not as trivial, but there is no
 * reason to believe that it's resistant to cache-timing attacks either.
 * As for the "8-bit" implementation, it consumes 16 (sixteen) times
 * more memory, 4KB per individual key + 1KB shared. On the pro side, it
 * should be twice as fast as the "4-bit" version. For gcc-generated
 * x86[_64] code the "8-bit" version was observed to run ~75% faster,
 * closer to 100% for commercial compilers... Yet the "4-bit" procedure
 * is preferred, because it's believed to provide a better
 * security-performance balance and adequate all-round performance.
 * "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   the performance of other code paths (not necessarily even in the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
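
/*
 * Illustration, not part of the original file: the footprint figures
 * quoted above follow directly from the table geometry. A minimal
 * sketch guarded by the hypothetical macro GCM128_FOOTPRINT_DEMO;
 * all names below are illustrative only:
 */
#ifdef GCM128_FOOTPRINT_DEMO
#include <stdio.h>
static void gcm128_footprint_demo(void)
{
	/* "8-bit" Shoup table: 256 u128 entries, i.e. 4KB per key */
	printf("8-bit Htable: %zu bytes\n", 256*sizeof(u128));
	/* "4-bit" Shoup table: 16 u128 entries, i.e. 256 bytes per key */
	printf("4-bit Htable: %zu bytes\n", 16*sizeof(u128));
	/* shared rem_8bit table: 1KB with 32-bit size_t, 2KB with 64-bit */
	printf("rem_8bit:     %zu bytes\n", 256*sizeof(size_t));
}
#endif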

#if	TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128]=V, i=64; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<256; i<<=1) {
		u128 *Hi = Htable+i, H0 = *Hi;
		for (j=1; j<i; ++j) {
			Hi[j].hi = H0.hi^Htable[j].hi;
			Hi[j].lo = H0.lo^Htable[j].lo;
		}
	}
}

static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i=2; i<16; i<<=1) {
		u128 *Hi = Htable+i;
		int   j;
		for (V=*Hi, j=1; j<i; ++j) {
			Hi[j].hi = V.hi^Htable[j].hi;
			Hi[j].lo = V.lo^Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
	V=Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
	V=Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int j;

	if (BYTE_ORDER == LITTLE_ENDIAN)
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
	else
		for (j=0;j<16;++j) {
			V = Htable[j];
			Htable[j].hi = V.lo<<32|V.lo>>32;
			Htable[j].lo = V.hi<<32|V.hi>>32;
		}
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo  = ((const u8 *)Xi)[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)	break;

		nlo  = ((const u8 *)Xi)[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
				const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
    do {
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)	break;

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
	/*
	 * Extra 256+16 bytes per-key plus 512 bytes shared tables
	 * [should] give ~50% improvement... One could have PACK()-ed
	 * the rem_8bit even here, but the priority is to minimize the
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8   Hshl4[16];	/* Htable shifted left by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by
	 * approximately the same amount of time as it makes each loop
	 * spin faster. In other words, single-block performance is
	 * approximately the same as for the straightforward "4-bit"
	 * implementation, and from there it only gets faster...
	 */
	for (cnt=0; cnt<16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
		Hshr4[cnt].hi = (Z.hi>>4);
		Hshl4[cnt]    = (u8)(Z.lo<<4);
	}

    do {
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
    } while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);
#endif

#define GCM_MUL(ctx,Xi)	  gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose purpose is to mitigate
 * cache-thrashing effects. In other words, the idea is to hash data
 * while it's still in L1 cache after the encryption pass...
 */
#define GHASH_CHUNK	  (3*1024)
#endif

#else	/* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V,Z = { 0,0 };
	long X;
	int  i,j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j=0; j<16/sizeof(long); ++j) {
		if (BYTE_ORDER == LITTLE_ENDIAN) {
			if (sizeof(long)==8) {
#ifdef BSWAP8
				X = (long)(BSWAP8(xi[j]));
#else
				const u8 *p = (const u8 *)(xi+j);
				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
			}
			else {
				const u8 *p = (const u8 *)(xi+j);
				X = (long)GETU32(p);
			}
		}
		else
			X = xi[j];

		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
			u64 M = (u64)(X>>(8*sizeof(long)-1));
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;

			REDUCE1BIT(V);
		}
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	!defined(I386_ONLY) && \
	(defined(__i386)	|| defined(__i386__)	|| \
	 defined(__x86_64)	|| defined(__x86_64__)	|| \
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef  GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx,0,sizeof(*ctx));
	ctx->block = block;
	ctx->key   = key;

	(*block)(ctx->H.c,ctx->H.c,key);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
		/* H is stored in host byte order */
#ifdef BSWAP8
		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
		u8 *p = ctx->H.c;
		u64 hi,lo;
		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
		ctx->H.u[0] = hi;
		ctx->H.u[1] = lo;
#endif
	}

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
		gcm_init_clmul(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
#   else
	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable,ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}

void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;
		if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}

int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}

int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#ifdef __STRICT_ALIGNMENT
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i) {
				size_t c = in_t[i];
				out_t[i] = c^ctx->EKi.t[i];
				ctx->Xi.t[i] ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}

int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
			size_t len)
{
	u64 alen = ctx->len.u[0]<<3;
	u64 clen = ctx->len.u[1]<<3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx,Xi);

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		alen = BSWAP8(alen);
		clen = BSWAP8(clen);
#else
		u8 *p = ctx->len.c;

		ctx->len.u[0] = alen;
		ctx->len.u[1] = clen;

		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
	}

	ctx->Xi.u[0] ^= alen;
	ctx->Xi.u[1] ^= clen;
	GCM_MUL(ctx,Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len<=sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c,tag,len);
	else
		return -1;
}

void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret,key,block);

	return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	if (ctx) {
		explicit_bzero(ctx,sizeof(*ctx));
		free(ctx);
	}
}
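
/*
 * Usage sketch (illustration only, not part of the original file):
 * the one-shot sealing flow through this API. AES_KEY and AES_encrypt
 * come from <openssl/aes.h>; the function name and the
 * GCM128_USAGE_DEMO guard are hypothetical.
 */
#ifdef GCM128_USAGE_DEMO
#include <openssl/aes.h>

static int
gcm128_seal_demo(const AES_KEY *aes_key, const unsigned char iv[12],
    const unsigned char *aad, size_t aad_len,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT ctx;

	/* derive H = E_K(0^128) and build the GHASH table */
	CRYPTO_gcm128_init(&ctx, (void *)aes_key, (block128_f)AES_encrypt);
	/* 12-byte IVs take the fast path Y0 = IV || 0^31 || 1 */
	CRYPTO_gcm128_setiv(&ctx, iv, 12);
	/* all AAD must be supplied before any plaintext */
	if (CRYPTO_gcm128_aad(&ctx, aad, aad_len))
		return -1;
	if (CRYPTO_gcm128_encrypt(&ctx, pt, ct, len))
		return -1;
	/* write the (truncatable) 16-byte authentication tag */
	CRYPTO_gcm128_tag(&ctx, tag, 16);
	explicit_bzero(&ctx, sizeof(ctx));
	return 0;
}
#endif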