/* $OpenBSD: gcm128.c,v 1.27 2024/09/06 09:57:32 tb Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <stdlib.h>	/* malloc(), freezero() */
#include <string.h>

#include <openssl/crypto.h>

#include "crypto_internal.h"
#include "modes_local.h"

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef	GETU32
#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
#endif

#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)							\
	do {								\
		if (sizeof(size_t)==8) {				\
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));	\
			V.lo  = (V.hi<<63)|(V.lo>>1);			\
			V.hi  = (V.hi>>1 )^T;				\
		} else {						\
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));	\
			V.lo  = (V.hi<<63)|(V.lo>>1);			\
			V.hi  = (V.hi>>1 )^((u64)T<<32);		\
		}							\
	} while(0)
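
/*
 * REDUCE1BIT multiplies V by x: in GCM's bit-reflected representation
 * this is a one-bit right shift, and the coefficient shifted out of
 * x^127 is folded back in by XORing the constant 0xE1000000..., which
 * encodes x^128 = x^7 + x^2 + x + 1 modulo the GHASH polynomial.
 */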

/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS > 1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words, OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that cache timing
 * information can trivially reveal a fair portion of the intermediate
 * hash value. Given that the ciphertext is always available to an
 * attacker, this would let him attempt to deduce the secret parameter H
 * and, if successful, tamper with messages [which is trivial in CTR
 * mode]. In the "Shoup's" case it is not as easy, but there is no
 * reason to believe the approach is resistant to cache-timing attacks
 * either. The catch with the "8-bit" implementation is that it consumes
 * 16 (sixteen) times more memory, 4KB per individual key + 1KB shared.
 * On the pro side, it should be about twice as fast as the "4-bit"
 * version; for gcc-generated x86[_64] code the "8-bit" version was
 * observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and
 * adequate all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   triggers VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   the performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
#if	TABLE_BITS==8

static void
gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int  i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 256; i <<= 1) {
		u128 *Hi = Htable + i, H0 = *Hi;
		for (j = 1; j < i; ++j) {
			Hi[j].hi = H0.hi ^ Htable[j].hi;
			Hi[j].lo = H0.lo ^ Htable[j].lo;
		}
	}
}

static void
gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi + 15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi == xi)
			break;

		n = *(--xi);

		rem = (size_t)Z.lo & 0xff;
		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
		Z.hi = (Z.hi >> 8);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_8bit[rem];
#else
		Z.hi ^= (u64)rem_8bit[rem] << 32;
#endif
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif	TABLE_BITS==4

static void
gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 16; i <<= 1) {
		u128 *Hi = Htable + i;
		int   j;
		for (V = *Hi, j = 1; j < i; ++j) {
			Hi[j].hi = V.hi ^ Htable[j].hi;
			Hi[j].lo = V.lo ^ Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
	V = Htable[4];
	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
	V = Htable[8];
	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
	Htable[10].hi = V.hi ^ Htable[2].hi,
	    Htable[10].lo = V.lo ^ Htable[2].lo;
	Htable[11].hi = V.hi ^ Htable[3].hi,
	    Htable[11].lo = V.lo ^ Htable[3].lo;
	Htable[12].hi = V.hi ^ Htable[4].hi,
	    Htable[12].lo = V.lo ^ Htable[4].lo;
	Htable[13].hi = V.hi ^ Htable[5].hi,
	    Htable[13].lo = V.lo ^ Htable[5].lo;
	Htable[14].hi = V.hi ^ Htable[6].hi,
	    Htable[14].lo = V.lo ^ Htable[6].lo;
	Htable[15].hi = V.hi ^ Htable[7].hi,
	    Htable[15].lo = V.lo ^ Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
		int j;
#if BYTE_ORDER == LITTLE_ENDIAN
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
#else /* BIG_ENDIAN */
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo << 32|V.lo >> 32;
			Htable[j].lo = V.hi << 32|V.hi >> 32;
		}
#endif
	}
#endif
}
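
/*
 * At this point Htable[] holds the products of H with every 4-bit
 * multiplier: Htable[8] = H, and each REDUCE1BIT step above fills in
 * H*x, H*x^2 and H*x^3 (indices 4, 2 and 1 in the bit-reflected nibble
 * ordering); the remaining entries are their XOR combinations. The
 * multiply routines can then process Xi one nibble at a time with a
 * single table lookup per nibble.
 */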

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void
gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo = ((const u8 *)Xi)[15];
	nhi = nlo >> 4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt < 0)
			break;

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo >> 4;
		nlo &= 0xf;

		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void
gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt = 15;
		nlo = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt < 0)
				break;

			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8 Hshl4[16];	/* Htable shifted left  by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
	for (cnt = 0; cnt < 16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
		Hshr4[cnt].hi = (Z.hi >> 4);
		Hshl4[cnt] = (u8)(Z.lo << 4);
	}

	do {
		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo & 0xff;

			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
			Z.hi = (Z.hi >> 8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
		}

		nlo = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo & 0xf;

		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#endif

		Xi[0] = htobe64(Z.hi);
		Xi[1] = htobe64(Z.lo);
	} while (inp += 16, len -= 16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache
 * thrashing. In other words, the idea is to hash data while it's
 * still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif

#else	/* TABLE_BITS */

static void
gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V, Z = { 0,0 };
	long X;
	int i, j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j = 0; j < 16/sizeof(long); ++j) {
#if BYTE_ORDER == LITTLE_ENDIAN
#if SIZE_MAX == 0xffffffffffffffff
#ifdef BSWAP8
		X = (long)(BSWAP8(xi[j]));
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
#endif
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)GETU32(p);
#endif
#else /* BIG_ENDIAN */
		X = xi[j];
#endif

		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
			u64 M = (u64)(X >> (8*sizeof(long) - 1));
			Z.hi ^= V.hi & M;
			Z.lo ^= V.lo & M;

			REDUCE1BIT(V);
		}
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if	defined(GHASH_ASM) &&						\
	(defined(__i386)	|| defined(__i386__)	||		\
	 defined(__x86_64)	|| defined(__x86_64__)	||		\
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#include "x86_arch.h"
#endif

#if	TABLE_BITS==4 && defined(GHASH_ASM)
# if	(defined(__i386)	|| defined(__i386__)	||		\
	 defined(__x86_64)	|| defined(__x86_64__)	||		\
	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void
CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx, 0, sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	(*block)(ctx->H.c, ctx->H.c, key);

	/* H is stored in host byte order */
	ctx->H.u[0] = be64toh(ctx->H.u[0]);
	ctx->H.u[1] = be64toh(ctx->H.u[1]);

#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif	TABLE_BITS==4
# if	defined(GHASH_ASM_X86_OR_64)
#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((crypto_cpu_caps_ia32() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if	defined(GHASH_ASM_X86)			/* x86 only */
#   if	defined(OPENSSL_IA32_SSE2)
	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif	defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
#endif
}
LCRYPTO_ALIAS(CRYPTO_gcm128_init);
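
/*
 * Any 16-byte block cipher with the block128_f signature can back a
 * GCM128_CONTEXT. As an illustrative sketch (not part of this file),
 * with AES from <openssl/aes.h>:
 *
 *	AES_KEY aes;
 *	GCM128_CONTEXT gcm;
 *
 *	AES_set_encrypt_key(key, 128, &aes);
 *	CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *
 * A fuller usage sketch appears at the end of this file.
 */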

void
CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len == 12) {
		memcpy(ctx->Yi.c, iv, 12);
		ctx->Yi.c[15] = 1;
		ctr = 1;
	} else {
		size_t i;
		u64 len0 = len;

		while (len >= 16) {
			for (i = 0; i < 16; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i = 0; i < len; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
		}
		len0 <<= 3;
		ctx->Yi.u[1] ^= htobe64(len0);

		GCM_MUL(ctx, Yi);

		ctr = be32toh(ctx->Yi.d[3]);
	}

	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
	++ctr;
	ctx->Yi.d[3] = htobe32(ctr);
}
LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
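
/*
 * As specified for GCM: a 96-bit IV is used directly, with the counter
 * block set to IV || 0^31 || 1, while an IV of any other length is
 * first compressed with GHASH, including a final block encoding the IV
 * length in bits. The code above also precomputes EK0, the encrypted
 * initial counter block that CRYPTO_gcm128_finish() XORs into the tag.
 */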

int
CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	if (ctx->len.u[1])
		return -2;

	alen += len;
	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len & (size_t)-16))) {
		GHASH(ctx, aad, i);
		aad += i;
		len -= i;
	}
#else
	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx, Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i = 0; i < len; ++i)
			ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_aad);
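
/*
 * The 2^61-byte bound on the AAD enforced above corresponds to the
 * 2^64-bit limit on additional authenticated data in NIST SP 800-38D.
 */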

int
CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
					    ctx->EKi.c[n];
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				size_t j = i;

				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
				GHASH(ctx, out - j, j);
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i)
					ctx->Xi.t[i] ^=
					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					ctx->Xi.c[n] ^= out[n] = in[n] ^
					    ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);
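
/*
 * The (2^36 - 32)-byte message bound enforced above is the 2^39 - 256
 * bit plaintext limit of NIST SP 800-38D: the 32-bit counter must not
 * wrap for a single key/IV pair.
 */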

int
CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					u8 c = *(in++);
					*(out++) = c ^ ctx->EKi.c[n];
					ctx->Xi.c[n] ^= c;
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				GHASH(ctx, in, GHASH_CHUNK);
				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				GHASH(ctx, in, i);
				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i) {
					size_t c = in_t[i];
					out_t[i] = c ^ ctx->EKi.t[i];
					ctx->Xi.t[i] ^= c;
				}
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					u8 c = in[n];
					ctx->Xi.c[n] ^= c;
					out[n] = c ^ ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		u8 c;
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		c = in[i];
		out[i] = c ^ ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);

int
CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		GHASH(ctx, out, GHASH_CHUNK);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		in += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx, out, i);
		out += i;
#else
		while (j--) {
			for (i = 0; i < 16; ++i)
				ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx, Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);
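
/*
 * The *_ctr32 entry points let a caller swap in a vectorized
 * counter-mode kernel (any ctr128_f, e.g. a hardware-accelerated AES
 * CTR routine) in place of the single-block function, so bulk data is
 * processed many blocks at a time between GHASH passes.
 */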

int
CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c ^ ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		GHASH(ctx, in, GHASH_CHUNK);
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

#if defined(GHASH)
		GHASH(ctx, in, i);
#else
		while (j--) {
			size_t k;
			for (k = 0; k < 16; ++k)
				ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx, Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		out += i;
		in += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);

int
CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    size_t len)
{
	u64 alen = ctx->len.u[0] << 3;
	u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= htobe64(alen);
	ctx->Xi.u[1] ^= htobe64(clen);
	GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len <= sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c, tag, len);
	else
		return -1;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_finish);
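
/*
 * Note that CRYPTO_gcm128_finish() compares tags with memcmp(), which
 * is not constant-time. A caller wanting a timing-safe check can fetch
 * the tag and compare it itself; a sketch, assuming OpenBSD's
 * timingsafe_memcmp(3):
 *
 *	unsigned char computed[16];
 *
 *	CRYPTO_gcm128_tag(ctx, computed, sizeof(computed));
 *	if (timingsafe_memcmp(computed, tag, sizeof(computed)) != 0)
 *		goto fail;
 */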

void
CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c,
	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_tag);

GCM128_CONTEXT *
CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret, key, block);

	return ret;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_new);

void
CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	freezero(ctx, sizeof(*ctx));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_release);
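
/*
 * End-to-end usage sketch (illustrative only; AES_set_encrypt_key() and
 * AES_encrypt() come from <openssl/aes.h>, everything else is defined
 * in this file):
 *
 *	AES_KEY aes;
 *	GCM128_CONTEXT gcm;
 *	unsigned char tag[16];
 *
 *	AES_set_encrypt_key(key, 128, &aes);
 *	CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(&gcm, iv, iv_len);
 *	CRYPTO_gcm128_aad(&gcm, aad, aad_len);
 *	CRYPTO_gcm128_encrypt(&gcm, plaintext, ciphertext, len);
 *	CRYPTO_gcm128_tag(&gcm, tag, sizeof(tag));
 *
 * CRYPTO_gcm128_aad() and CRYPTO_gcm128_encrypt() return -1 when the
 * SP 800-38D length limits are exceeded, so check their return values.
 * Decryption mirrors this with CRYPTO_gcm128_decrypt(), followed by
 * CRYPTO_gcm128_finish(&gcm, expected_tag, 16), which returns 0 only
 * when the tag verifies.
 */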
1359