1 /* $OpenBSD: gcm128.c,v 1.26 2023/08/10 07:18:43 jsing Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_local.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #endif
68 
69 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
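/*
 * REDUCE1BIT(V) multiplies V by x in GF(2^128) using GCM's bit-reflected
 * representation: V is shifted right by one bit and, if the bit that
 * falls off the low end (the coefficient of x^127) was set, the constant
 * 0xe1000000... is folded into the top of V.hi.  That constant encodes
 * the reduction x^128 = x^7 + x^2 + x + 1 in the reflected bit order.
 */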
70 #define REDUCE1BIT(V)							\
71 	do {								\
72 		if (sizeof(size_t)==8) {				\
73 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));	\
74 			V.lo  = (V.hi<<63)|(V.lo>>1);			\
75 			V.hi  = (V.hi>>1 )^T;				\
76 		} else {						\
77 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));	\
78 			V.lo  = (V.hi<<63)|(V.lo>>1);			\
79 			V.hi  = (V.hi>>1 )^((u64)T<<32);		\
80 		}							\
81 	} while(0)
82 
83 /*
84  * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
85  * never be set to 8; that value is effectively reserved for testing.
86  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
87  * to as "Shoup's" in the GCM specification, so OpenSSL does not cover
88  * the whole spectrum of possible table-driven implementations. Why not?
89  * In the non-"Shoup's" case the memory access pattern is segmented in
90  * such a way that cache-timing information can reveal a fair portion of
91  * the intermediate hash value. Given that the ciphertext is always
92  * available to an attacker, this may allow the secret parameter H to be
93  * deduced and, if that succeeds, messages to be tampered with [which is
94  * trivial in CTR mode]. In the "Shoup's" case this is not as easy, but
95  * there is no reason to believe it is resistant to cache-timing attacks
96  * either. The drawback of the "8-bit" implementation is that it consumes
97  * 16 (sixteen) times more memory, 4KB per individual key + 1KB shared.
98  * On the plus side it should be about twice as fast as the "4-bit"
99  * version; for gcc-generated x86[_64] code the "8-bit" version was
100  * observed to run ~75% faster, closer to 100% with commercial
101  * compilers. The "4-bit" procedure is nevertheless preferred, because
102  * it is believed to provide a better security/performance balance and
103  * adequate all-round performance. "All-round" refers to things like:
104  *
105  * - shorter setup time, which effectively improves overall timing for
106  *   handling short messages;
107  * - smaller table allocation, since a large allocation can become
108  *   unbearable because of VM subsystem penalties (for example on
109  *   Windows a large enough free() results in VM working set trimming,
110  *   so a subsequent malloc() immediately incurs working set expansion);
111  * - smaller cache footprint, since a larger table can affect the
112  *   performance of other code paths (not necessarily even in the same
113  *   thread on Hyper-Threaded CPUs);
114  *
115  * A value of 1 is not appropriate for performance reasons.
116  */
117 #if	TABLE_BITS==8
118 
119 static void
120 gcm_init_8bit(u128 Htable[256], u64 H[2])
121 {
122 	int  i, j;
123 	u128 V;
124 
125 	Htable[0].hi = 0;
126 	Htable[0].lo = 0;
127 	V.hi = H[0];
128 	V.lo = H[1];
129 
130 	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
131 		REDUCE1BIT(V);
132 		Htable[i] = V;
133 	}
134 
135 	for (i = 2; i < 256; i <<= 1) {
136 		u128 *Hi = Htable + i, H0 = *Hi;
137 		for (j = 1; j < i; ++j) {
138 			Hi[j].hi = H0.hi ^ Htable[j].hi;
139 			Hi[j].lo = H0.lo ^ Htable[j].lo;
140 		}
141 	}
142 }
143 
144 static void
145 gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147 	u128 Z = { 0, 0};
148 	const u8 *xi = (const u8 *)Xi + 15;
149 	size_t rem, n = *xi;
150 	static const size_t rem_8bit[256] = {
151 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215 
216 	while (1) {
217 		Z.hi ^= Htable[n].hi;
218 		Z.lo ^= Htable[n].lo;
219 
220 		if ((u8 *)Xi == xi)
221 			break;
222 
223 		n = *(--xi);
224 
225 		rem = (size_t)Z.lo & 0xff;
226 		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
227 		Z.hi = (Z.hi >> 8);
228 #if SIZE_MAX == 0xffffffffffffffff
229 		Z.hi ^= rem_8bit[rem];
230 #else
231 		Z.hi ^= (u64)rem_8bit[rem] << 32;
232 #endif
233 	}
234 
235 	Xi[0] = htobe64(Z.hi);
236 	Xi[1] = htobe64(Z.lo);
237 }
238 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
239 
240 #elif	TABLE_BITS==4
241 
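/*
 * Shoup's 4-bit table: Htable[i] holds the product of H and the 4-bit
 * polynomial whose bits are given by the index i, so that
 * gcm_gmult_4bit/gcm_ghash_4bit can process Xi one nibble at a time
 * with a single table lookup per nibble.
 */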
242 static void
243 gcm_init_4bit(u128 Htable[16], u64 H[2])
244 {
245 	u128 V;
246 #if defined(OPENSSL_SMALL_FOOTPRINT)
247 	int  i;
248 #endif
249 
250 	Htable[0].hi = 0;
251 	Htable[0].lo = 0;
252 	V.hi = H[0];
253 	V.lo = H[1];
254 
255 #if defined(OPENSSL_SMALL_FOOTPRINT)
256 	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
257 		REDUCE1BIT(V);
258 		Htable[i] = V;
259 	}
260 
261 	for (i = 2; i < 16; i <<= 1) {
262 		u128 *Hi = Htable + i;
263 		int   j;
264 		for (V = *Hi, j = 1; j < i; ++j) {
265 			Hi[j].hi = V.hi ^ Htable[j].hi;
266 			Hi[j].lo = V.lo ^ Htable[j].lo;
267 		}
268 	}
269 #else
270 	Htable[8] = V;
271 	REDUCE1BIT(V);
272 	Htable[4] = V;
273 	REDUCE1BIT(V);
274 	Htable[2] = V;
275 	REDUCE1BIT(V);
276 	Htable[1] = V;
277 	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
278 	V = Htable[4];
279 	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
280 	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
281 	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
282 	V = Htable[8];
283 	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
284 	Htable[10].hi = V.hi ^ Htable[2].hi,
285 	    Htable[10].lo = V.lo ^ Htable[2].lo;
286 	Htable[11].hi = V.hi ^ Htable[3].hi,
287 	    Htable[11].lo = V.lo ^ Htable[3].lo;
288 	Htable[12].hi = V.hi ^ Htable[4].hi,
289 	    Htable[12].lo = V.lo ^ Htable[4].lo;
290 	Htable[13].hi = V.hi ^ Htable[5].hi,
291 	    Htable[13].lo = V.lo ^ Htable[5].lo;
292 	Htable[14].hi = V.hi ^ Htable[6].hi,
293 	    Htable[14].lo = V.lo ^ Htable[6].lo;
294 	Htable[15].hi = V.hi ^ Htable[7].hi,
295 	    Htable[15].lo = V.lo ^ Htable[7].lo;
296 #endif
297 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
298 	/*
299 	 * ARM assembler expects specific dword order in Htable.
300 	 */
301 	{
302 		int j;
303 #if BYTE_ORDER == LITTLE_ENDIAN
304 		for (j = 0; j < 16; ++j) {
305 			V = Htable[j];
306 			Htable[j].hi = V.lo;
307 			Htable[j].lo = V.hi;
308 		}
309 #else /* BIG_ENDIAN */
310 		for (j = 0; j < 16; ++j) {
311 			V = Htable[j];
312 			Htable[j].hi = V.lo << 32|V.lo >> 32;
313 			Htable[j].lo = V.hi << 32|V.hi >> 32;
314 		}
315 #endif
316 	}
317 #endif
318 }
319 
320 #ifndef GHASH_ASM
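/*
 * rem_4bit[i] is the reduction contribution of the four bits that fall
 * off the low end of Z during each 4-bit shift; PACK() positions the
 * 16-bit value at the top of a size_t so it can be XORed straight into
 * Z.hi (after being shifted into place on 32-bit platforms).
 */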
321 static const size_t rem_4bit[16] = {
322 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
323 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
324 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
325 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
326 
327 static void
328 gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
329 {
330 	u128 Z;
331 	int cnt = 15;
332 	size_t rem, nlo, nhi;
333 
334 	nlo = ((const u8 *)Xi)[15];
335 	nhi = nlo >> 4;
336 	nlo &= 0xf;
337 
338 	Z.hi = Htable[nlo].hi;
339 	Z.lo = Htable[nlo].lo;
340 
341 	while (1) {
342 		rem = (size_t)Z.lo & 0xf;
343 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
344 		Z.hi = (Z.hi >> 4);
345 #if SIZE_MAX == 0xffffffffffffffff
346 		Z.hi ^= rem_4bit[rem];
347 #else
348 		Z.hi ^= (u64)rem_4bit[rem] << 32;
349 #endif
350 		Z.hi ^= Htable[nhi].hi;
351 		Z.lo ^= Htable[nhi].lo;
352 
353 		if (--cnt < 0)
354 			break;
355 
356 		nlo = ((const u8 *)Xi)[cnt];
357 		nhi = nlo >> 4;
358 		nlo &= 0xf;
359 
360 		rem = (size_t)Z.lo & 0xf;
361 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
362 		Z.hi = (Z.hi >> 4);
363 #if SIZE_MAX == 0xffffffffffffffff
364 		Z.hi ^= rem_4bit[rem];
365 #else
366 		Z.hi ^= (u64)rem_4bit[rem] << 32;
367 #endif
368 		Z.hi ^= Htable[nlo].hi;
369 		Z.lo ^= Htable[nlo].lo;
370 	}
371 
372 	Xi[0] = htobe64(Z.hi);
373 	Xi[1] = htobe64(Z.lo);
374 }
375 
376 #if !defined(OPENSSL_SMALL_FOOTPRINT)
377 /*
378  * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
379  * for details... Compiler-generated code doesn't seem to give any
380  * performance improvement, at least not on x86[_64]. It's here mostly
381  * as a reference and a placeholder for possible future non-trivial
382  * optimizations...
383  */
384 static void
385 gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
386     const u8 *inp, size_t len)
387 {
388 	u128 Z;
389 	int cnt;
390 	size_t rem, nlo, nhi;
391 
392 #if 1
393 	do {
394 		cnt = 15;
395 		nlo = ((const u8 *)Xi)[15];
396 		nlo ^= inp[15];
397 		nhi = nlo >> 4;
398 		nlo &= 0xf;
399 
400 		Z.hi = Htable[nlo].hi;
401 		Z.lo = Htable[nlo].lo;
402 
403 		while (1) {
404 			rem = (size_t)Z.lo & 0xf;
405 			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
406 			Z.hi = (Z.hi >> 4);
407 #if SIZE_MAX == 0xffffffffffffffff
408 			Z.hi ^= rem_4bit[rem];
409 #else
410 			Z.hi ^= (u64)rem_4bit[rem] << 32;
411 #endif
412 			Z.hi ^= Htable[nhi].hi;
413 			Z.lo ^= Htable[nhi].lo;
414 
415 			if (--cnt < 0)
416 				break;
417 
418 			nlo = ((const u8 *)Xi)[cnt];
419 			nlo ^= inp[cnt];
420 			nhi = nlo >> 4;
421 			nlo &= 0xf;
422 
423 			rem = (size_t)Z.lo & 0xf;
424 			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
425 			Z.hi = (Z.hi >> 4);
426 #if SIZE_MAX == 0xffffffffffffffff
427 			Z.hi ^= rem_4bit[rem];
428 #else
429 			Z.hi ^= (u64)rem_4bit[rem] << 32;
430 #endif
431 			Z.hi ^= Htable[nlo].hi;
432 			Z.lo ^= Htable[nlo].lo;
433 		}
434 #else
435     /*
436      * Extra 256+16 bytes per key plus 512 bytes of shared tables
437      * [should] give a ~50% improvement... One could have PACK()-ed
438      * rem_8bit even here, but the priority is to minimize the cache
439      * footprint...
440      */
441 	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
442 	u8 Hshl4[16];	/* Htable shifted left  by 4 bits */
443 	static const unsigned short rem_8bit[256] = {
444 		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
445 		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
446 		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
447 		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
448 		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
449 		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
450 		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
451 		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
452 		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
453 		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
454 		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
455 		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
456 		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
457 		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
458 		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
459 		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
460 		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
461 		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
462 		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
463 		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
464 		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
465 		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
466 		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
467 		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
468 		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
469 		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
470 		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
471 		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
472 		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
473 		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
474 		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
475 		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
476     /*
477      * This pre-processing phase slows the procedure down by roughly the
478      * same time as it makes each loop iteration faster. In other words,
479      * single-block performance is about the same as the straightforward
480      * "4-bit" implementation, and longer inputs only get faster...
481      */
482 	for (cnt = 0; cnt < 16; ++cnt) {
483 		Z.hi = Htable[cnt].hi;
484 		Z.lo = Htable[cnt].lo;
485 		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
486 		Hshr4[cnt].hi = (Z.hi >> 4);
487 		Hshl4[cnt] = (u8)(Z.lo << 4);
488 	}
489 
490 	do {
491 		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
492 			nlo = ((const u8 *)Xi)[cnt];
493 			nlo ^= inp[cnt];
494 			nhi = nlo >> 4;
495 			nlo &= 0xf;
496 
497 			Z.hi ^= Htable[nlo].hi;
498 			Z.lo ^= Htable[nlo].lo;
499 
500 			rem = (size_t)Z.lo & 0xff;
501 
502 			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
503 			Z.hi = (Z.hi >> 8);
504 
505 			Z.hi ^= Hshr4[nhi].hi;
506 			Z.lo ^= Hshr4[nhi].lo;
507 			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
508 		}
509 
510 		nlo = ((const u8 *)Xi)[0];
511 		nlo ^= inp[0];
512 		nhi = nlo >> 4;
513 		nlo &= 0xf;
514 
515 		Z.hi ^= Htable[nlo].hi;
516 		Z.lo ^= Htable[nlo].lo;
517 
518 		rem = (size_t)Z.lo & 0xf;
519 
520 		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
521 		Z.hi = (Z.hi >> 4);
522 
523 		Z.hi ^= Htable[nhi].hi;
524 		Z.lo ^= Htable[nhi].lo;
525 		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
526 #endif
527 
528 		Xi[0] = htobe64(Z.hi);
529 		Xi[1] = htobe64(Z.lo);
530 	} while (inp += 16, len -= 16);
531 }
532 #endif
533 #else
534 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
535 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
536     size_t len);
537 #endif
538 
539 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
540 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
541 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
542 /* GHASH_CHUNK is a "stride" parameter meant to mitigate the cache
543  * thrashing effect. In other words the idea is to hash data while it's
544  * still in the L1 cache after the encryption pass... */
545 #define GHASH_CHUNK       (3*1024)
546 #endif
547 
548 #else	/* TABLE_BITS */
549 
550 static void
551 gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
552 {
553 	u128 V, Z = { 0,0 };
554 	long X;
555 	int i, j;
556 	const long *xi = (const long *)Xi;
557 
558 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
559 	V.lo = H[1];
560 
561 	for (j = 0; j < 16/sizeof(long); ++j) {
562 #if BYTE_ORDER == LITTLE_ENDIAN
563 #if SIZE_MAX == 0xffffffffffffffff
564 #ifdef BSWAP8
565 		X = (long)(BSWAP8(xi[j]));
566 #else
567 		const u8 *p = (const u8 *)(xi + j);
568 		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
569 #endif
570 #else
571 		const u8 *p = (const u8 *)(xi + j);
572 		X = (long)GETU32(p);
573 #endif
574 #else /* BIG_ENDIAN */
575 		X = xi[j];
576 #endif
577 
578 		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
579 			u64 M = (u64)(X >> (8*sizeof(long) - 1));
580 			Z.hi ^= V.hi & M;
581 			Z.lo ^= V.lo & M;
582 
583 			REDUCE1BIT(V);
584 		}
585 	}
586 
587 	Xi[0] = htobe64(Z.hi);
588 	Xi[1] = htobe64(Z.lo);
589 }
590 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
591 
592 #endif
593 
594 #if	defined(GHASH_ASM) &&						\
595 	(defined(__i386)	|| defined(__i386__)	||		\
596 	 defined(__x86_64)	|| defined(__x86_64__)	||		\
597 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
598 #include "x86_arch.h"
599 #endif
600 
601 #if	TABLE_BITS==4 && defined(GHASH_ASM)
602 # if	(defined(__i386)	|| defined(__i386__)	||		\
603 	 defined(__x86_64)	|| defined(__x86_64__)	||		\
604 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
605 #  define GHASH_ASM_X86_OR_64
606 #  define GCM_FUNCREF_4BIT
607 
608 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
609 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
610 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
611     size_t len);
612 
613 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
614 #   define GHASH_ASM_X86
615 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
616 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
617     size_t len);
618 
619 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
620 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
621     size_t len);
622 #  endif
623 # elif defined(__arm__) || defined(__arm)
624 #  include "arm_arch.h"
625 #  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
626 #   define GHASH_ASM_ARM
627 #   define GCM_FUNCREF_4BIT
628 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
629 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
630     size_t len);
631 #  endif
632 # endif
633 #endif
634 
635 #ifdef GCM_FUNCREF_4BIT
636 # undef  GCM_MUL
637 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
638 # ifdef GHASH
639 #  undef  GHASH
640 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
641 # endif
642 #endif
643 
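/*
 * CRYPTO_gcm128_init() derives the hash key H = E_K(0^128) with the
 * supplied block cipher and precomputes the GHASH tables, selecting an
 * assembler implementation (e.g. PCLMULQDQ or NEON) when the CPU
 * supports one.  The block argument is any 128-bit block cipher in
 * encrypt direction; a typical (illustrative) choice is AES, passing an
 * AES_KEY schedule as key and AES_encrypt cast to block128_f.
 */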
644 void
645 CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
646 {
647 	memset(ctx, 0, sizeof(*ctx));
648 	ctx->block = block;
649 	ctx->key = key;
650 
651 	(*block)(ctx->H.c, ctx->H.c, key);
652 
653 	/* H is stored in host byte order */
654 	ctx->H.u[0] = be64toh(ctx->H.u[0]);
655 	ctx->H.u[1] = be64toh(ctx->H.u[1]);
656 
657 #if	TABLE_BITS==8
658 	gcm_init_8bit(ctx->Htable, ctx->H.u);
659 #elif	TABLE_BITS==4
660 # if	defined(GHASH_ASM_X86_OR_64)
661 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
662 	/* check FXSR and PCLMULQDQ bits */
663 	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
664 	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
665 		gcm_init_clmul(ctx->Htable, ctx->H.u);
666 		ctx->gmult = gcm_gmult_clmul;
667 		ctx->ghash = gcm_ghash_clmul;
668 		return;
669 	}
670 #  endif
671 	gcm_init_4bit(ctx->Htable, ctx->H.u);
672 #  if	defined(GHASH_ASM_X86)			/* x86 only */
673 #   if	defined(OPENSSL_IA32_SSE2)
674 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
675 #   else
676 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
677 #   endif
678 		ctx->gmult = gcm_gmult_4bit_mmx;
679 		ctx->ghash = gcm_ghash_4bit_mmx;
680 	} else {
681 		ctx->gmult = gcm_gmult_4bit_x86;
682 		ctx->ghash = gcm_ghash_4bit_x86;
683 	}
684 #  else
685 	ctx->gmult = gcm_gmult_4bit;
686 	ctx->ghash = gcm_ghash_4bit;
687 #  endif
688 # elif	defined(GHASH_ASM_ARM)
689 	if (OPENSSL_armcap_P & ARMV7_NEON) {
690 		ctx->gmult = gcm_gmult_neon;
691 		ctx->ghash = gcm_ghash_neon;
692 	} else {
693 		gcm_init_4bit(ctx->Htable, ctx->H.u);
694 		ctx->gmult = gcm_gmult_4bit;
695 		ctx->ghash = gcm_ghash_4bit;
696 	}
697 # else
698 	gcm_init_4bit(ctx->Htable, ctx->H.u);
699 # endif
700 #endif
701 }
702 LCRYPTO_ALIAS(CRYPTO_gcm128_init);
703 
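/*
 * CRYPTO_gcm128_setiv() computes the pre-counter block Y0.  For the
 * recommended 96-bit IV this is simply IV || 0^31 || 1; for any other
 * length Y0 = GHASH(IV padded to a 16-byte boundary || 0^64 ||
 * [len(IV) in bits]_64), as specified in NIST SP 800-38D.  It is
 * expected to be called after init and before any AAD or payload is
 * processed.
 */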
704 void
705 CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
706 {
707 	unsigned int ctr;
708 #ifdef GCM_FUNCREF_4BIT
709 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
710 #endif
711 
712 	ctx->Yi.u[0] = 0;
713 	ctx->Yi.u[1] = 0;
714 	ctx->Xi.u[0] = 0;
715 	ctx->Xi.u[1] = 0;
716 	ctx->len.u[0] = 0;	/* AAD length */
717 	ctx->len.u[1] = 0;	/* message length */
718 	ctx->ares = 0;
719 	ctx->mres = 0;
720 
721 	if (len == 12) {
722 		memcpy(ctx->Yi.c, iv, 12);
723 		ctx->Yi.c[15] = 1;
724 		ctr = 1;
725 	} else {
726 		size_t i;
727 		u64 len0 = len;
728 
729 		while (len >= 16) {
730 			for (i = 0; i < 16; ++i)
731 				ctx->Yi.c[i] ^= iv[i];
732 			GCM_MUL(ctx, Yi);
733 			iv += 16;
734 			len -= 16;
735 		}
736 		if (len) {
737 			for (i = 0; i < len; ++i)
738 				ctx->Yi.c[i] ^= iv[i];
739 			GCM_MUL(ctx, Yi);
740 		}
741 		len0 <<= 3;
742 		ctx->Yi.u[1] ^= htobe64(len0);
743 
744 		GCM_MUL(ctx, Yi);
745 
746 		ctr = be32toh(ctx->Yi.d[3]);
747 	}
748 
749 	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
750 	++ctr;
751 	ctx->Yi.d[3] = htobe32(ctr);
752 }
753 LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
754 
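/*
 * CRYPTO_gcm128_aad() feeds additional authenticated data into GHASH.
 * All AAD must be supplied before the first encrypt/decrypt call: the
 * function returns -2 once payload has been processed, and -1 if the
 * total AAD length exceeds 2^61 bytes.
 */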
755 int
756 CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
757 {
758 	size_t i;
759 	unsigned int n;
760 	u64 alen = ctx->len.u[0];
761 #ifdef GCM_FUNCREF_4BIT
762 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
763 # ifdef GHASH
764 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
765 	    const u8 *inp, size_t len) = ctx->ghash;
766 # endif
767 #endif
768 
769 	if (ctx->len.u[1])
770 		return -2;
771 
772 	alen += len;
773 	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
774 		return -1;
775 	ctx->len.u[0] = alen;
776 
777 	n = ctx->ares;
778 	if (n) {
779 		while (n && len) {
780 			ctx->Xi.c[n] ^= *(aad++);
781 			--len;
782 			n = (n + 1) % 16;
783 		}
784 		if (n == 0)
785 			GCM_MUL(ctx, Xi);
786 		else {
787 			ctx->ares = n;
788 			return 0;
789 		}
790 	}
791 
792 #ifdef GHASH
793 	if ((i = (len & (size_t)-16))) {
794 		GHASH(ctx, aad, i);
795 		aad += i;
796 		len -= i;
797 	}
798 #else
799 	while (len >= 16) {
800 		for (i = 0; i < 16; ++i)
801 			ctx->Xi.c[i] ^= aad[i];
802 		GCM_MUL(ctx, Xi);
803 		aad += 16;
804 		len -= 16;
805 	}
806 #endif
807 	if (len) {
808 		n = (unsigned int)len;
809 		for (i = 0; i < len; ++i)
810 			ctx->Xi.c[i] ^= aad[i];
811 	}
812 
813 	ctx->ares = n;
814 	return 0;
815 }
816 LCRYPTO_ALIAS(CRYPTO_gcm128_aad);
817 
818 int
819 CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
820     const unsigned char *in, unsigned char *out,
821     size_t len)
822 {
823 	unsigned int n, ctr;
824 	size_t i;
825 	u64 mlen = ctx->len.u[1];
826 	block128_f block = ctx->block;
827 	void *key = ctx->key;
828 #ifdef GCM_FUNCREF_4BIT
829 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
830 # ifdef GHASH
831 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
832 	    const u8 *inp, size_t len) = ctx->ghash;
833 # endif
834 #endif
835 
836 	mlen += len;
837 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
838 		return -1;
839 	ctx->len.u[1] = mlen;
840 
841 	if (ctx->ares) {
842 		/* First call to encrypt finalizes GHASH(AAD) */
843 		GCM_MUL(ctx, Xi);
844 		ctx->ares = 0;
845 	}
846 
847 	ctr = be32toh(ctx->Yi.d[3]);
848 
849 	n = ctx->mres;
850 #if !defined(OPENSSL_SMALL_FOOTPRINT)
851 	if (16 % sizeof(size_t) == 0)
852 		do {	/* always true actually */
853 			if (n) {
854 				while (n && len) {
855 					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
856 					    ctx->EKi.c[n];
857 					--len;
858 					n = (n + 1) % 16;
859 				}
860 				if (n == 0)
861 					GCM_MUL(ctx, Xi);
862 				else {
863 					ctx->mres = n;
864 					return 0;
865 				}
866 			}
867 #ifdef __STRICT_ALIGNMENT
868 			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
869 				break;
870 #endif
871 #if defined(GHASH) && defined(GHASH_CHUNK)
872 			while (len >= GHASH_CHUNK) {
873 				size_t j = GHASH_CHUNK;
874 
875 				while (j) {
876 					size_t *out_t = (size_t *)out;
877 					const size_t *in_t = (const size_t *)in;
878 
879 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
880 					++ctr;
881 					ctx->Yi.d[3] = htobe32(ctr);
882 
883 					for (i = 0; i < 16/sizeof(size_t); ++i)
884 						out_t[i] = in_t[i] ^
885 						    ctx->EKi.t[i];
886 					out += 16;
887 					in += 16;
888 					j -= 16;
889 				}
890 				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
891 				len -= GHASH_CHUNK;
892 			}
893 			if ((i = (len & (size_t)-16))) {
894 				size_t j = i;
895 
896 				while (len >= 16) {
897 					size_t *out_t = (size_t *)out;
898 					const size_t *in_t = (const size_t *)in;
899 
900 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
901 					++ctr;
902 					ctx->Yi.d[3] = htobe32(ctr);
903 
904 					for (i = 0; i < 16/sizeof(size_t); ++i)
905 						out_t[i] = in_t[i] ^
906 						    ctx->EKi.t[i];
907 					out += 16;
908 					in += 16;
909 					len -= 16;
910 				}
911 				GHASH(ctx, out - j, j);
912 			}
913 #else
914 			while (len >= 16) {
915 				size_t *out_t = (size_t *)out;
916 				const size_t *in_t = (const size_t *)in;
917 
918 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
919 				++ctr;
920 				ctx->Yi.d[3] = htobe32(ctr);
921 
922 				for (i = 0; i < 16/sizeof(size_t); ++i)
923 					ctx->Xi.t[i] ^=
924 					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
925 				GCM_MUL(ctx, Xi);
926 				out += 16;
927 				in += 16;
928 				len -= 16;
929 			}
930 #endif
931 			if (len) {
932 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
933 				++ctr;
934 				ctx->Yi.d[3] = htobe32(ctr);
935 
936 				while (len--) {
937 					ctx->Xi.c[n] ^= out[n] = in[n] ^
938 					    ctx->EKi.c[n];
939 					++n;
940 				}
941 			}
942 
943 			ctx->mres = n;
944 			return 0;
945 		} while (0);
946 #endif
947 	for (i = 0; i < len; ++i) {
948 		if (n == 0) {
949 			(*block)(ctx->Yi.c, ctx->EKi.c, key);
950 			++ctr;
951 			ctx->Yi.d[3] = htobe32(ctr);
952 		}
953 		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
954 		n = (n + 1) % 16;
955 		if (n == 0)
956 			GCM_MUL(ctx, Xi);
957 	}
958 
959 	ctx->mres = n;
960 	return 0;
961 }
962 LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);
963 
964 int
965 CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
966     const unsigned char *in, unsigned char *out,
967     size_t len)
968 {
969 	unsigned int n, ctr;
970 	size_t i;
971 	u64 mlen = ctx->len.u[1];
972 	block128_f block = ctx->block;
973 	void *key = ctx->key;
974 #ifdef GCM_FUNCREF_4BIT
975 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
976 # ifdef GHASH
977 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
978 	    const u8 *inp, size_t len) = ctx->ghash;
979 # endif
980 #endif
981 
982 	mlen += len;
983 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
984 		return -1;
985 	ctx->len.u[1] = mlen;
986 
987 	if (ctx->ares) {
988 		/* First call to decrypt finalizes GHASH(AAD) */
989 		GCM_MUL(ctx, Xi);
990 		ctx->ares = 0;
991 	}
992 
993 	ctr = be32toh(ctx->Yi.d[3]);
994 
995 	n = ctx->mres;
996 #if !defined(OPENSSL_SMALL_FOOTPRINT)
997 	if (16 % sizeof(size_t) == 0)
998 		do {	/* always true actually */
999 			if (n) {
1000 				while (n && len) {
1001 					u8 c = *(in++);
1002 					*(out++) = c ^ ctx->EKi.c[n];
1003 					ctx->Xi.c[n] ^= c;
1004 					--len;
1005 					n = (n + 1) % 16;
1006 				}
1007 				if (n == 0)
1008 					GCM_MUL(ctx, Xi);
1009 				else {
1010 					ctx->mres = n;
1011 					return 0;
1012 				}
1013 			}
1014 #ifdef __STRICT_ALIGNMENT
1015 			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
1016 				break;
1017 #endif
1018 #if defined(GHASH) && defined(GHASH_CHUNK)
1019 			while (len >= GHASH_CHUNK) {
1020 				size_t j = GHASH_CHUNK;
1021 
1022 				GHASH(ctx, in, GHASH_CHUNK);
1023 				while (j) {
1024 					size_t *out_t = (size_t *)out;
1025 					const size_t *in_t = (const size_t *)in;
1026 
1027 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1028 					++ctr;
1029 					ctx->Yi.d[3] = htobe32(ctr);
1030 
1031 					for (i = 0; i < 16/sizeof(size_t); ++i)
1032 						out_t[i] = in_t[i] ^
1033 						    ctx->EKi.t[i];
1034 					out += 16;
1035 					in += 16;
1036 					j -= 16;
1037 				}
1038 				len -= GHASH_CHUNK;
1039 			}
1040 			if ((i = (len & (size_t)-16))) {
1041 				GHASH(ctx, in, i);
1042 				while (len >= 16) {
1043 					size_t *out_t = (size_t *)out;
1044 					const size_t *in_t = (const size_t *)in;
1045 
1046 					(*block)(ctx->Yi.c, ctx->EKi.c, key);
1047 					++ctr;
1048 					ctx->Yi.d[3] = htobe32(ctr);
1049 
1050 					for (i = 0; i < 16/sizeof(size_t); ++i)
1051 						out_t[i] = in_t[i] ^
1052 						    ctx->EKi.t[i];
1053 					out += 16;
1054 					in += 16;
1055 					len -= 16;
1056 				}
1057 			}
1058 #else
1059 			while (len >= 16) {
1060 				size_t *out_t = (size_t *)out;
1061 				const size_t *in_t = (const size_t *)in;
1062 
1063 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1064 				++ctr;
1065 				ctx->Yi.d[3] = htobe32(ctr);
1066 
1067 				for (i = 0; i < 16/sizeof(size_t); ++i) {
1068 					size_t c = in[i];
1069 					out[i] = c ^ ctx->EKi.t[i];
1070 					ctx->Xi.t[i] ^= c;
1071 				}
1072 				GCM_MUL(ctx, Xi);
1073 				out += 16;
1074 				in += 16;
1075 				len -= 16;
1076 			}
1077 #endif
1078 			if (len) {
1079 				(*block)(ctx->Yi.c, ctx->EKi.c, key);
1080 				++ctr;
1081 				ctx->Yi.d[3] = htobe32(ctr);
1082 
1083 				while (len--) {
1084 					u8 c = in[n];
1085 					ctx->Xi.c[n] ^= c;
1086 					out[n] = c ^ ctx->EKi.c[n];
1087 					++n;
1088 				}
1089 			}
1090 
1091 			ctx->mres = n;
1092 			return 0;
1093 		} while (0);
1094 #endif
1095 	for (i = 0; i < len; ++i) {
1096 		u8 c;
1097 		if (n == 0) {
1098 			(*block)(ctx->Yi.c, ctx->EKi.c, key);
1099 			++ctr;
1100 			ctx->Yi.d[3] = htobe32(ctr);
1101 		}
1102 		c = in[i];
1103 		out[i] = c ^ ctx->EKi.c[n];
1104 		ctx->Xi.c[n] ^= c;
1105 		n = (n + 1) % 16;
1106 		if (n == 0)
1107 			GCM_MUL(ctx, Xi);
1108 	}
1109 
1110 	ctx->mres = n;
1111 	return 0;
1112 }
1113 LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);
1114 
1115 int
1116 CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1117     const unsigned char *in, unsigned char *out,
1118     size_t len, ctr128_f stream)
1119 {
1120 	unsigned int n, ctr;
1121 	size_t i;
1122 	u64 mlen = ctx->len.u[1];
1123 	void *key = ctx->key;
1124 #ifdef GCM_FUNCREF_4BIT
1125 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1126 # ifdef GHASH
1127 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1128 	    const u8 *inp, size_t len) = ctx->ghash;
1129 # endif
1130 #endif
1131 
1132 	mlen += len;
1133 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1134 		return -1;
1135 	ctx->len.u[1] = mlen;
1136 
1137 	if (ctx->ares) {
1138 		/* First call to encrypt finalizes GHASH(AAD) */
1139 		GCM_MUL(ctx, Xi);
1140 		ctx->ares = 0;
1141 	}
1142 
1143 	ctr = be32toh(ctx->Yi.d[3]);
1144 
1145 	n = ctx->mres;
1146 	if (n) {
1147 		while (n && len) {
1148 			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1149 			--len;
1150 			n = (n + 1) % 16;
1151 		}
1152 		if (n == 0)
1153 			GCM_MUL(ctx, Xi);
1154 		else {
1155 			ctx->mres = n;
1156 			return 0;
1157 		}
1158 	}
1159 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1160 	while (len >= GHASH_CHUNK) {
1161 		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1162 		ctr += GHASH_CHUNK/16;
1163 		ctx->Yi.d[3] = htobe32(ctr);
1164 		GHASH(ctx, out, GHASH_CHUNK);
1165 		out += GHASH_CHUNK;
1166 		in += GHASH_CHUNK;
1167 		len -= GHASH_CHUNK;
1168 	}
1169 #endif
1170 	if ((i = (len & (size_t)-16))) {
1171 		size_t j = i/16;
1172 
1173 		(*stream)(in, out, j, key, ctx->Yi.c);
1174 		ctr += (unsigned int)j;
1175 		ctx->Yi.d[3] = htobe32(ctr);
1176 		in += i;
1177 		len -= i;
1178 #if defined(GHASH)
1179 		GHASH(ctx, out, i);
1180 		out += i;
1181 #else
1182 		while (j--) {
1183 			for (i = 0; i < 16; ++i)
1184 				ctx->Xi.c[i] ^= out[i];
1185 			GCM_MUL(ctx, Xi);
1186 			out += 16;
1187 		}
1188 #endif
1189 	}
1190 	if (len) {
1191 		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1192 		++ctr;
1193 		ctx->Yi.d[3] = htobe32(ctr);
1194 		while (len--) {
1195 			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1196 			++n;
1197 		}
1198 	}
1199 
1200 	ctx->mres = n;
1201 	return 0;
1202 }
1203 LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);
1204 
1205 int
1206 CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1207     const unsigned char *in, unsigned char *out,
1208     size_t len, ctr128_f stream)
1209 {
1210 	unsigned int n, ctr;
1211 	size_t i;
1212 	u64 mlen = ctx->len.u[1];
1213 	void *key = ctx->key;
1214 #ifdef GCM_FUNCREF_4BIT
1215 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1216 # ifdef GHASH
1217 	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
1218 	    const u8 *inp, size_t len) = ctx->ghash;
1219 # endif
1220 #endif
1221 
1222 	mlen += len;
1223 	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1224 		return -1;
1225 	ctx->len.u[1] = mlen;
1226 
1227 	if (ctx->ares) {
1228 		/* First call to decrypt finalizes GHASH(AAD) */
1229 		GCM_MUL(ctx, Xi);
1230 		ctx->ares = 0;
1231 	}
1232 
1233 	ctr = be32toh(ctx->Yi.d[3]);
1234 
1235 	n = ctx->mres;
1236 	if (n) {
1237 		while (n && len) {
1238 			u8 c = *(in++);
1239 			*(out++) = c ^ ctx->EKi.c[n];
1240 			ctx->Xi.c[n] ^= c;
1241 			--len;
1242 			n = (n + 1) % 16;
1243 		}
1244 		if (n == 0)
1245 			GCM_MUL(ctx, Xi);
1246 		else {
1247 			ctx->mres = n;
1248 			return 0;
1249 		}
1250 	}
1251 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1252 	while (len >= GHASH_CHUNK) {
1253 		GHASH(ctx, in, GHASH_CHUNK);
1254 		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
1255 		ctr += GHASH_CHUNK/16;
1256 		ctx->Yi.d[3] = htobe32(ctr);
1257 		out += GHASH_CHUNK;
1258 		in += GHASH_CHUNK;
1259 		len -= GHASH_CHUNK;
1260 	}
1261 #endif
1262 	if ((i = (len & (size_t)-16))) {
1263 		size_t j = i/16;
1264 
1265 #if defined(GHASH)
1266 		GHASH(ctx, in, i);
1267 #else
1268 		while (j--) {
1269 			size_t k;
1270 			for (k = 0; k < 16; ++k)
1271 				ctx->Xi.c[k] ^= in[k];
1272 			GCM_MUL(ctx, Xi);
1273 			in += 16;
1274 		}
1275 		j = i/16;
1276 		in -= i;
1277 #endif
1278 		(*stream)(in, out, j, key, ctx->Yi.c);
1279 		ctr += (unsigned int)j;
1280 		ctx->Yi.d[3] = htobe32(ctr);
1281 		out += i;
1282 		in += i;
1283 		len -= i;
1284 	}
1285 	if (len) {
1286 		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
1287 		++ctr;
1288 		ctx->Yi.d[3] = htobe32(ctr);
1289 		while (len--) {
1290 			u8 c = in[n];
1291 			ctx->Xi.c[n] ^= c;
1292 			out[n] = c ^ ctx->EKi.c[n];
1293 			++n;
1294 		}
1295 	}
1296 
1297 	ctx->mres = n;
1298 	return 0;
1299 }
1300 LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);
1301 
1302 int
1303 CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1304     size_t len)
1305 {
1306 	u64 alen = ctx->len.u[0] << 3;
1307 	u64 clen = ctx->len.u[1] << 3;
1308 #ifdef GCM_FUNCREF_4BIT
1309 	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1310 #endif
1311 
1312 	if (ctx->mres || ctx->ares)
1313 		GCM_MUL(ctx, Xi);
1314 
1315 	ctx->Xi.u[0] ^= htobe64(alen);
1316 	ctx->Xi.u[1] ^= htobe64(clen);
1317 	GCM_MUL(ctx, Xi);
1318 
1319 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1320 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1321 
1322 	if (tag && len <= sizeof(ctx->Xi))
1323 		return memcmp(ctx->Xi.c, tag, len);
1324 	else
1325 		return -1;
1326 }
1327 LCRYPTO_ALIAS(CRYPTO_gcm128_finish);
1328 
1329 void
1330 CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1331 {
1332 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1333 	memcpy(tag, ctx->Xi.c,
1334 	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1335 }
1336 LCRYPTO_ALIAS(CRYPTO_gcm128_tag);
1337 
1338 GCM128_CONTEXT *
1339 CRYPTO_gcm128_new(void *key, block128_f block)
1340 {
1341 	GCM128_CONTEXT *ret;
1342 
1343 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1344 		CRYPTO_gcm128_init(ret, key, block);
1345 
1346 	return ret;
1347 }
1348 LCRYPTO_ALIAS(CRYPTO_gcm128_new);
1349 
1350 void
1351 CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1352 {
1353 	freezero(ctx, sizeof(*ctx));
1354 }
1355 LCRYPTO_ALIAS(CRYPTO_gcm128_release);
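/*
 * Illustrative usage sketch (not part of this file): AES-GCM encryption of
 * a single message with the functions above.  The AES names used here
 * (<openssl/aes.h>, AES_KEY, AES_set_encrypt_key, AES_encrypt) are assumed
 * purely for illustration; any 128-bit block cipher in encrypt direction
 * can be supplied as the block128_f.
 *
 *	#include <openssl/aes.h>
 *	#include <openssl/modes.h>
 *
 *	int
 *	gcm_seal_example(const unsigned char key[16],
 *	    const unsigned char iv[12], const unsigned char *aad,
 *	    size_t aad_len, const unsigned char *pt, size_t pt_len,
 *	    unsigned char *ct, unsigned char tag[16])
 *	{
 *		AES_KEY ks;
 *		GCM128_CONTEXT *gcm;
 *		int ret = -1;
 *
 *		if (AES_set_encrypt_key(key, 128, &ks) != 0)
 *			return -1;
 *		if ((gcm = CRYPTO_gcm128_new(&ks,
 *		    (block128_f)AES_encrypt)) == NULL)
 *			return -1;
 *		CRYPTO_gcm128_setiv(gcm, iv, 12);
 *		if (CRYPTO_gcm128_aad(gcm, aad, aad_len) != 0)
 *			goto done;
 *		if (CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len) != 0)
 *			goto done;
 *		CRYPTO_gcm128_tag(gcm, tag, 16);
 *		ret = 0;
 *	 done:
 *		CRYPTO_gcm128_release(gcm);
 *		return ret;
 *	}
 *
 * Decryption follows the same pattern with CRYPTO_gcm128_decrypt() and a
 * final CRYPTO_gcm128_finish(gcm, tag, 16), which returns 0 only when the
 * computed tag matches the supplied one.
 */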
1356