1 /* $OpenBSD: gcm128.c,v 1.12 2015/02/10 09:46:30 miod Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #undef	PUTU32
68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69 #endif
70 
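/*
 * PACK() places a 16-bit reduction constant in the top 16 bits of a
 * size_t, matching the position it is folded back in below.  REDUCE1BIT()
 * multiplies V by x in GF(2^128): with GCM's bit-reflected element
 * representation this is a one-bit right shift of the 128-bit value,
 * XOR-ing in the reduction constant 0xE1000000...0 whenever the
 * coefficient of x^127 (the bit shifted out) is set.
 */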
71 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)	\
73 	do { \
74 		if (sizeof(size_t)==8) { \
75 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 			V.lo  = (V.hi<<63)|(V.lo>>1); \
77 			V.hi  = (V.hi>>1 )^T; \
78 		} else { \
79 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 			V.lo  = (V.hi<<63)|(V.lo>>1); \
81 			V.hi  = (V.hi>>1 )^((u64)T<<32); \
82 		} \
83 	} while(0)
84 
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8; 8 is effectively reserved for testing
 * purposes.  TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification.
 * In other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations.  Why?  In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it is
 * trivial to see that cache timing information can reveal a fair
 * portion of the intermediate hash value.  Given that the ciphertext is
 * always available to an attacker, it is possible for him to attempt to
 * deduce the secret parameter H and, if successful, tamper with
 * messages [which is trivial in CTR mode].  In the "Shoup's" case this
 * is not as easy, but there is no reason to believe that it is
 * resistant to cache-timing attacks either.  The thing about the
 * "8-bit" implementation is that it consumes 16 (sixteen) times more
 * memory, 4KB per individual key + 1KB shared.  On the plus side it
 * should be about twice as fast as the "4-bit" version, and for
 * gcc-generated x86[_64] code the "8-bit" version was observed to run
 * ~75% faster, closer to 100% for commercial compilers...  Yet the
 * "4-bit" procedure is preferred, because it is believed to provide a
 * better security-performance balance and adequate all-round
 * performance.  "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - a larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
119 #if	TABLE_BITS==8
120 
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123 	int  i, j;
124 	u128 V;
125 
126 	Htable[0].hi = 0;
127 	Htable[0].lo = 0;
128 	V.hi = H[0];
129 	V.lo = H[1];
130 
131 	for (Htable[128]=V, i=64; i>0; i>>=1) {
132 		REDUCE1BIT(V);
133 		Htable[i] = V;
134 	}
135 
136 	for (i=2; i<256; i<<=1) {
137 		u128 *Hi = Htable+i, H0 = *Hi;
138 		for (j=1; j<i; ++j) {
139 			Hi[j].hi = H0.hi^Htable[j].hi;
140 			Hi[j].lo = H0.lo^Htable[j].lo;
141 		}
142 	}
143 }
144 
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147 	u128 Z = { 0, 0};
148 	const u8 *xi = (const u8 *)Xi+15;
149 	size_t rem, n = *xi;
150 	static const size_t rem_8bit[256] = {
151 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215 
216 	while (1) {
217 		Z.hi ^= Htable[n].hi;
218 		Z.lo ^= Htable[n].lo;
219 
220 		if ((u8 *)Xi==xi)	break;
221 
222 		n = *(--xi);
223 
224 		rem  = (size_t)Z.lo&0xff;
225 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 		Z.hi = (Z.hi>>8);
227 		if (sizeof(size_t)==8)
228 			Z.hi ^= rem_8bit[rem];
229 		else
230 			Z.hi ^= (u64)rem_8bit[rem]<<32;
231 	}
232 
233 	if (BYTE_ORDER == LITTLE_ENDIAN) {
234 #ifdef BSWAP8
235 		Xi[0] = BSWAP8(Z.hi);
236 		Xi[1] = BSWAP8(Z.lo);
237 #else
238 		u8 *p = (u8 *)Xi;
239 		u32 v;
240 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
241 		v = (u32)(Z.hi);	PUTU32(p+4,v);
242 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
243 		v = (u32)(Z.lo);	PUTU32(p+12,v);
244 #endif
245 	}
246 	else {
247 		Xi[0] = Z.hi;
248 		Xi[1] = Z.lo;
249 	}
250 }
251 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252 
253 #elif	TABLE_BITS==4
254 
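/*
 * The 4-bit table caches n*H for every 4-bit value n, using GCM's
 * reflected bit order: Htable[8] = H, Htable[4] = H*x, Htable[2] = H*x^2,
 * Htable[1] = H*x^3, and Htable[a|b] = Htable[a] ^ Htable[b].  This lets
 * gcm_gmult_4bit()/gcm_ghash_4bit() consume Xi one nibble at a time with
 * table lookups, folding the bits shifted out back in via rem_4bit[].
 */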
255 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256 {
257 	u128 V;
258 #if defined(OPENSSL_SMALL_FOOTPRINT)
259 	int  i;
260 #endif
261 
262 	Htable[0].hi = 0;
263 	Htable[0].lo = 0;
264 	V.hi = H[0];
265 	V.lo = H[1];
266 
267 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 	for (Htable[8]=V, i=4; i>0; i>>=1) {
269 		REDUCE1BIT(V);
270 		Htable[i] = V;
271 	}
272 
273 	for (i=2; i<16; i<<=1) {
274 		u128 *Hi = Htable+i;
275 		int   j;
276 		for (V=*Hi, j=1; j<i; ++j) {
277 			Hi[j].hi = V.hi^Htable[j].hi;
278 			Hi[j].lo = V.lo^Htable[j].lo;
279 		}
280 	}
281 #else
282 	Htable[8] = V;
283 	REDUCE1BIT(V);
284 	Htable[4] = V;
285 	REDUCE1BIT(V);
286 	Htable[2] = V;
287 	REDUCE1BIT(V);
288 	Htable[1] = V;
289 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
290 	V=Htable[4];
291 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
292 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
293 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
294 	V=Htable[8];
295 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
296 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302 #endif
303 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * The ARM assembler code expects a specific dword order in Htable.
	 */
307 	{
308 	int j;
309 
310 	if (BYTE_ORDER == LITTLE_ENDIAN)
311 		for (j=0;j<16;++j) {
312 			V = Htable[j];
313 			Htable[j].hi = V.lo;
314 			Htable[j].lo = V.hi;
315 		}
316 	else
317 		for (j=0;j<16;++j) {
318 			V = Htable[j];
319 			Htable[j].hi = V.lo<<32|V.lo>>32;
320 			Htable[j].lo = V.hi<<32|V.hi>>32;
321 		}
322 	}
323 #endif
324 }
325 
326 #ifndef GHASH_ASM
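/*
 * rem_4bit[n] is the degree-reduction constant for the four coefficient
 * bits n that drop off the low end when Z is shifted right by 4;
 * PACK() pre-positions it so it can be XORed straight into the top of
 * the high word.
 */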
327 static const size_t rem_4bit[16] = {
328 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332 
333 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334 {
335 	u128 Z;
336 	int cnt = 15;
337 	size_t rem, nlo, nhi;
338 
339 	nlo  = ((const u8 *)Xi)[15];
340 	nhi  = nlo>>4;
341 	nlo &= 0xf;
342 
343 	Z.hi = Htable[nlo].hi;
344 	Z.lo = Htable[nlo].lo;
345 
346 	while (1) {
347 		rem  = (size_t)Z.lo&0xf;
348 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 		Z.hi = (Z.hi>>4);
350 		if (sizeof(size_t)==8)
351 			Z.hi ^= rem_4bit[rem];
352 		else
353 			Z.hi ^= (u64)rem_4bit[rem]<<32;
354 
355 		Z.hi ^= Htable[nhi].hi;
356 		Z.lo ^= Htable[nhi].lo;
357 
358 		if (--cnt<0)		break;
359 
360 		nlo  = ((const u8 *)Xi)[cnt];
361 		nhi  = nlo>>4;
362 		nlo &= 0xf;
363 
364 		rem  = (size_t)Z.lo&0xf;
365 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 		Z.hi = (Z.hi>>4);
367 		if (sizeof(size_t)==8)
368 			Z.hi ^= rem_4bit[rem];
369 		else
370 			Z.hi ^= (u64)rem_4bit[rem]<<32;
371 
372 		Z.hi ^= Htable[nlo].hi;
373 		Z.lo ^= Htable[nlo].lo;
374 	}
375 
376 	if (BYTE_ORDER == LITTLE_ENDIAN) {
377 #ifdef BSWAP8
378 		Xi[0] = BSWAP8(Z.hi);
379 		Xi[1] = BSWAP8(Z.lo);
380 #else
381 		u8 *p = (u8 *)Xi;
382 		u32 v;
383 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
384 		v = (u32)(Z.hi);	PUTU32(p+4,v);
385 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
386 		v = (u32)(Z.lo);	PUTU32(p+12,v);
387 #endif
388 	}
389 	else {
390 		Xi[0] = Z.hi;
391 		Xi[1] = Z.lo;
392 	}
393 }
394 
395 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_ghash_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64].  It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
403 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
404 				const u8 *inp,size_t len)
405 {
406     u128 Z;
407     int cnt;
408     size_t rem, nlo, nhi;
409 
410 #if 1
411     do {
412 	cnt  = 15;
413 	nlo  = ((const u8 *)Xi)[15];
414 	nlo ^= inp[15];
415 	nhi  = nlo>>4;
416 	nlo &= 0xf;
417 
418 	Z.hi = Htable[nlo].hi;
419 	Z.lo = Htable[nlo].lo;
420 
421 	while (1) {
422 		rem  = (size_t)Z.lo&0xf;
423 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
424 		Z.hi = (Z.hi>>4);
425 		if (sizeof(size_t)==8)
426 			Z.hi ^= rem_4bit[rem];
427 		else
428 			Z.hi ^= (u64)rem_4bit[rem]<<32;
429 
430 		Z.hi ^= Htable[nhi].hi;
431 		Z.lo ^= Htable[nhi].lo;
432 
433 		if (--cnt<0)		break;
434 
435 		nlo  = ((const u8 *)Xi)[cnt];
436 		nlo ^= inp[cnt];
437 		nhi  = nlo>>4;
438 		nlo &= 0xf;
439 
440 		rem  = (size_t)Z.lo&0xf;
441 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
442 		Z.hi = (Z.hi>>4);
443 		if (sizeof(size_t)==8)
444 			Z.hi ^= rem_4bit[rem];
445 		else
446 			Z.hi ^= (u64)rem_4bit[rem]<<32;
447 
448 		Z.hi ^= Htable[nlo].hi;
449 		Z.lo ^= Htable[nlo].lo;
450 	}
451 #else
    /*
     * An extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give a ~50% improvement...  One could have PACK()-ed
     * rem_8bit even here, but the priority is to minimize the
     * cache footprint...
     */
458     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
459     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
460     static const unsigned short rem_8bit[256] = {
461 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
462 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
463 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
464 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
465 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
466 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
467 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
468 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
469 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
470 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
471 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
472 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
473 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
474 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
475 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
476 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
477 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
478 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
479 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
480 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
481 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
482 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
483 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
484 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
485 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
486 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
487 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
488 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
489 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
490 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
491 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
492 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows the procedure down by approximately
     * the same amount of time as it makes each loop iteration faster.  In
     * other words, single-block performance is approximately the same as
     * for the straightforward "4-bit" implementation, and from there on
     * it only gets faster...
     */
499     for (cnt=0; cnt<16; ++cnt) {
500 	Z.hi = Htable[cnt].hi;
501 	Z.lo = Htable[cnt].lo;
502 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
503 	Hshr4[cnt].hi = (Z.hi>>4);
504 	Hshl4[cnt]    = (u8)(Z.lo<<4);
505     }
506 
507     do {
508 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
509 		nlo  = ((const u8 *)Xi)[cnt];
510 		nlo ^= inp[cnt];
511 		nhi  = nlo>>4;
512 		nlo &= 0xf;
513 
514 		Z.hi ^= Htable[nlo].hi;
515 		Z.lo ^= Htable[nlo].lo;
516 
517 		rem = (size_t)Z.lo&0xff;
518 
519 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
520 		Z.hi = (Z.hi>>8);
521 
522 		Z.hi ^= Hshr4[nhi].hi;
523 		Z.lo ^= Hshr4[nhi].lo;
524 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
525 	}
526 
527 	nlo  = ((const u8 *)Xi)[0];
528 	nlo ^= inp[0];
529 	nhi  = nlo>>4;
530 	nlo &= 0xf;
531 
532 	Z.hi ^= Htable[nlo].hi;
533 	Z.lo ^= Htable[nlo].lo;
534 
535 	rem = (size_t)Z.lo&0xf;
536 
537 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
538 	Z.hi = (Z.hi>>4);
539 
540 	Z.hi ^= Htable[nhi].hi;
541 	Z.lo ^= Htable[nhi].lo;
542 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
543 #endif
544 
545 	if (BYTE_ORDER == LITTLE_ENDIAN) {
546 #ifdef BSWAP8
547 		Xi[0] = BSWAP8(Z.hi);
548 		Xi[1] = BSWAP8(Z.lo);
549 #else
550 		u8 *p = (u8 *)Xi;
551 		u32 v;
552 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
553 		v = (u32)(Z.hi);	PUTU32(p+4,v);
554 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
555 		v = (u32)(Z.lo);	PUTU32(p+12,v);
556 #endif
557 	}
558 	else {
559 		Xi[0] = Z.hi;
560 		Xi[1] = Z.lo;
561 	}
562     } while (inp+=16, len-=16);
563 }
564 #endif
565 #else
566 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
567 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
568 #endif
569 
570 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
571 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
572 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" whose mission is to mitigate the
 * cache-trashing effect.  In other words, the idea is to hash the data
 * while it's still in the L1 cache after the encryption pass... */
576 #define GHASH_CHUNK       (3*1024)
577 #endif
578 
579 #else	/* TABLE_BITS */
580 
581 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
582 {
583 	u128 V,Z = { 0,0 };
584 	long X;
585 	int  i,j;
586 	const long *xi = (const long *)Xi;
587 
588 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
589 	V.lo = H[1];
590 
591 	for (j=0; j<16/sizeof(long); ++j) {
592 		if (BYTE_ORDER == LITTLE_ENDIAN) {
593 			if (sizeof(long)==8) {
594 #ifdef BSWAP8
595 				X = (long)(BSWAP8(xi[j]));
596 #else
597 				const u8 *p = (const u8 *)(xi+j);
598 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
599 #endif
600 			}
601 			else {
602 				const u8 *p = (const u8 *)(xi+j);
603 				X = (long)GETU32(p);
604 			}
605 		}
606 		else
607 			X = xi[j];
608 
609 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
610 			u64 M = (u64)(X>>(8*sizeof(long)-1));
611 			Z.hi ^= V.hi&M;
612 			Z.lo ^= V.lo&M;
613 
614 			REDUCE1BIT(V);
615 		}
616 	}
617 
618 	if (BYTE_ORDER == LITTLE_ENDIAN) {
619 #ifdef BSWAP8
620 		Xi[0] = BSWAP8(Z.hi);
621 		Xi[1] = BSWAP8(Z.lo);
622 #else
623 		u8 *p = (u8 *)Xi;
624 		u32 v;
625 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
626 		v = (u32)(Z.hi);	PUTU32(p+4,v);
627 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
628 		v = (u32)(Z.lo);	PUTU32(p+12,v);
629 #endif
630 	}
631 	else {
632 		Xi[0] = Z.hi;
633 		Xi[1] = Z.lo;
634 	}
635 }
636 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
637 
638 #endif
639 
640 #if	TABLE_BITS==4 && defined(GHASH_ASM)
641 # if	!defined(I386_ONLY) && \
642 	(defined(__i386)	|| defined(__i386__)	|| \
643 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
644 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
645 #  define GHASH_ASM_X86_OR_64
646 #  define GCM_FUNCREF_4BIT
647 extern unsigned int OPENSSL_ia32cap_P[2];
648 
649 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
650 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
651 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
652 
653 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
654 #   define GHASH_ASM_X86
655 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657 
658 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660 #  endif
661 # elif defined(__arm__) || defined(__arm)
662 #  include "arm_arch.h"
663 #  if __ARM_ARCH__>=7
664 #   define GHASH_ASM_ARM
665 #   define GCM_FUNCREF_4BIT
666 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
667 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668 #  endif
669 # endif
670 #endif
671 
672 #ifdef GCM_FUNCREF_4BIT
673 # undef  GCM_MUL
674 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
675 # ifdef GHASH
676 #  undef  GHASH
677 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
678 # endif
679 #endif
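/*
 * When an assembler implementation is available, GCM_MUL()/GHASH() are
 * routed through local copies of the ctx->gmult/ctx->ghash function
 * pointers (gcm_gmult_p/gcm_ghash_p), which each public entry point
 * loads from the context before use.
 */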
680 
681 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
682 {
683 	memset(ctx,0,sizeof(*ctx));
684 	ctx->block = block;
685 	ctx->key   = key;
686 
687 	(*block)(ctx->H.c,ctx->H.c,key);
688 
689 	if (BYTE_ORDER == LITTLE_ENDIAN) {
690 		/* H is stored in host byte order */
691 #ifdef BSWAP8
692 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
693 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
694 #else
695 		u8 *p = ctx->H.c;
696 		u64 hi,lo;
697 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
698 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
699 		ctx->H.u[0] = hi;
700 		ctx->H.u[1] = lo;
701 #endif
702 	}
703 
704 #if	TABLE_BITS==8
705 	gcm_init_8bit(ctx->Htable,ctx->H.u);
706 #elif	TABLE_BITS==4
707 # if	defined(GHASH_ASM_X86_OR_64)
708 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
709 	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
710 	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
711 		gcm_init_clmul(ctx->Htable,ctx->H.u);
712 		ctx->gmult = gcm_gmult_clmul;
713 		ctx->ghash = gcm_ghash_clmul;
714 		return;
715 	}
716 #  endif
717 	gcm_init_4bit(ctx->Htable,ctx->H.u);
718 #  if	defined(GHASH_ASM_X86)			/* x86 only */
719 #   if	defined(OPENSSL_IA32_SSE2)
720 	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
721 #   else
722 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
723 #   endif
724 		ctx->gmult = gcm_gmult_4bit_mmx;
725 		ctx->ghash = gcm_ghash_4bit_mmx;
726 	} else {
727 		ctx->gmult = gcm_gmult_4bit_x86;
728 		ctx->ghash = gcm_ghash_4bit_x86;
729 	}
730 #  else
731 	ctx->gmult = gcm_gmult_4bit;
732 	ctx->ghash = gcm_ghash_4bit;
733 #  endif
734 # elif	defined(GHASH_ASM_ARM)
735 	if (OPENSSL_armcap_P & ARMV7_NEON) {
736 		ctx->gmult = gcm_gmult_neon;
737 		ctx->ghash = gcm_ghash_neon;
738 	} else {
739 		gcm_init_4bit(ctx->Htable,ctx->H.u);
740 		ctx->gmult = gcm_gmult_4bit;
741 		ctx->ghash = gcm_ghash_4bit;
742 	}
743 # else
744 	gcm_init_4bit(ctx->Htable,ctx->H.u);
745 # endif
746 #endif
747 }
748 
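/*
 * CRYPTO_gcm128_setiv() resets the per-message state.  For the common
 * 96-bit IV the initial counter block Y0 is simply IV || 0x00000001;
 * any other IV length is run, together with its bit length, through
 * GHASH to form Y0, as required by the GCM specification.
 */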
749 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
750 {
751 	unsigned int ctr;
752 #ifdef GCM_FUNCREF_4BIT
753 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
754 #endif
755 
756 	ctx->Yi.u[0]  = 0;
757 	ctx->Yi.u[1]  = 0;
758 	ctx->Xi.u[0]  = 0;
759 	ctx->Xi.u[1]  = 0;
760 	ctx->len.u[0] = 0;	/* AAD length */
761 	ctx->len.u[1] = 0;	/* message length */
762 	ctx->ares = 0;
763 	ctx->mres = 0;
764 
765 	if (len==12) {
766 		memcpy(ctx->Yi.c,iv,12);
767 		ctx->Yi.c[15]=1;
768 		ctr=1;
769 	}
770 	else {
771 		size_t i;
772 		u64 len0 = len;
773 
774 		while (len>=16) {
775 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
776 			GCM_MUL(ctx,Yi);
777 			iv += 16;
778 			len -= 16;
779 		}
780 		if (len) {
781 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
782 			GCM_MUL(ctx,Yi);
783 		}
784 		len0 <<= 3;
785 		if (BYTE_ORDER == LITTLE_ENDIAN) {
786 #ifdef BSWAP8
787 			ctx->Yi.u[1]  ^= BSWAP8(len0);
788 #else
789 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
790 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
791 			ctx->Yi.c[10] ^= (u8)(len0>>40);
792 			ctx->Yi.c[11] ^= (u8)(len0>>32);
793 			ctx->Yi.c[12] ^= (u8)(len0>>24);
794 			ctx->Yi.c[13] ^= (u8)(len0>>16);
795 			ctx->Yi.c[14] ^= (u8)(len0>>8);
796 			ctx->Yi.c[15] ^= (u8)(len0);
797 #endif
798 		}
799 		else
800 			ctx->Yi.u[1]  ^= len0;
801 
802 		GCM_MUL(ctx,Yi);
803 
804 		if (BYTE_ORDER == LITTLE_ENDIAN)
805 #ifdef BSWAP4
806 			ctr = BSWAP4(ctx->Yi.d[3]);
807 #else
808 			ctr = GETU32(ctx->Yi.c+12);
809 #endif
810 		else
811 			ctr = ctx->Yi.d[3];
812 	}
813 
814 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
815 	++ctr;
816 	if (BYTE_ORDER == LITTLE_ENDIAN)
817 #ifdef BSWAP4
818 		ctx->Yi.d[3] = BSWAP4(ctr);
819 #else
820 		PUTU32(ctx->Yi.c+12,ctr);
821 #endif
822 	else
823 		ctx->Yi.d[3] = ctr;
824 }
825 
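/*
 * CRYPTO_gcm128_aad() may be called repeatedly, but all AAD has to be
 * supplied before the first encrypt/decrypt call (it fails with -2 once
 * the message length is non-zero); ctx->ares carries the length of a
 * partial AAD block between calls.
 */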
826 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827 {
828 	size_t i;
829 	unsigned int n;
830 	u64 alen = ctx->len.u[0];
831 #ifdef GCM_FUNCREF_4BIT
832 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
833 # ifdef GHASH
834 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835 				const u8 *inp,size_t len)	= ctx->ghash;
836 # endif
837 #endif
838 
839 	if (ctx->len.u[1]) return -2;
840 
841 	alen += len;
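	/* The AAD is limited to 2^64 bits (2^61 bytes), per SP 800-38D. */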
842 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843 		return -1;
844 	ctx->len.u[0] = alen;
845 
846 	n = ctx->ares;
847 	if (n) {
848 		while (n && len) {
849 			ctx->Xi.c[n] ^= *(aad++);
850 			--len;
851 			n = (n+1)%16;
852 		}
853 		if (n==0) GCM_MUL(ctx,Xi);
854 		else {
855 			ctx->ares = n;
856 			return 0;
857 		}
858 	}
859 
860 #ifdef GHASH
861 	if ((i = (len&(size_t)-16))) {
862 		GHASH(ctx,aad,i);
863 		aad += i;
864 		len -= i;
865 	}
866 #else
867 	while (len>=16) {
868 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869 		GCM_MUL(ctx,Xi);
870 		aad += 16;
871 		len -= 16;
872 	}
873 #endif
874 	if (len) {
875 		n = (unsigned int)len;
876 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877 	}
878 
879 	ctx->ares = n;
880 	return 0;
881 }
882 
883 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884 		const unsigned char *in, unsigned char *out,
885 		size_t len)
886 {
887 	unsigned int n, ctr;
888 	size_t i;
889 	u64        mlen  = ctx->len.u[1];
890 	block128_f block = ctx->block;
891 	void      *key   = ctx->key;
892 #ifdef GCM_FUNCREF_4BIT
893 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
894 # ifdef GHASH
895 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896 				const u8 *inp,size_t len)	= ctx->ghash;
897 # endif
898 #endif
899 
900 	mlen += len;
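	/* The plaintext is limited to 2^39 - 256 bits, i.e. 2^36 - 32 bytes,
	 * per SP 800-38D. */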
901 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
902 		return -1;
903 	ctx->len.u[1] = mlen;
904 
905 	if (ctx->ares) {
906 		/* First call to encrypt finalizes GHASH(AAD) */
907 		GCM_MUL(ctx,Xi);
908 		ctx->ares = 0;
909 	}
910 
911 	if (BYTE_ORDER == LITTLE_ENDIAN)
912 #ifdef BSWAP4
913 		ctr = BSWAP4(ctx->Yi.d[3]);
914 #else
915 		ctr = GETU32(ctx->Yi.c+12);
916 #endif
917 	else
918 		ctr = ctx->Yi.d[3];
919 
920 	n = ctx->mres;
921 #if !defined(OPENSSL_SMALL_FOOTPRINT)
922 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
923 		if (n) {
924 			while (n && len) {
925 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926 				--len;
927 				n = (n+1)%16;
928 			}
929 			if (n==0) GCM_MUL(ctx,Xi);
930 			else {
931 				ctx->mres = n;
932 				return 0;
933 			}
934 		}
935 #ifdef __STRICT_ALIGNMENT
936 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937 			break;
938 #endif
939 #if defined(GHASH) && defined(GHASH_CHUNK)
940 		while (len>=GHASH_CHUNK) {
941 		    size_t j=GHASH_CHUNK;
942 
943 		    while (j) {
944 		    	size_t *out_t=(size_t *)out;
945 		    	const size_t *in_t=(const size_t *)in;
946 
947 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
948 			++ctr;
949 			if (BYTE_ORDER == LITTLE_ENDIAN)
950 #ifdef BSWAP4
951 				ctx->Yi.d[3] = BSWAP4(ctr);
952 #else
953 				PUTU32(ctx->Yi.c+12,ctr);
954 #endif
955 			else
956 				ctx->Yi.d[3] = ctr;
957 			for (i=0; i<16/sizeof(size_t); ++i)
958 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
959 			out += 16;
960 			in  += 16;
961 			j   -= 16;
962 		    }
963 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
964 		    len -= GHASH_CHUNK;
965 		}
966 		if ((i = (len&(size_t)-16))) {
967 		    size_t j=i;
968 
969 		    while (len>=16) {
970 		    	size_t *out_t=(size_t *)out;
971 		    	const size_t *in_t=(const size_t *)in;
972 
973 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
974 			++ctr;
975 			if (BYTE_ORDER == LITTLE_ENDIAN)
976 #ifdef BSWAP4
977 				ctx->Yi.d[3] = BSWAP4(ctr);
978 #else
979 				PUTU32(ctx->Yi.c+12,ctr);
980 #endif
981 			else
982 				ctx->Yi.d[3] = ctr;
983 			for (i=0; i<16/sizeof(size_t); ++i)
984 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
985 			out += 16;
986 			in  += 16;
987 			len -= 16;
988 		    }
989 		    GHASH(ctx,out-j,j);
990 		}
991 #else
992 		while (len>=16) {
993 		    	size_t *out_t=(size_t *)out;
994 		    	const size_t *in_t=(const size_t *)in;
995 
996 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
997 			++ctr;
998 			if (BYTE_ORDER == LITTLE_ENDIAN)
999 #ifdef BSWAP4
1000 				ctx->Yi.d[3] = BSWAP4(ctr);
1001 #else
1002 				PUTU32(ctx->Yi.c+12,ctr);
1003 #endif
1004 			else
1005 				ctx->Yi.d[3] = ctr;
1006 			for (i=0; i<16/sizeof(size_t); ++i)
1007 				ctx->Xi.t[i] ^=
1008 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1009 			GCM_MUL(ctx,Xi);
1010 			out += 16;
1011 			in  += 16;
1012 			len -= 16;
1013 		}
1014 #endif
1015 		if (len) {
1016 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1017 			++ctr;
1018 			if (BYTE_ORDER == LITTLE_ENDIAN)
1019 #ifdef BSWAP4
1020 				ctx->Yi.d[3] = BSWAP4(ctr);
1021 #else
1022 				PUTU32(ctx->Yi.c+12,ctr);
1023 #endif
1024 			else
1025 				ctx->Yi.d[3] = ctr;
1026 			while (len--) {
1027 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1028 				++n;
1029 			}
1030 		}
1031 
1032 		ctx->mres = n;
1033 		return 0;
1034 	} while(0);
1035 #endif
1036 	for (i=0;i<len;++i) {
1037 		if (n==0) {
1038 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1039 			++ctr;
1040 			if (BYTE_ORDER == LITTLE_ENDIAN)
1041 #ifdef BSWAP4
1042 				ctx->Yi.d[3] = BSWAP4(ctr);
1043 #else
1044 				PUTU32(ctx->Yi.c+12,ctr);
1045 #endif
1046 			else
1047 				ctx->Yi.d[3] = ctr;
1048 		}
1049 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1050 		n = (n+1)%16;
1051 		if (n==0)
1052 			GCM_MUL(ctx,Xi);
1053 	}
1054 
1055 	ctx->mres = n;
1056 	return 0;
1057 }
1058 
1059 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1060 		const unsigned char *in, unsigned char *out,
1061 		size_t len)
1062 {
1063 	unsigned int n, ctr;
1064 	size_t i;
1065 	u64        mlen  = ctx->len.u[1];
1066 	block128_f block = ctx->block;
1067 	void      *key   = ctx->key;
1068 #ifdef GCM_FUNCREF_4BIT
1069 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1070 # ifdef GHASH
1071 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1072 				const u8 *inp,size_t len)	= ctx->ghash;
1073 # endif
1074 #endif
1075 
1076 	mlen += len;
1077 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1078 		return -1;
1079 	ctx->len.u[1] = mlen;
1080 
1081 	if (ctx->ares) {
1082 		/* First call to decrypt finalizes GHASH(AAD) */
1083 		GCM_MUL(ctx,Xi);
1084 		ctx->ares = 0;
1085 	}
1086 
1087 	if (BYTE_ORDER == LITTLE_ENDIAN)
1088 #ifdef BSWAP4
1089 		ctr = BSWAP4(ctx->Yi.d[3]);
1090 #else
1091 		ctr = GETU32(ctx->Yi.c+12);
1092 #endif
1093 	else
1094 		ctr = ctx->Yi.d[3];
1095 
1096 	n = ctx->mres;
1097 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1098 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1099 		if (n) {
1100 			while (n && len) {
1101 				u8 c = *(in++);
1102 				*(out++) = c^ctx->EKi.c[n];
1103 				ctx->Xi.c[n] ^= c;
1104 				--len;
1105 				n = (n+1)%16;
1106 			}
1107 			if (n==0) GCM_MUL (ctx,Xi);
1108 			else {
1109 				ctx->mres = n;
1110 				return 0;
1111 			}
1112 		}
1113 #ifdef __STRICT_ALIGNMENT
1114 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1115 			break;
1116 #endif
1117 #if defined(GHASH) && defined(GHASH_CHUNK)
1118 		while (len>=GHASH_CHUNK) {
1119 		    size_t j=GHASH_CHUNK;
1120 
1121 		    GHASH(ctx,in,GHASH_CHUNK);
1122 		    while (j) {
1123 		    	size_t *out_t=(size_t *)out;
1124 		    	const size_t *in_t=(const size_t *)in;
1125 
1126 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1127 			++ctr;
1128 			if (BYTE_ORDER == LITTLE_ENDIAN)
1129 #ifdef BSWAP4
1130 				ctx->Yi.d[3] = BSWAP4(ctr);
1131 #else
1132 				PUTU32(ctx->Yi.c+12,ctr);
1133 #endif
1134 			else
1135 				ctx->Yi.d[3] = ctr;
1136 			for (i=0; i<16/sizeof(size_t); ++i)
1137 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1138 			out += 16;
1139 			in  += 16;
1140 			j   -= 16;
1141 		    }
1142 		    len -= GHASH_CHUNK;
1143 		}
1144 		if ((i = (len&(size_t)-16))) {
1145 		    GHASH(ctx,in,i);
1146 		    while (len>=16) {
1147 		    	size_t *out_t=(size_t *)out;
1148 		    	const size_t *in_t=(const size_t *)in;
1149 
1150 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1151 			++ctr;
1152 			if (BYTE_ORDER == LITTLE_ENDIAN)
1153 #ifdef BSWAP4
1154 				ctx->Yi.d[3] = BSWAP4(ctr);
1155 #else
1156 				PUTU32(ctx->Yi.c+12,ctr);
1157 #endif
1158 			else
1159 				ctx->Yi.d[3] = ctr;
1160 			for (i=0; i<16/sizeof(size_t); ++i)
1161 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1162 			out += 16;
1163 			in  += 16;
1164 			len -= 16;
1165 		    }
1166 		}
1167 #else
1168 		while (len>=16) {
1169 		    	size_t *out_t=(size_t *)out;
1170 		    	const size_t *in_t=(const size_t *)in;
1171 
1172 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1173 			++ctr;
1174 			if (BYTE_ORDER == LITTLE_ENDIAN)
1175 #ifdef BSWAP4
1176 				ctx->Yi.d[3] = BSWAP4(ctr);
1177 #else
1178 				PUTU32(ctx->Yi.c+12,ctr);
1179 #endif
1180 			else
1181 				ctx->Yi.d[3] = ctr;
1182 			for (i=0; i<16/sizeof(size_t); ++i) {
1183 				size_t c = in[i];
1184 				out[i] = c^ctx->EKi.t[i];
1185 				ctx->Xi.t[i] ^= c;
1186 			}
1187 			GCM_MUL(ctx,Xi);
1188 			out += 16;
1189 			in  += 16;
1190 			len -= 16;
1191 		}
1192 #endif
1193 		if (len) {
1194 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1195 			++ctr;
1196 			if (BYTE_ORDER == LITTLE_ENDIAN)
1197 #ifdef BSWAP4
1198 				ctx->Yi.d[3] = BSWAP4(ctr);
1199 #else
1200 				PUTU32(ctx->Yi.c+12,ctr);
1201 #endif
1202 			else
1203 				ctx->Yi.d[3] = ctr;
1204 			while (len--) {
1205 				u8 c = in[n];
1206 				ctx->Xi.c[n] ^= c;
1207 				out[n] = c^ctx->EKi.c[n];
1208 				++n;
1209 			}
1210 		}
1211 
1212 		ctx->mres = n;
1213 		return 0;
1214 	} while(0);
1215 #endif
1216 	for (i=0;i<len;++i) {
1217 		u8 c;
1218 		if (n==0) {
1219 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1220 			++ctr;
1221 			if (BYTE_ORDER == LITTLE_ENDIAN)
1222 #ifdef BSWAP4
1223 				ctx->Yi.d[3] = BSWAP4(ctr);
1224 #else
1225 				PUTU32(ctx->Yi.c+12,ctr);
1226 #endif
1227 			else
1228 				ctx->Yi.d[3] = ctr;
1229 		}
1230 		c = in[i];
1231 		out[i] = c^ctx->EKi.c[n];
1232 		ctx->Xi.c[n] ^= c;
1233 		n = (n+1)%16;
1234 		if (n==0)
1235 			GCM_MUL(ctx,Xi);
1236 	}
1237 
1238 	ctx->mres = n;
1239 	return 0;
1240 }
1241 
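/*
 * The *_ctr32 variants take a ctr128_f "stream" routine that encrypts
 * whole blocks at a time (typically a hardware-accelerated CTR
 * implementation); only the 32-bit big-endian counter in the last four
 * bytes of Yi is incremented between calls.
 */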
1242 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1243 		const unsigned char *in, unsigned char *out,
1244 		size_t len, ctr128_f stream)
1245 {
1246 	unsigned int n, ctr;
1247 	size_t i;
1248 	u64   mlen = ctx->len.u[1];
1249 	void *key  = ctx->key;
1250 #ifdef GCM_FUNCREF_4BIT
1251 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1252 # ifdef GHASH
1253 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1254 				const u8 *inp,size_t len)	= ctx->ghash;
1255 # endif
1256 #endif
1257 
1258 	mlen += len;
1259 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1260 		return -1;
1261 	ctx->len.u[1] = mlen;
1262 
1263 	if (ctx->ares) {
1264 		/* First call to encrypt finalizes GHASH(AAD) */
1265 		GCM_MUL(ctx,Xi);
1266 		ctx->ares = 0;
1267 	}
1268 
1269 	if (BYTE_ORDER == LITTLE_ENDIAN)
1270 #ifdef BSWAP4
1271 		ctr = BSWAP4(ctx->Yi.d[3]);
1272 #else
1273 		ctr = GETU32(ctx->Yi.c+12);
1274 #endif
1275 	else
1276 		ctr = ctx->Yi.d[3];
1277 
1278 	n = ctx->mres;
1279 	if (n) {
1280 		while (n && len) {
1281 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1282 			--len;
1283 			n = (n+1)%16;
1284 		}
1285 		if (n==0) GCM_MUL(ctx,Xi);
1286 		else {
1287 			ctx->mres = n;
1288 			return 0;
1289 		}
1290 	}
1291 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1292 	while (len>=GHASH_CHUNK) {
1293 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1294 		ctr += GHASH_CHUNK/16;
1295 		if (BYTE_ORDER == LITTLE_ENDIAN)
1296 #ifdef BSWAP4
1297 			ctx->Yi.d[3] = BSWAP4(ctr);
1298 #else
1299 			PUTU32(ctx->Yi.c+12,ctr);
1300 #endif
1301 		else
1302 			ctx->Yi.d[3] = ctr;
1303 		GHASH(ctx,out,GHASH_CHUNK);
1304 		out += GHASH_CHUNK;
1305 		in  += GHASH_CHUNK;
1306 		len -= GHASH_CHUNK;
1307 	}
1308 #endif
1309 	if ((i = (len&(size_t)-16))) {
1310 		size_t j=i/16;
1311 
1312 		(*stream)(in,out,j,key,ctx->Yi.c);
1313 		ctr += (unsigned int)j;
1314 		if (BYTE_ORDER == LITTLE_ENDIAN)
1315 #ifdef BSWAP4
1316 			ctx->Yi.d[3] = BSWAP4(ctr);
1317 #else
1318 			PUTU32(ctx->Yi.c+12,ctr);
1319 #endif
1320 		else
1321 			ctx->Yi.d[3] = ctr;
1322 		in  += i;
1323 		len -= i;
1324 #if defined(GHASH)
1325 		GHASH(ctx,out,i);
1326 		out += i;
1327 #else
1328 		while (j--) {
1329 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1330 			GCM_MUL(ctx,Xi);
1331 			out += 16;
1332 		}
1333 #endif
1334 	}
1335 	if (len) {
1336 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1337 		++ctr;
1338 		if (BYTE_ORDER == LITTLE_ENDIAN)
1339 #ifdef BSWAP4
1340 			ctx->Yi.d[3] = BSWAP4(ctr);
1341 #else
1342 			PUTU32(ctx->Yi.c+12,ctr);
1343 #endif
1344 		else
1345 			ctx->Yi.d[3] = ctr;
1346 		while (len--) {
1347 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1348 			++n;
1349 		}
1350 	}
1351 
1352 	ctx->mres = n;
1353 	return 0;
1354 }
1355 
1356 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1357 		const unsigned char *in, unsigned char *out,
1358 		size_t len,ctr128_f stream)
1359 {
1360 	unsigned int n, ctr;
1361 	size_t i;
1362 	u64   mlen = ctx->len.u[1];
1363 	void *key  = ctx->key;
1364 #ifdef GCM_FUNCREF_4BIT
1365 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1366 # ifdef GHASH
1367 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1368 				const u8 *inp,size_t len)	= ctx->ghash;
1369 # endif
1370 #endif
1371 
1372 	mlen += len;
1373 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1374 		return -1;
1375 	ctx->len.u[1] = mlen;
1376 
1377 	if (ctx->ares) {
1378 		/* First call to decrypt finalizes GHASH(AAD) */
1379 		GCM_MUL(ctx,Xi);
1380 		ctx->ares = 0;
1381 	}
1382 
1383 	if (BYTE_ORDER == LITTLE_ENDIAN)
1384 #ifdef BSWAP4
1385 		ctr = BSWAP4(ctx->Yi.d[3]);
1386 #else
1387 		ctr = GETU32(ctx->Yi.c+12);
1388 #endif
1389 	else
1390 		ctr = ctx->Yi.d[3];
1391 
1392 	n = ctx->mres;
1393 	if (n) {
1394 		while (n && len) {
1395 			u8 c = *(in++);
1396 			*(out++) = c^ctx->EKi.c[n];
1397 			ctx->Xi.c[n] ^= c;
1398 			--len;
1399 			n = (n+1)%16;
1400 		}
1401 		if (n==0) GCM_MUL (ctx,Xi);
1402 		else {
1403 			ctx->mres = n;
1404 			return 0;
1405 		}
1406 	}
1407 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1408 	while (len>=GHASH_CHUNK) {
1409 		GHASH(ctx,in,GHASH_CHUNK);
1410 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1411 		ctr += GHASH_CHUNK/16;
1412 		if (BYTE_ORDER == LITTLE_ENDIAN)
1413 #ifdef BSWAP4
1414 			ctx->Yi.d[3] = BSWAP4(ctr);
1415 #else
1416 			PUTU32(ctx->Yi.c+12,ctr);
1417 #endif
1418 		else
1419 			ctx->Yi.d[3] = ctr;
1420 		out += GHASH_CHUNK;
1421 		in  += GHASH_CHUNK;
1422 		len -= GHASH_CHUNK;
1423 	}
1424 #endif
1425 	if ((i = (len&(size_t)-16))) {
1426 		size_t j=i/16;
1427 
1428 #if defined(GHASH)
1429 		GHASH(ctx,in,i);
1430 #else
1431 		while (j--) {
1432 			size_t k;
1433 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1434 			GCM_MUL(ctx,Xi);
1435 			in += 16;
1436 		}
1437 		j   = i/16;
1438 		in -= i;
1439 #endif
1440 		(*stream)(in,out,j,key,ctx->Yi.c);
1441 		ctr += (unsigned int)j;
1442 		if (BYTE_ORDER == LITTLE_ENDIAN)
1443 #ifdef BSWAP4
1444 			ctx->Yi.d[3] = BSWAP4(ctr);
1445 #else
1446 			PUTU32(ctx->Yi.c+12,ctr);
1447 #endif
1448 		else
1449 			ctx->Yi.d[3] = ctr;
1450 		out += i;
1451 		in  += i;
1452 		len -= i;
1453 	}
1454 	if (len) {
1455 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1456 		++ctr;
1457 		if (BYTE_ORDER == LITTLE_ENDIAN)
1458 #ifdef BSWAP4
1459 			ctx->Yi.d[3] = BSWAP4(ctr);
1460 #else
1461 			PUTU32(ctx->Yi.c+12,ctr);
1462 #endif
1463 		else
1464 			ctx->Yi.d[3] = ctr;
1465 		while (len--) {
1466 			u8 c = in[n];
1467 			ctx->Xi.c[n] ^= c;
1468 			out[n] = c^ctx->EKi.c[n];
1469 			++n;
1470 		}
1471 	}
1472 
1473 	ctx->mres = n;
1474 	return 0;
1475 }
1476 
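/*
 * CRYPTO_gcm128_finish() absorbs the length block, applies the final
 * GHASH iteration and XORs in EK0 to form the tag; it returns 0 when
 * the supplied tag matches and non-zero otherwise.  CRYPTO_gcm128_tag()
 * computes the same value and simply copies it out for the caller.
 */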
1477 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1478 			size_t len)
1479 {
1480 	u64 alen = ctx->len.u[0]<<3;
1481 	u64 clen = ctx->len.u[1]<<3;
1482 #ifdef GCM_FUNCREF_4BIT
1483 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1484 #endif
1485 
1486 	if (ctx->mres || ctx->ares)
1487 		GCM_MUL(ctx,Xi);
1488 
1489 	if (BYTE_ORDER == LITTLE_ENDIAN) {
1490 #ifdef BSWAP8
1491 		alen = BSWAP8(alen);
1492 		clen = BSWAP8(clen);
1493 #else
1494 		u8 *p = ctx->len.c;
1495 
1496 		ctx->len.u[0] = alen;
1497 		ctx->len.u[1] = clen;
1498 
1499 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1500 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1501 #endif
1502 	}
1503 
1504 	ctx->Xi.u[0] ^= alen;
1505 	ctx->Xi.u[1] ^= clen;
1506 	GCM_MUL(ctx,Xi);
1507 
1508 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1509 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1510 
1511 	if (tag && len<=sizeof(ctx->Xi))
1512 		return memcmp(ctx->Xi.c,tag,len);
1513 	else
1514 		return -1;
1515 }
1516 
1517 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1518 {
1519 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1520 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1521 }
1522 
1523 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1524 {
1525 	GCM128_CONTEXT *ret;
1526 
1527 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1528 		CRYPTO_gcm128_init(ret,key,block);
1529 
1530 	return ret;
1531 }
1532 
1533 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1534 {
1535 	if (ctx) {
1536 		explicit_bzero(ctx,sizeof(*ctx));
1537 		free(ctx);
1538 	}
1539 }
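
/*
 * Minimal usage sketch (kept out of the build): AES-128-GCM sealing of a
 * single message through this API.  The AES routines from
 * <openssl/aes.h> are only an illustrative choice of block cipher; any
 * block128_f-compatible function can be plugged in the same way.
 */
#if 0
#include <openssl/aes.h>
#include <openssl/modes.h>

static int
gcm128_seal_example(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aad_len,
    const unsigned char *pt, size_t pt_len,
    unsigned char *ct, unsigned char tag[16])
{
	AES_KEY aes;
	GCM128_CONTEXT *gcm;

	if (AES_set_encrypt_key(key, 128, &aes) != 0)
		return -1;

	if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(gcm, iv, 12);		/* 96-bit IV */
	if (CRYPTO_gcm128_aad(gcm, aad, aad_len) != 0 ||
	    CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len) != 0) {
		CRYPTO_gcm128_release(gcm);
		return -1;
	}
	CRYPTO_gcm128_tag(gcm, tag, 16);		/* emit the 16-byte tag */
	CRYPTO_gcm128_release(gcm);	/* zeroes and frees the context */
	explicit_bzero(&aes, sizeof(aes));
	return 0;
}

/*
 * The receiving side mirrors this with CRYPTO_gcm128_decrypt() and then
 * checks CRYPTO_gcm128_finish(gcm, tag, 16) == 0 before using the
 * plaintext.
 */
#endif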
1540