1 /* $OpenBSD: gcm128.c,v 1.22 2018/01/24 23:03:37 kettenis Exp $ */
2 /* ====================================================================
3  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * 3. All advertising materials mentioning features or use of this
18  *    software must display the following acknowledgment:
19  *    "This product includes software developed by the OpenSSL Project
20  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21  *
22  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23  *    endorse or promote products derived from this software without
24  *    prior written permission. For written permission, please contact
25  *    openssl-core@openssl.org.
26  *
27  * 5. Products derived from this software may not be called "OpenSSL"
28  *    nor may "OpenSSL" appear in their names without prior written
29  *    permission of the OpenSSL Project.
30  *
31  * 6. Redistributions of any form whatsoever must retain the following
32  *    acknowledgment:
33  *    "This product includes software developed by the OpenSSL Project
34  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35  *
36  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47  * OF THE POSSIBILITY OF SUCH DAMAGE.
48  * ====================================================================
49  */
50 
51 #define OPENSSL_FIPSAPI
52 
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
56 
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 #  define NDEBUG
60 # endif
61 #endif
62 
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #undef	PUTU32
68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69 #endif
70 
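/*
 * Descriptive note on the two helpers below (a reading of the macros, not
 * additional behaviour): PACK() places a 16-bit reduction constant in the
 * top 16 bits of a size_t, which is where the rem_4bit/rem_8bit values get
 * folded into Z.hi by the GHASH loops further down. REDUCE1BIT(V)
 * multiplies the 128-bit field element V by x: in GCM's bit-reflected
 * representation this is a one-bit right shift of V, with the bit shifted
 * out of V.lo folded back in through the reduction constant 0xe1 || 0^120.
 */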
71 #define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)	\
73 	do { \
74 		if (sizeof(size_t)==8) { \
75 			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 			V.lo  = (V.hi<<63)|(V.lo>>1); \
77 			V.hi  = (V.hi>>1 )^T; \
78 		} else { \
79 			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 			V.lo  = (V.hi<<63)|(V.lo>>1); \
81 			V.hi  = (V.hi>>1 )^((u64)T<<32); \
82 		} \
83 	} while(0)
84 
85 /*
86  * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8; that value is effectively reserved for testing.
88  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89  * to as "Shoup's" in the GCM specification. In other words OpenSSL does
90  * not cover the whole spectrum of possible table-driven implementations.
91  * Why? In the non-"Shoup's" case the memory access pattern is segmented
92  * in such a manner that cache-timing information can reveal a fair
93  * portion of the intermediate hash value. Given that the ciphertext is
94  * always available to an attacker, this could be used to deduce the
95  * secret parameter H and, if successful, to tamper with messages
96  * [which is trivial in CTR mode]. In the "Shoup's" case this is not as
97  * easy, but there is no reason to believe it is resistant to
98  * cache-timing attacks either. The catch with the "8-bit" implementation
99  * is that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. On the plus side it should be roughly twice as fast
101  * as the "4-bit" version; gcc-generated x86[_64] code for the "8-bit"
102  * version was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet the "4-bit" procedure is preferred, because it is
104  * believed to provide a better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows a large enough free()
111  *   results in VM working-set trimming, meaning that a subsequent
112  *   malloc() would immediately incur working-set expansion);
113  * - a larger table has a larger cache footprint, which can affect
114  *   the performance of other code paths (not necessarily even from the
115  *   same thread in a Hyper-Threading world);
116  *
117  * A value of 1 is not appropriate, for performance reasons.
118  */
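/*
 * A sketch of the "Shoup's" 4-bit approach used below (illustrative
 * pseudo-code only, not compiled in): gcm_init_4bit() fills Htable[n] with
 * the product of H and the 4-bit polynomial encoded by the nibble n, so a
 * full GF(2^128) multiplication Xi·H reduces to 32 table lookups, one per
 * nibble of Xi, interleaved with 4-bit shifts of the accumulator whose
 * shifted-out bits are folded back in via the precomputed rem_4bit[]
 * constants:
 *
 *	Z = Htable[low nibble of Xi[15]];
 *	for (each remaining nibble n of Xi, last byte towards first) {
 *		rem = Z & 0xf;			// bits dropped by the shift
 *		Z   = (Z >> 4) ^ rem_4bit[rem];	// rem_4bit is PACK()ed into
 *						// the top bits of Z
 *		Z  ^= Htable[n];
 *	}
 *	Xi  = byteswap(Z);			// == Xi·H, cf. gcm_gmult_4bit()
 */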
119 #if	TABLE_BITS==8
120 
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123 	int  i, j;
124 	u128 V;
125 
126 	Htable[0].hi = 0;
127 	Htable[0].lo = 0;
128 	V.hi = H[0];
129 	V.lo = H[1];
130 
131 	for (Htable[128]=V, i=64; i>0; i>>=1) {
132 		REDUCE1BIT(V);
133 		Htable[i] = V;
134 	}
135 
136 	for (i=2; i<256; i<<=1) {
137 		u128 *Hi = Htable+i, H0 = *Hi;
138 		for (j=1; j<i; ++j) {
139 			Hi[j].hi = H0.hi^Htable[j].hi;
140 			Hi[j].lo = H0.lo^Htable[j].lo;
141 		}
142 	}
143 }
144 
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147 	u128 Z = { 0, 0};
148 	const u8 *xi = (const u8 *)Xi+15;
149 	size_t rem, n = *xi;
150 	static const size_t rem_8bit[256] = {
151 		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215 
216 	while (1) {
217 		Z.hi ^= Htable[n].hi;
218 		Z.lo ^= Htable[n].lo;
219 
220 		if ((u8 *)Xi==xi)	break;
221 
222 		n = *(--xi);
223 
224 		rem  = (size_t)Z.lo&0xff;
225 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 		Z.hi = (Z.hi>>8);
227 #if SIZE_MAX == 0xffffffffffffffff
228 		Z.hi ^= rem_8bit[rem];
229 #else
230 		Z.hi ^= (u64)rem_8bit[rem]<<32;
231 #endif
232 	}
233 
234 #if BYTE_ORDER == LITTLE_ENDIAN
235 #ifdef BSWAP8
236 	Xi[0] = BSWAP8(Z.hi);
237 	Xi[1] = BSWAP8(Z.lo);
238 #else
239 	u8 *p = (u8 *)Xi;
240 	u32 v;
241 	v = (u32)(Z.hi>>32);	PUTU32(p,v);
242 	v = (u32)(Z.hi);	PUTU32(p+4,v);
243 	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
244 	v = (u32)(Z.lo);	PUTU32(p+12,v);
245 #endif
246 #else /* BIG_ENDIAN */
247 	Xi[0] = Z.hi;
248 	Xi[1] = Z.lo;
249 #endif
250 }
251 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252 
253 #elif	TABLE_BITS==4
254 
255 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256 {
257 	u128 V;
258 #if defined(OPENSSL_SMALL_FOOTPRINT)
259 	int  i;
260 #endif
261 
262 	Htable[0].hi = 0;
263 	Htable[0].lo = 0;
264 	V.hi = H[0];
265 	V.lo = H[1];
266 
267 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 	for (Htable[8]=V, i=4; i>0; i>>=1) {
269 		REDUCE1BIT(V);
270 		Htable[i] = V;
271 	}
272 
273 	for (i=2; i<16; i<<=1) {
274 		u128 *Hi = Htable+i;
275 		int   j;
276 		for (V=*Hi, j=1; j<i; ++j) {
277 			Hi[j].hi = V.hi^Htable[j].hi;
278 			Hi[j].lo = V.lo^Htable[j].lo;
279 		}
280 	}
281 #else
282 	Htable[8] = V;
283 	REDUCE1BIT(V);
284 	Htable[4] = V;
285 	REDUCE1BIT(V);
286 	Htable[2] = V;
287 	REDUCE1BIT(V);
288 	Htable[1] = V;
289 	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
290 	V=Htable[4];
291 	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
292 	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
293 	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
294 	V=Htable[8];
295 	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
296 	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302 #endif
303 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
304 	/*
305 	 * ARM assembler expects specific dword order in Htable.
306 	 */
307 	{
308 		int j;
309 #if BYTE_ORDER == LITTLE_ENDIAN
310 		for (j=0;j<16;++j) {
311 			V = Htable[j];
312 			Htable[j].hi = V.lo;
313 			Htable[j].lo = V.hi;
314 		}
315 #else /* BIG_ENDIAN */
316 		for (j=0;j<16;++j) {
317 			V = Htable[j];
318 			Htable[j].hi = V.lo<<32|V.lo>>32;
319 			Htable[j].lo = V.hi<<32|V.hi>>32;
320 		}
321 #endif
322 	}
323 #endif
324 }
325 
326 #ifndef GHASH_ASM
327 static const size_t rem_4bit[16] = {
328 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332 
333 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334 {
335 	u128 Z;
336 	int cnt = 15;
337 	size_t rem, nlo, nhi;
338 
339 	nlo  = ((const u8 *)Xi)[15];
340 	nhi  = nlo>>4;
341 	nlo &= 0xf;
342 
343 	Z.hi = Htable[nlo].hi;
344 	Z.lo = Htable[nlo].lo;
345 
346 	while (1) {
347 		rem  = (size_t)Z.lo&0xf;
348 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 		Z.hi = (Z.hi>>4);
350 #if SIZE_MAX == 0xffffffffffffffff
351 		Z.hi ^= rem_4bit[rem];
352 #else
353 		Z.hi ^= (u64)rem_4bit[rem]<<32;
354 #endif
355 		Z.hi ^= Htable[nhi].hi;
356 		Z.lo ^= Htable[nhi].lo;
357 
358 		if (--cnt<0)		break;
359 
360 		nlo  = ((const u8 *)Xi)[cnt];
361 		nhi  = nlo>>4;
362 		nlo &= 0xf;
363 
364 		rem  = (size_t)Z.lo&0xf;
365 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 		Z.hi = (Z.hi>>4);
367 #if SIZE_MAX == 0xffffffffffffffff
368 		Z.hi ^= rem_4bit[rem];
369 #else
370 		Z.hi ^= (u64)rem_4bit[rem]<<32;
371 #endif
372 		Z.hi ^= Htable[nlo].hi;
373 		Z.lo ^= Htable[nlo].lo;
374 	}
375 
376 #if BYTE_ORDER == LITTLE_ENDIAN
377 #ifdef BSWAP8
378 	Xi[0] = BSWAP8(Z.hi);
379 	Xi[1] = BSWAP8(Z.lo);
380 #else
381 	u8 *p = (u8 *)Xi;
382 	u32 v;
383 	v = (u32)(Z.hi>>32);	PUTU32(p,v);
384 	v = (u32)(Z.hi);	PUTU32(p+4,v);
385 	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
386 	v = (u32)(Z.lo);	PUTU32(p+12,v);
387 #endif
388 #else /* BIG_ENDIAN */
389 	Xi[0] = Z.hi;
390 	Xi[1] = Z.lo;
391 #endif
392 }
393 
394 #if !defined(OPENSSL_SMALL_FOOTPRINT)
395 /*
396  * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
397  * for details... Compiler-generated code doesn't seem to give any
398  * performance improvement, at least not on x86[_64]. It's here
399  * mostly as a reference and a placeholder for possible future
400  * non-trivial optimization[s]...
401  */
402 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
403 				const u8 *inp,size_t len)
404 {
405     u128 Z;
406     int cnt;
407     size_t rem, nlo, nhi;
408 
409 #if 1
410     do {
411 	cnt  = 15;
412 	nlo  = ((const u8 *)Xi)[15];
413 	nlo ^= inp[15];
414 	nhi  = nlo>>4;
415 	nlo &= 0xf;
416 
417 	Z.hi = Htable[nlo].hi;
418 	Z.lo = Htable[nlo].lo;
419 
420 	while (1) {
421 		rem  = (size_t)Z.lo&0xf;
422 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
423 		Z.hi = (Z.hi>>4);
424 #if SIZE_MAX == 0xffffffffffffffff
425 		Z.hi ^= rem_4bit[rem];
426 #else
427 		Z.hi ^= (u64)rem_4bit[rem]<<32;
428 #endif
429 		Z.hi ^= Htable[nhi].hi;
430 		Z.lo ^= Htable[nhi].lo;
431 
432 		if (--cnt<0)		break;
433 
434 		nlo  = ((const u8 *)Xi)[cnt];
435 		nlo ^= inp[cnt];
436 		nhi  = nlo>>4;
437 		nlo &= 0xf;
438 
439 		rem  = (size_t)Z.lo&0xf;
440 		Z.lo = (Z.hi<<60)|(Z.lo>>4);
441 		Z.hi = (Z.hi>>4);
442 #if SIZE_MAX == 0xffffffffffffffff
443 		Z.hi ^= rem_4bit[rem];
444 #else
445 		Z.hi ^= (u64)rem_4bit[rem]<<32;
446 #endif
447 		Z.hi ^= Htable[nlo].hi;
448 		Z.lo ^= Htable[nlo].lo;
449 	}
450 #else
451     /*
452      * The extra 256+16 bytes per key plus 512 bytes of shared tables
453      * [should] give a ~50% improvement... One could have PACK()-ed
454      * rem_8bit even here, but the priority is to minimize the
455      * cache footprint...
456      */
457     u128 Hshr4[16];	/* Htable shifted right by 4 bits */
458     u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
459     static const unsigned short rem_8bit[256] = {
460 	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
461 	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
462 	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
463 	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
464 	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
465 	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
466 	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
467 	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
468 	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
469 	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
470 	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
471 	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
472 	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
473 	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
474 	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
475 	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
476 	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
477 	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
478 	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
479 	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
480 	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
481 	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
482 	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
483 	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
484 	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
485 	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
486 	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
487 	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
488 	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
489 	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
490 	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
491 	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
492     /*
493      * This pre-processing phase slows the procedure down by roughly the
494      * same amount of time as it makes each loop iteration faster. In other
495      * words, single-block performance is about the same as the plain
496      * "4-bit" implementation, and beyond one block it only gets faster...
497      */
498     for (cnt=0; cnt<16; ++cnt) {
499 	Z.hi = Htable[cnt].hi;
500 	Z.lo = Htable[cnt].lo;
501 	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
502 	Hshr4[cnt].hi = (Z.hi>>4);
503 	Hshl4[cnt]    = (u8)(Z.lo<<4);
504     }
505 
506     do {
507 	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
508 		nlo  = ((const u8 *)Xi)[cnt];
509 		nlo ^= inp[cnt];
510 		nhi  = nlo>>4;
511 		nlo &= 0xf;
512 
513 		Z.hi ^= Htable[nlo].hi;
514 		Z.lo ^= Htable[nlo].lo;
515 
516 		rem = (size_t)Z.lo&0xff;
517 
518 		Z.lo = (Z.hi<<56)|(Z.lo>>8);
519 		Z.hi = (Z.hi>>8);
520 
521 		Z.hi ^= Hshr4[nhi].hi;
522 		Z.lo ^= Hshr4[nhi].lo;
523 		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
524 	}
525 
526 	nlo  = ((const u8 *)Xi)[0];
527 	nlo ^= inp[0];
528 	nhi  = nlo>>4;
529 	nlo &= 0xf;
530 
531 	Z.hi ^= Htable[nlo].hi;
532 	Z.lo ^= Htable[nlo].lo;
533 
534 	rem = (size_t)Z.lo&0xf;
535 
536 	Z.lo = (Z.hi<<60)|(Z.lo>>4);
537 	Z.hi = (Z.hi>>4);
538 
539 	Z.hi ^= Htable[nhi].hi;
540 	Z.lo ^= Htable[nhi].lo;
541 	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
542 #endif
543 
544 #if BYTE_ORDER == LITTLE_ENDIAN
545 #ifdef BSWAP8
546 	Xi[0] = BSWAP8(Z.hi);
547 	Xi[1] = BSWAP8(Z.lo);
548 #else
549 	u8 *p = (u8 *)Xi;
550 	u32 v;
551 	v = (u32)(Z.hi>>32);	PUTU32(p,v);
552 	v = (u32)(Z.hi);	PUTU32(p+4,v);
553 	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
554 	v = (u32)(Z.lo);	PUTU32(p+12,v);
555 #endif
556 #else /* BIG_ENDIAN */
557 	Xi[0] = Z.hi;
558 	Xi[1] = Z.lo;
559 #endif
560     } while (inp+=16, len-=16);
561 }
562 #endif
563 #else
564 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
565 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
566 #endif
567 
568 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
569 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
570 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
571 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
572  * thrashing effect. In other words the idea is to hash data while it
573  * is still in the L1 cache after the encryption pass... */
574 #define GHASH_CHUNK       (3*1024)
575 #endif
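/*
 * Illustrative outline of how GHASH_CHUNK is used by the bulk paths in
 * CRYPTO_gcm128_encrypt() below ("ctr_xor" stands for the inlined CTR
 * keystream loop, it is not a function provided here):
 *
 *	while (len >= GHASH_CHUNK) {
 *		ctr_xor(out, in, GHASH_CHUNK);	// produce ciphertext
 *		GHASH(ctx, out, GHASH_CHUNK);	// hash it while still in L1
 *		out += GHASH_CHUNK; in += GHASH_CHUNK; len -= GHASH_CHUNK;
 *	}
 *
 * The decrypt path does the same with the GHASH() call moved in front of
 * the keystream loop, since there the ciphertext is the input.
 */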
576 
577 #else	/* TABLE_BITS */
578 
579 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
580 {
581 	u128 V,Z = { 0,0 };
582 	long X;
583 	int  i,j;
584 	const long *xi = (const long *)Xi;
585 
586 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
587 	V.lo = H[1];
588 
589 	for (j=0; j<16/sizeof(long); ++j) {
590 #if BYTE_ORDER == LITTLE_ENDIAN
591 #if SIZE_MAX == 0xffffffffffffffff
592 #ifdef BSWAP8
593 			X = (long)(BSWAP8(xi[j]));
594 #else
595 			const u8 *p = (const u8 *)(xi+j);
596 			X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
597 #endif
598 #else
599 			const u8 *p = (const u8 *)(xi+j);
600 			X = (long)GETU32(p);
601 #endif
602 #else /* BIG_ENDIAN */
603 		X = xi[j];
604 #endif
605 
606 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
607 			u64 M = (u64)(X>>(8*sizeof(long)-1));
608 			Z.hi ^= V.hi&M;
609 			Z.lo ^= V.lo&M;
610 
611 			REDUCE1BIT(V);
612 		}
613 	}
614 
615 #if BYTE_ORDER == LITTLE_ENDIAN
616 #ifdef BSWAP8
617 	Xi[0] = BSWAP8(Z.hi);
618 	Xi[1] = BSWAP8(Z.lo);
619 #else
620 	u8 *p = (u8 *)Xi;
621 	u32 v;
622 	v = (u32)(Z.hi>>32);	PUTU32(p,v);
623 	v = (u32)(Z.hi);	PUTU32(p+4,v);
624 	v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
625 	v = (u32)(Z.lo);	PUTU32(p+12,v);
626 #endif
627 #else /* BIG_ENDIAN */
628 	Xi[0] = Z.hi;
629 	Xi[1] = Z.lo;
630 #endif
631 }
632 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
633 
634 #endif
635 
636 #if	defined(GHASH_ASM) && \
637 	(defined(__i386)	|| defined(__i386__)	|| \
638 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
639 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
640 #include "x86_arch.h"
641 #endif
642 
643 #if	TABLE_BITS==4 && defined(GHASH_ASM)
644 # if	(defined(__i386)	|| defined(__i386__)	|| \
645 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
646 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
647 #  define GHASH_ASM_X86_OR_64
648 #  define GCM_FUNCREF_4BIT
649 
650 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
651 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
653 
654 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
655 #   define GHASH_ASM_X86
656 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
657 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
658 
659 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 #  endif
662 # elif defined(__arm__) || defined(__arm)
663 #  include "arm_arch.h"
664 #  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
665 #   define GHASH_ASM_ARM
666 #   define GCM_FUNCREF_4BIT
667 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 #  endif
670 # endif
671 #endif
672 
673 #ifdef GCM_FUNCREF_4BIT
674 # undef  GCM_MUL
675 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
676 # ifdef GHASH
677 #  undef  GHASH
678 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
679 # endif
680 #endif
681 
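/*
 * CRYPTO_gcm128_init() derives the hash subkey H = E_K(0^128) (ctx has
 * just been zeroed, so ctx->H.c is the all-zero block here), converts it
 * to host byte order, precomputes Htable and, where assembler support is
 * compiled in, selects gmult/ghash routines according to the CPU
 * capabilities detected at runtime.
 */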
682 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
683 {
684 	memset(ctx,0,sizeof(*ctx));
685 	ctx->block = block;
686 	ctx->key   = key;
687 
688 	(*block)(ctx->H.c,ctx->H.c,key);
689 
690 #if BYTE_ORDER == LITTLE_ENDIAN
691 	/* H is stored in host byte order */
692 #ifdef BSWAP8
693 	ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
694 	ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
695 #else
696 	u8 *p = ctx->H.c;
697 	u64 hi,lo;
698 	hi = (u64)GETU32(p)  <<32|GETU32(p+4);
699 	lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
700 	ctx->H.u[0] = hi;
701 	ctx->H.u[1] = lo;
702 #endif
703 #endif
704 
705 #if	TABLE_BITS==8
706 	gcm_init_8bit(ctx->Htable,ctx->H.u);
707 #elif	TABLE_BITS==4
708 # if	defined(GHASH_ASM_X86_OR_64)
709 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
710 	/* check FXSR and PCLMULQDQ bits */
711 	if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
712 	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
713 		gcm_init_clmul(ctx->Htable,ctx->H.u);
714 		ctx->gmult = gcm_gmult_clmul;
715 		ctx->ghash = gcm_ghash_clmul;
716 		return;
717 	}
718 #  endif
719 	gcm_init_4bit(ctx->Htable,ctx->H.u);
720 #  if	defined(GHASH_ASM_X86)			/* x86 only */
721 #   if	defined(OPENSSL_IA32_SSE2)
722 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) {	/* check SSE bit */
723 #   else
724 	if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) {	/* check MMX bit */
725 #   endif
726 		ctx->gmult = gcm_gmult_4bit_mmx;
727 		ctx->ghash = gcm_ghash_4bit_mmx;
728 	} else {
729 		ctx->gmult = gcm_gmult_4bit_x86;
730 		ctx->ghash = gcm_ghash_4bit_x86;
731 	}
732 #  else
733 	ctx->gmult = gcm_gmult_4bit;
734 	ctx->ghash = gcm_ghash_4bit;
735 #  endif
736 # elif	defined(GHASH_ASM_ARM)
737 	if (OPENSSL_armcap_P & ARMV7_NEON) {
738 		ctx->gmult = gcm_gmult_neon;
739 		ctx->ghash = gcm_ghash_neon;
740 	} else {
741 		gcm_init_4bit(ctx->Htable,ctx->H.u);
742 		ctx->gmult = gcm_gmult_4bit;
743 		ctx->ghash = gcm_ghash_4bit;
744 	}
745 # else
746 	gcm_init_4bit(ctx->Htable,ctx->H.u);
747 # endif
748 #endif
749 }
750 
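/*
 * CRYPTO_gcm128_setiv() resets the per-message state and computes the
 * pre-counter block Y0: a 96-bit IV is used directly as IV || 0^31 || 1,
 * anything else is fed through GHASH together with its bit length, as
 * specified for GCM. EK0 = E_K(Y0) is cached for the final tag and the
 * counter is then advanced to Y1.
 */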
751 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
752 {
753 	unsigned int ctr;
754 #ifdef GCM_FUNCREF_4BIT
755 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
756 #endif
757 
758 	ctx->Yi.u[0]  = 0;
759 	ctx->Yi.u[1]  = 0;
760 	ctx->Xi.u[0]  = 0;
761 	ctx->Xi.u[1]  = 0;
762 	ctx->len.u[0] = 0;	/* AAD length */
763 	ctx->len.u[1] = 0;	/* message length */
764 	ctx->ares = 0;
765 	ctx->mres = 0;
766 
767 	if (len==12) {
768 		memcpy(ctx->Yi.c,iv,12);
769 		ctx->Yi.c[15]=1;
770 		ctr=1;
771 	}
772 	else {
773 		size_t i;
774 		u64 len0 = len;
775 
776 		while (len>=16) {
777 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
778 			GCM_MUL(ctx,Yi);
779 			iv += 16;
780 			len -= 16;
781 		}
782 		if (len) {
783 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
784 			GCM_MUL(ctx,Yi);
785 		}
786 		len0 <<= 3;
787 #if BYTE_ORDER == LITTLE_ENDIAN
788 #ifdef BSWAP8
789 		ctx->Yi.u[1]  ^= BSWAP8(len0);
790 #else
791 		ctx->Yi.c[8]  ^= (u8)(len0>>56);
792 		ctx->Yi.c[9]  ^= (u8)(len0>>48);
793 		ctx->Yi.c[10] ^= (u8)(len0>>40);
794 		ctx->Yi.c[11] ^= (u8)(len0>>32);
795 		ctx->Yi.c[12] ^= (u8)(len0>>24);
796 		ctx->Yi.c[13] ^= (u8)(len0>>16);
797 		ctx->Yi.c[14] ^= (u8)(len0>>8);
798 		ctx->Yi.c[15] ^= (u8)(len0);
799 #endif
800 #else /* BIG_ENDIAN */
801 		ctx->Yi.u[1]  ^= len0;
802 #endif
803 
804 		GCM_MUL(ctx,Yi);
805 
806 #if BYTE_ORDER == LITTLE_ENDIAN
807 #ifdef BSWAP4
808 		ctr = BSWAP4(ctx->Yi.d[3]);
809 #else
810 		ctr = GETU32(ctx->Yi.c+12);
811 #endif
812 #else /* BIG_ENDIAN */
813 		ctr = ctx->Yi.d[3];
814 #endif
815 	}
816 
817 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
818 	++ctr;
819 #if BYTE_ORDER == LITTLE_ENDIAN
820 #ifdef BSWAP4
821 	ctx->Yi.d[3] = BSWAP4(ctr);
822 #else
823 	PUTU32(ctx->Yi.c+12,ctr);
824 #endif
825 #else /* BIG_ENDIAN */
826 	ctx->Yi.d[3] = ctr;
827 #endif
828 }
829 
830 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
831 {
832 	size_t i;
833 	unsigned int n;
834 	u64 alen = ctx->len.u[0];
835 #ifdef GCM_FUNCREF_4BIT
836 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
837 # ifdef GHASH
838 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
839 				const u8 *inp,size_t len)	= ctx->ghash;
840 # endif
841 #endif
842 
843 	if (ctx->len.u[1]) return -2;
844 
845 	alen += len;
846 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
847 		return -1;
848 	ctx->len.u[0] = alen;
849 
850 	n = ctx->ares;
851 	if (n) {
852 		while (n && len) {
853 			ctx->Xi.c[n] ^= *(aad++);
854 			--len;
855 			n = (n+1)%16;
856 		}
857 		if (n==0) GCM_MUL(ctx,Xi);
858 		else {
859 			ctx->ares = n;
860 			return 0;
861 		}
862 	}
863 
864 #ifdef GHASH
865 	if ((i = (len&(size_t)-16))) {
866 		GHASH(ctx,aad,i);
867 		aad += i;
868 		len -= i;
869 	}
870 #else
871 	while (len>=16) {
872 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
873 		GCM_MUL(ctx,Xi);
874 		aad += 16;
875 		len -= 16;
876 	}
877 #endif
878 	if (len) {
879 		n = (unsigned int)len;
880 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
881 	}
882 
883 	ctx->ares = n;
884 	return 0;
885 }
886 
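/*
 * CRYPTO_gcm128_encrypt() CTR-encrypts len bytes from in to out and
 * accumulates the resulting ciphertext into the GHASH state. Any AAD
 * still buffered in ctx->ares is folded in first, and a partial final
 * block is carried over in ctx->mres so the call can be resumed.
 * Returns 0 on success, -1 if the total message length would exceed
 * the GCM limit of 2^36-32 bytes.
 */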
887 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
888 		const unsigned char *in, unsigned char *out,
889 		size_t len)
890 {
891 	unsigned int n, ctr;
892 	size_t i;
893 	u64        mlen  = ctx->len.u[1];
894 	block128_f block = ctx->block;
895 	void      *key   = ctx->key;
896 #ifdef GCM_FUNCREF_4BIT
897 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
898 # ifdef GHASH
899 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
900 				const u8 *inp,size_t len)	= ctx->ghash;
901 # endif
902 #endif
903 
904 	mlen += len;
905 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 		return -1;
907 	ctx->len.u[1] = mlen;
908 
909 	if (ctx->ares) {
910 		/* First call to encrypt finalizes GHASH(AAD) */
911 		GCM_MUL(ctx,Xi);
912 		ctx->ares = 0;
913 	}
914 
915 #if BYTE_ORDER == LITTLE_ENDIAN
916 #ifdef BSWAP4
917 	ctr = BSWAP4(ctx->Yi.d[3]);
918 #else
919 	ctr = GETU32(ctx->Yi.c+12);
920 #endif
921 #else /* BIG_ENDIAN */
922 	ctr = ctx->Yi.d[3];
923 #endif
924 
925 	n = ctx->mres;
926 #if !defined(OPENSSL_SMALL_FOOTPRINT)
927 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
928 		if (n) {
929 			while (n && len) {
930 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
931 				--len;
932 				n = (n+1)%16;
933 			}
934 			if (n==0) GCM_MUL(ctx,Xi);
935 			else {
936 				ctx->mres = n;
937 				return 0;
938 			}
939 		}
940 #ifdef __STRICT_ALIGNMENT
941 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
942 			break;
943 #endif
944 #if defined(GHASH) && defined(GHASH_CHUNK)
945 		while (len>=GHASH_CHUNK) {
946 		    size_t j=GHASH_CHUNK;
947 
948 		    while (j) {
949 		    	size_t *out_t=(size_t *)out;
950 		    	const size_t *in_t=(const size_t *)in;
951 
952 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
953 			++ctr;
954 #if BYTE_ORDER == LITTLE_ENDIAN
955 #ifdef BSWAP4
956 			ctx->Yi.d[3] = BSWAP4(ctr);
957 #else
958 			PUTU32(ctx->Yi.c+12,ctr);
959 #endif
960 #else /* BIG_ENDIAN */
961 			ctx->Yi.d[3] = ctr;
962 #endif
963 			for (i=0; i<16/sizeof(size_t); ++i)
964 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
965 			out += 16;
966 			in  += 16;
967 			j   -= 16;
968 		    }
969 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
970 		    len -= GHASH_CHUNK;
971 		}
972 		if ((i = (len&(size_t)-16))) {
973 		    size_t j=i;
974 
975 		    while (len>=16) {
976 		    	size_t *out_t=(size_t *)out;
977 		    	const size_t *in_t=(const size_t *)in;
978 
979 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
980 			++ctr;
981 #if BYTE_ORDER == LITTLE_ENDIAN
982 #ifdef BSWAP4
983 			ctx->Yi.d[3] = BSWAP4(ctr);
984 #else
985 			PUTU32(ctx->Yi.c+12,ctr);
986 #endif
987 #else /* BIG_ENDIAN */
988 			ctx->Yi.d[3] = ctr;
989 #endif
990 			for (i=0; i<16/sizeof(size_t); ++i)
991 				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
992 			out += 16;
993 			in  += 16;
994 			len -= 16;
995 		    }
996 		    GHASH(ctx,out-j,j);
997 		}
998 #else
999 		while (len>=16) {
1000 		    	size_t *out_t=(size_t *)out;
1001 		    	const size_t *in_t=(const size_t *)in;
1002 
1003 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1004 			++ctr;
1005 #if BYTE_ORDER == LITTLE_ENDIAN
1006 #ifdef BSWAP4
1007 			ctx->Yi.d[3] = BSWAP4(ctr);
1008 #else
1009 			PUTU32(ctx->Yi.c+12,ctr);
1010 #endif
1011 #else /* BIG_ENDIAN */
1012 			ctx->Yi.d[3] = ctr;
1013 #endif
1014 			for (i=0; i<16/sizeof(size_t); ++i)
1015 				ctx->Xi.t[i] ^=
1016 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1017 			GCM_MUL(ctx,Xi);
1018 			out += 16;
1019 			in  += 16;
1020 			len -= 16;
1021 		}
1022 #endif
1023 		if (len) {
1024 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1025 			++ctr;
1026 #if BYTE_ORDER == LITTLE_ENDIAN
1027 #ifdef BSWAP4
1028 			ctx->Yi.d[3] = BSWAP4(ctr);
1029 #else
1030 			PUTU32(ctx->Yi.c+12,ctr);
1031 #endif
1032 #else /* BIG_ENDIAN */
1033 			ctx->Yi.d[3] = ctr;
1034 #endif
1035 			while (len--) {
1036 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1037 				++n;
1038 			}
1039 		}
1040 
1041 		ctx->mres = n;
1042 		return 0;
1043 	} while(0);
1044 #endif
1045 	for (i=0;i<len;++i) {
1046 		if (n==0) {
1047 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1048 			++ctr;
1049 #if BYTE_ORDER == LITTLE_ENDIAN
1050 #ifdef BSWAP4
1051 			ctx->Yi.d[3] = BSWAP4(ctr);
1052 #else
1053 			PUTU32(ctx->Yi.c+12,ctr);
1054 #endif
1055 #else /* BIG_ENDIAN */
1056 			ctx->Yi.d[3] = ctr;
1057 #endif
1058 		}
1059 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1060 		n = (n+1)%16;
1061 		if (n==0)
1062 			GCM_MUL(ctx,Xi);
1063 	}
1064 
1065 	ctx->mres = n;
1066 	return 0;
1067 }
1068 
1069 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1070 		const unsigned char *in, unsigned char *out,
1071 		size_t len)
1072 {
1073 	unsigned int n, ctr;
1074 	size_t i;
1075 	u64        mlen  = ctx->len.u[1];
1076 	block128_f block = ctx->block;
1077 	void      *key   = ctx->key;
1078 #ifdef GCM_FUNCREF_4BIT
1079 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1080 # ifdef GHASH
1081 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1082 				const u8 *inp,size_t len)	= ctx->ghash;
1083 # endif
1084 #endif
1085 
1086 	mlen += len;
1087 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1088 		return -1;
1089 	ctx->len.u[1] = mlen;
1090 
1091 	if (ctx->ares) {
1092 		/* First call to decrypt finalizes GHASH(AAD) */
1093 		GCM_MUL(ctx,Xi);
1094 		ctx->ares = 0;
1095 	}
1096 
1097 #if BYTE_ORDER == LITTLE_ENDIAN
1098 #ifdef BSWAP4
1099 	ctr = BSWAP4(ctx->Yi.d[3]);
1100 #else
1101 	ctr = GETU32(ctx->Yi.c+12);
1102 #endif
1103 #else /* BIG_ENDIAN */
1104 	ctr = ctx->Yi.d[3];
1105 #endif
1106 
1107 	n = ctx->mres;
1108 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1109 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1110 		if (n) {
1111 			while (n && len) {
1112 				u8 c = *(in++);
1113 				*(out++) = c^ctx->EKi.c[n];
1114 				ctx->Xi.c[n] ^= c;
1115 				--len;
1116 				n = (n+1)%16;
1117 			}
1118 			if (n==0) GCM_MUL (ctx,Xi);
1119 			else {
1120 				ctx->mres = n;
1121 				return 0;
1122 			}
1123 		}
1124 #ifdef __STRICT_ALIGNMENT
1125 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1126 			break;
1127 #endif
1128 #if defined(GHASH) && defined(GHASH_CHUNK)
1129 		while (len>=GHASH_CHUNK) {
1130 		    size_t j=GHASH_CHUNK;
1131 
1132 		    GHASH(ctx,in,GHASH_CHUNK);
1133 		    while (j) {
1134 		    	size_t *out_t=(size_t *)out;
1135 		    	const size_t *in_t=(const size_t *)in;
1136 
1137 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1138 			++ctr;
1139 #if BYTE_ORDER == LITTLE_ENDIAN
1140 #ifdef BSWAP4
1141 			ctx->Yi.d[3] = BSWAP4(ctr);
1142 #else
1143 			PUTU32(ctx->Yi.c+12,ctr);
1144 #endif
1145 #else /* BIG_ENDIAN */
1146 			ctx->Yi.d[3] = ctr;
1147 #endif
1148 			for (i=0; i<16/sizeof(size_t); ++i)
1149 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1150 			out += 16;
1151 			in  += 16;
1152 			j   -= 16;
1153 		    }
1154 		    len -= GHASH_CHUNK;
1155 		}
1156 		if ((i = (len&(size_t)-16))) {
1157 		    GHASH(ctx,in,i);
1158 		    while (len>=16) {
1159 		    	size_t *out_t=(size_t *)out;
1160 		    	const size_t *in_t=(const size_t *)in;
1161 
1162 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1163 			++ctr;
1164 #if BYTE_ORDER == LITTLE_ENDIAN
1165 #ifdef BSWAP4
1166 			ctx->Yi.d[3] = BSWAP4(ctr);
1167 #else
1168 			PUTU32(ctx->Yi.c+12,ctr);
1169 #endif
1170 #else /* BIG_ENDIAN */
1171 			ctx->Yi.d[3] = ctr;
1172 #endif
1173 			for (i=0; i<16/sizeof(size_t); ++i)
1174 				out_t[i] = in_t[i]^ctx->EKi.t[i];
1175 			out += 16;
1176 			in  += 16;
1177 			len -= 16;
1178 		    }
1179 		}
1180 #else
1181 		while (len>=16) {
1182 		    	size_t *out_t=(size_t *)out;
1183 		    	const size_t *in_t=(const size_t *)in;
1184 
1185 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1186 			++ctr;
1187 #if BYTE_ORDER == LITTLE_ENDIAN
1188 #ifdef BSWAP4
1189 			ctx->Yi.d[3] = BSWAP4(ctr);
1190 #else
1191 			PUTU32(ctx->Yi.c+12,ctr);
1192 #endif
1193 #else /* BIG_ENDIAN */
1194 			ctx->Yi.d[3] = ctr;
1195 #endif
1196 			for (i=0; i<16/sizeof(size_t); ++i) {
1197 				size_t c = in[i];
1198 				out[i] = c^ctx->EKi.t[i];
1199 				ctx->Xi.t[i] ^= c;
1200 			}
1201 			GCM_MUL(ctx,Xi);
1202 			out += 16;
1203 			in  += 16;
1204 			len -= 16;
1205 		}
1206 #endif
1207 		if (len) {
1208 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1209 			++ctr;
1210 #if BYTE_ORDER == LITTLE_ENDIAN
1211 #ifdef BSWAP4
1212 			ctx->Yi.d[3] = BSWAP4(ctr);
1213 #else
1214 			PUTU32(ctx->Yi.c+12,ctr);
1215 #endif
1216 #else /* BIG_ENDIAN */
1217 			ctx->Yi.d[3] = ctr;
1218 #endif
1219 			while (len--) {
1220 				u8 c = in[n];
1221 				ctx->Xi.c[n] ^= c;
1222 				out[n] = c^ctx->EKi.c[n];
1223 				++n;
1224 			}
1225 		}
1226 
1227 		ctx->mres = n;
1228 		return 0;
1229 	} while(0);
1230 #endif
1231 	for (i=0;i<len;++i) {
1232 		u8 c;
1233 		if (n==0) {
1234 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1235 			++ctr;
1236 #if BYTE_ORDER == LITTLE_ENDIAN
1237 #ifdef BSWAP4
1238 			ctx->Yi.d[3] = BSWAP4(ctr);
1239 #else
1240 			PUTU32(ctx->Yi.c+12,ctr);
1241 #endif
1242 #else /* BIG_ENDIAN */
1243 			ctx->Yi.d[3] = ctr;
1244 #endif
1245 		}
1246 		c = in[i];
1247 		out[i] = c^ctx->EKi.c[n];
1248 		ctx->Xi.c[n] ^= c;
1249 		n = (n+1)%16;
1250 		if (n==0)
1251 			GCM_MUL(ctx,Xi);
1252 	}
1253 
1254 	ctx->mres = n;
1255 	return 0;
1256 }
1257 
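/*
 * The *_ctr32() variants below do the same as CRYPTO_gcm128_encrypt()/
 * CRYPTO_gcm128_decrypt(), but hand whole blocks to a caller-supplied
 * ctr128_f stream routine (e.g. a hardware-accelerated CTR implementation)
 * instead of invoking ctx->block once per 16-byte block.
 */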
1258 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1259 		const unsigned char *in, unsigned char *out,
1260 		size_t len, ctr128_f stream)
1261 {
1262 	unsigned int n, ctr;
1263 	size_t i;
1264 	u64   mlen = ctx->len.u[1];
1265 	void *key  = ctx->key;
1266 #ifdef GCM_FUNCREF_4BIT
1267 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1268 # ifdef GHASH
1269 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1270 				const u8 *inp,size_t len)	= ctx->ghash;
1271 # endif
1272 #endif
1273 
1274 	mlen += len;
1275 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1276 		return -1;
1277 	ctx->len.u[1] = mlen;
1278 
1279 	if (ctx->ares) {
1280 		/* First call to encrypt finalizes GHASH(AAD) */
1281 		GCM_MUL(ctx,Xi);
1282 		ctx->ares = 0;
1283 	}
1284 
1285 #if BYTE_ORDER == LITTLE_ENDIAN
1286 #ifdef BSWAP4
1287 	ctr = BSWAP4(ctx->Yi.d[3]);
1288 #else
1289 	ctr = GETU32(ctx->Yi.c+12);
1290 #endif
1291 #else /* BIG_ENDIAN */
1292 	ctr = ctx->Yi.d[3];
1293 #endif
1294 
1295 	n = ctx->mres;
1296 	if (n) {
1297 		while (n && len) {
1298 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1299 			--len;
1300 			n = (n+1)%16;
1301 		}
1302 		if (n==0) GCM_MUL(ctx,Xi);
1303 		else {
1304 			ctx->mres = n;
1305 			return 0;
1306 		}
1307 	}
1308 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1309 	while (len>=GHASH_CHUNK) {
1310 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1311 		ctr += GHASH_CHUNK/16;
1312 #if BYTE_ORDER == LITTLE_ENDIAN
1313 #ifdef BSWAP4
1314 		ctx->Yi.d[3] = BSWAP4(ctr);
1315 #else
1316 		PUTU32(ctx->Yi.c+12,ctr);
1317 #endif
1318 #else /* BIG_ENDIAN */
1319 		ctx->Yi.d[3] = ctr;
1320 #endif
1321 		GHASH(ctx,out,GHASH_CHUNK);
1322 		out += GHASH_CHUNK;
1323 		in  += GHASH_CHUNK;
1324 		len -= GHASH_CHUNK;
1325 	}
1326 #endif
1327 	if ((i = (len&(size_t)-16))) {
1328 		size_t j=i/16;
1329 
1330 		(*stream)(in,out,j,key,ctx->Yi.c);
1331 		ctr += (unsigned int)j;
1332 #if BYTE_ORDER == LITTLE_ENDIAN
1333 #ifdef BSWAP4
1334 		ctx->Yi.d[3] = BSWAP4(ctr);
1335 #else
1336 		PUTU32(ctx->Yi.c+12,ctr);
1337 #endif
1338 #else /* BIG_ENDIAN */
1339 		ctx->Yi.d[3] = ctr;
1340 #endif
1341 		in  += i;
1342 		len -= i;
1343 #if defined(GHASH)
1344 		GHASH(ctx,out,i);
1345 		out += i;
1346 #else
1347 		while (j--) {
1348 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1349 			GCM_MUL(ctx,Xi);
1350 			out += 16;
1351 		}
1352 #endif
1353 	}
1354 	if (len) {
1355 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1356 		++ctr;
1357 #if BYTE_ORDER == LITTLE_ENDIAN
1358 #ifdef BSWAP4
1359 		ctx->Yi.d[3] = BSWAP4(ctr);
1360 #else
1361 		PUTU32(ctx->Yi.c+12,ctr);
1362 #endif
1363 #else /* BIG_ENDIAN */
1364 		ctx->Yi.d[3] = ctr;
1365 #endif
1366 		while (len--) {
1367 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1368 			++n;
1369 		}
1370 	}
1371 
1372 	ctx->mres = n;
1373 	return 0;
1374 }
1375 
1376 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1377 		const unsigned char *in, unsigned char *out,
1378 		size_t len,ctr128_f stream)
1379 {
1380 	unsigned int n, ctr;
1381 	size_t i;
1382 	u64   mlen = ctx->len.u[1];
1383 	void *key  = ctx->key;
1384 #ifdef GCM_FUNCREF_4BIT
1385 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1386 # ifdef GHASH
1387 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1388 				const u8 *inp,size_t len)	= ctx->ghash;
1389 # endif
1390 #endif
1391 
1392 	mlen += len;
1393 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1394 		return -1;
1395 	ctx->len.u[1] = mlen;
1396 
1397 	if (ctx->ares) {
1398 		/* First call to decrypt finalizes GHASH(AAD) */
1399 		GCM_MUL(ctx,Xi);
1400 		ctx->ares = 0;
1401 	}
1402 
1403 #if BYTE_ORDER == LITTLE_ENDIAN
1404 #ifdef BSWAP4
1405 	ctr = BSWAP4(ctx->Yi.d[3]);
1406 #else
1407 	ctr = GETU32(ctx->Yi.c+12);
1408 #endif
1409 #else /* BIG_ENDIAN */
1410 	ctr = ctx->Yi.d[3];
1411 #endif
1412 
1413 	n = ctx->mres;
1414 	if (n) {
1415 		while (n && len) {
1416 			u8 c = *(in++);
1417 			*(out++) = c^ctx->EKi.c[n];
1418 			ctx->Xi.c[n] ^= c;
1419 			--len;
1420 			n = (n+1)%16;
1421 		}
1422 		if (n==0) GCM_MUL (ctx,Xi);
1423 		else {
1424 			ctx->mres = n;
1425 			return 0;
1426 		}
1427 	}
1428 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1429 	while (len>=GHASH_CHUNK) {
1430 		GHASH(ctx,in,GHASH_CHUNK);
1431 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1432 		ctr += GHASH_CHUNK/16;
1433 #if BYTE_ORDER == LITTLE_ENDIAN
1434 #ifdef BSWAP4
1435 		ctx->Yi.d[3] = BSWAP4(ctr);
1436 #else
1437 		PUTU32(ctx->Yi.c+12,ctr);
1438 #endif
1439 #else /* BIG_ENDIAN */
1440 		ctx->Yi.d[3] = ctr;
1441 #endif
1442 		out += GHASH_CHUNK;
1443 		in  += GHASH_CHUNK;
1444 		len -= GHASH_CHUNK;
1445 	}
1446 #endif
1447 	if ((i = (len&(size_t)-16))) {
1448 		size_t j=i/16;
1449 
1450 #if defined(GHASH)
1451 		GHASH(ctx,in,i);
1452 #else
1453 		while (j--) {
1454 			size_t k;
1455 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1456 			GCM_MUL(ctx,Xi);
1457 			in += 16;
1458 		}
1459 		j   = i/16;
1460 		in -= i;
1461 #endif
1462 		(*stream)(in,out,j,key,ctx->Yi.c);
1463 		ctr += (unsigned int)j;
1464 #if BYTE_ORDER == LITTLE_ENDIAN
1465 #ifdef BSWAP4
1466 		ctx->Yi.d[3] = BSWAP4(ctr);
1467 #else
1468 		PUTU32(ctx->Yi.c+12,ctr);
1469 #endif
1470 #else /* BIG_ENDIAN */
1471 		ctx->Yi.d[3] = ctr;
1472 #endif
1473 		out += i;
1474 		in  += i;
1475 		len -= i;
1476 	}
1477 	if (len) {
1478 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1479 		++ctr;
1480 #if BYTE_ORDER == LITTLE_ENDIAN
1481 #ifdef BSWAP4
1482 		ctx->Yi.d[3] = BSWAP4(ctr);
1483 #else
1484 		PUTU32(ctx->Yi.c+12,ctr);
1485 #endif
1486 #else /* BIG_ENDIAN */
1487 		ctx->Yi.d[3] = ctr;
1488 #endif
1489 		while (len--) {
1490 			u8 c = in[n];
1491 			ctx->Xi.c[n] ^= c;
1492 			out[n] = c^ctx->EKi.c[n];
1493 			++n;
1494 		}
1495 	}
1496 
1497 	ctx->mres = n;
1498 	return 0;
1499 }
1500 
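/*
 * CRYPTO_gcm128_finish() completes the GHASH computation over the AAD and
 * ciphertext bit lengths, XORs in EK0 to produce the authentication tag,
 * and compares it against the caller-supplied tag; it returns 0 on a match
 * and non-zero otherwise. CRYPTO_gcm128_tag() instead copies the computed
 * tag out for the caller.
 */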
1501 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1502 			size_t len)
1503 {
1504 	u64 alen = ctx->len.u[0]<<3;
1505 	u64 clen = ctx->len.u[1]<<3;
1506 #ifdef GCM_FUNCREF_4BIT
1507 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1508 #endif
1509 
1510 	if (ctx->mres || ctx->ares)
1511 		GCM_MUL(ctx,Xi);
1512 
1513 #if BYTE_ORDER == LITTLE_ENDIAN
1514 #ifdef BSWAP8
1515 	alen = BSWAP8(alen);
1516 	clen = BSWAP8(clen);
1517 #else
1518 	{
1519 		u8 *p = ctx->len.c;
1520 
1521 		ctx->len.u[0] = alen;
1522 		ctx->len.u[1] = clen;
1523 
1524 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1525 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1526 	}
1527 #endif
1528 #endif
1529 
1530 	ctx->Xi.u[0] ^= alen;
1531 	ctx->Xi.u[1] ^= clen;
1532 	GCM_MUL(ctx,Xi);
1533 
1534 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1535 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1536 
1537 	if (tag && len<=sizeof(ctx->Xi))
1538 		return memcmp(ctx->Xi.c,tag,len);
1539 	else
1540 		return -1;
1541 }
1542 
1543 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1544 {
1545 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1546 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1547 }
1548 
1549 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1550 {
1551 	GCM128_CONTEXT *ret;
1552 
1553 	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1554 		CRYPTO_gcm128_init(ret,key,block);
1555 
1556 	return ret;
1557 }
1558 
1559 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1560 {
1561 	freezero(ctx, sizeof(*ctx));
1562 }
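/*
 * Typical call sequence (a minimal illustrative sketch; "ks" and the
 * AES_encrypt-style block routine are placeholders for whatever cipher
 * key schedule and block function the caller provides, they are not
 * defined in this file):
 *
 *	GCM128_CONTEXT ctx;
 *
 *	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(&ctx, iv, iv_len);
 *	if (CRYPTO_gcm128_aad(&ctx, aad, aad_len) != 0 ||
 *	    CRYPTO_gcm128_encrypt(&ctx, pt, ct, pt_len) != 0)
 *		goto err;			// length limits exceeded
 *	CRYPTO_gcm128_tag(&ctx, tag, 16);
 *
 * Decryption follows the same pattern with CRYPTO_gcm128_decrypt() and a
 * final CRYPTO_gcm128_finish(&ctx, expected_tag, 16) == 0 check.
 */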
1563