/* $OpenBSD: gcm128.c,v 1.27 2024/09/06 09:57:32 tb Exp $ */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#define OPENSSL_FIPSAPI

#include <string.h>

#include <openssl/crypto.h>

#include "crypto_internal.h"
#include "modes_local.h"

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif

#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef GETU32
#define GETU32(p)	BSWAP4(*(const u32 *)(p))
#endif

#define PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)	\
	do { \
		if (sizeof(size_t)==8) { \
			u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^T; \
		} else { \
			u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
			V.lo = (V.hi<<63)|(V.lo>>1); \
			V.hi = (V.hi>>1 )^((u64)T<<32); \
		} \
	} while(0)
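
/*
 * Note (informative): GHASH works in GF(2^128) with the reduction
 * polynomial x^128 + x^7 + x^2 + x + 1, using the bit-reflected element
 * representation from the GCM specification.  REDUCE1BIT multiplies V
 * by x in that representation: V is shifted right by one bit and, if a
 * bit falls off the low end, the constant 0xe1000000... (the reflected
 * reduction polynomial) is folded into the top of V.hi.
 */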

/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8.  8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification.  In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why?  In the non-"Shoup's" case the memory access pattern is
 * segmented in such a manner that it is trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it is
 * possible to attempt to deduce the secret parameter H and, if
 * successful, to tamper with messages [which is trivial in CTR mode].
 * In the "Shoup's" case it is not as trivial, but there is no reason to
 * believe that it is resistant to cache-timing attacks.  And the thing
 * about the "8-bit" implementation is that it consumes 16 (sixteen)
 * times more memory, 4KB per individual key + 1KB shared.  On the pro
 * side, it should be twice as fast as the "4-bit" version.  And for
 * gcc-generated x86[_64] code, the "8-bit" version was observed to run
 * ~75% faster, closer to 100% for commercial compilers...  Yet the
 * "4-bit" procedure is preferred, because it is believed to provide a
 * better security-performance balance and adequate all-round
 * performance.  "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free()
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc() would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
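
/*
 * Informal sketch of the 4-bit ("Shoup's") method implemented below:
 * gcm_init_4bit() caches the product of H with every possible 4-bit
 * multiplier value (up to the bit order GCM uses) in Htable[16].
 * gcm_gmult_4bit() then consumes Xi one nibble at a time: per nibble,
 * the accumulator Z is shifted by four bits, the bits that fall out of
 * the 128-bit window are folded back in via the rem_4bit table, and the
 * Htable entry selected by the nibble is XORed in.  This trades 256
 * bytes of per-key table for 32 table lookups per block instead of 128
 * conditional reductions.
 */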
#if TABLE_BITS==8

static void
gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	int i, j;
	u128 V;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 256; i <<= 1) {
		u128 *Hi = Htable + i, H0 = *Hi;
		for (j = 1; j < i; ++j) {
			Hi[j].hi = H0.hi ^ Htable[j].hi;
			Hi[j].lo = H0.lo ^ Htable[j].lo;
		}
	}
}

static void
gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi + 15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi == xi)
			break;

		n = *(--xi);

		rem = (size_t)Z.lo & 0xff;
		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
		Z.hi = (Z.hi >> 8);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_8bit[rem];
#else
		Z.hi ^= (u64)rem_8bit[rem] << 32;
#endif
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)	gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4

static void
gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
		REDUCE1BIT(V);
		Htable[i] = V;
	}

	for (i = 2; i < 16; i <<= 1) {
		u128 *Hi = Htable + i;
		int j;
		for (V = *Hi, j = 1; j < i; ++j) {
			Hi[j].hi = V.hi ^ Htable[j].hi;
			Hi[j].lo = V.lo ^ Htable[j].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
	V = Htable[4];
	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
	V = Htable[8];
	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
	Htable[10].hi = V.hi ^ Htable[2].hi,
	    Htable[10].lo = V.lo ^ Htable[2].lo;
	Htable[11].hi = V.hi ^ Htable[3].hi,
	    Htable[11].lo = V.lo ^ Htable[3].lo;
	Htable[12].hi = V.hi ^ Htable[4].hi,
	    Htable[12].lo = V.lo ^ Htable[4].lo;
	Htable[13].hi = V.hi ^ Htable[5].hi,
	    Htable[13].lo = V.lo ^ Htable[5].lo;
	Htable[14].hi = V.hi ^ Htable[6].hi,
	    Htable[14].lo = V.lo ^ Htable[6].lo;
	Htable[15].hi = V.hi ^ Htable[7].hi,
	    Htable[15].lo = V.lo ^ Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
		int j;
#if BYTE_ORDER == LITTLE_ENDIAN
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo;
			Htable[j].lo = V.hi;
		}
#else /* BIG_ENDIAN */
		for (j = 0; j < 16; ++j) {
			V = Htable[j];
			Htable[j].hi = V.lo << 32|V.lo >> 32;
			Htable[j].lo = V.hi << 32|V.hi >> 32;
		}
#endif
	}
#endif
}

#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

static void
gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	int cnt = 15;
	size_t rem, nlo, nhi;

	nlo = ((const u8 *)Xi)[15];
	nhi = nlo >> 4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt < 0)
			break;

		nlo = ((const u8 *)Xi)[cnt];
		nhi = nlo >> 4;
		nlo &= 0xf;

		rem = (size_t)Z.lo & 0xf;
		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
		Z.hi ^= rem_4bit[rem];
#else
		Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}

#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void
gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
    const u8 *inp, size_t len)
{
	u128 Z;
	int cnt;
	size_t rem, nlo, nhi;

#if 1
	do {
		cnt = 15;
		nlo = ((const u8 *)Xi)[15];
		nlo ^= inp[15];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi = Htable[nlo].hi;
		Z.lo = Htable[nlo].lo;

		while (1) {
			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nhi].hi;
			Z.lo ^= Htable[nhi].lo;

			if (--cnt < 0)
				break;

			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			rem = (size_t)Z.lo & 0xf;
			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
			Z.hi = (Z.hi >> 4);
#if SIZE_MAX == 0xffffffffffffffff
			Z.hi ^= rem_4bit[rem];
#else
			Z.hi ^= (u64)rem_4bit[rem] << 32;
#endif
			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;
		}
#else
	/*
	 * Extra 256+16 bytes per key plus 512 bytes of shared tables
	 * [should] give a ~50% improvement... One could have PACK()-ed
	 * rem_8bit even here, but the priority is to minimize the
	 * cache footprint...
	 */
	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
	u8 Hshl4[16];	/* Htable shifted left by 4 bits */
	static const unsigned short rem_8bit[256] = {
		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
	/*
	 * This pre-processing phase slows the procedure down by roughly
	 * as much as it speeds up each loop iteration. In other words,
	 * single-block performance is about the same as the straightforward
	 * "4-bit" implementation, and from there it only gets faster...
	 */
	for (cnt = 0; cnt < 16; ++cnt) {
		Z.hi = Htable[cnt].hi;
		Z.lo = Htable[cnt].lo;
		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
		Hshr4[cnt].hi = (Z.hi >> 4);
		Hshl4[cnt] = (u8)(Z.lo << 4);
	}

	do {
		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
			nlo = ((const u8 *)Xi)[cnt];
			nlo ^= inp[cnt];
			nhi = nlo >> 4;
			nlo &= 0xf;

			Z.hi ^= Htable[nlo].hi;
			Z.lo ^= Htable[nlo].lo;

			rem = (size_t)Z.lo & 0xff;

			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
			Z.hi = (Z.hi >> 8);

			Z.hi ^= Hshr4[nhi].hi;
			Z.lo ^= Hshr4[nhi].lo;
			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
		}

		nlo = ((const u8 *)Xi)[0];
		nlo ^= inp[0];
		nhi = nlo >> 4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo & 0xf;

		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
		Z.hi = (Z.hi >> 4);

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;
		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#endif

		Xi[0] = htobe64(Z.hi);
		Xi[1] = htobe64(Z.lo);
	} while (inp += 16, len -= 16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#endif

#define GCM_MUL(ctx,Xi)	gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len)	gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate cache
 * thrashing: the idea is to hash data while it is still in the L1
 * cache after the encryption pass... */
#define GHASH_CHUNK	(3*1024)
#endif

#else /* TABLE_BITS */

static void
gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
	u128 V, Z = { 0,0 };
	long X;
	int i, j;
	const long *xi = (const long *)Xi;

	V.hi = H[0];	/* H is in host byte order, no byte swapping */
	V.lo = H[1];

	for (j = 0; j < 16/sizeof(long); ++j) {
#if BYTE_ORDER == LITTLE_ENDIAN
#if SIZE_MAX == 0xffffffffffffffff
#ifdef BSWAP8
		X = (long)(BSWAP8(xi[j]));
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
#endif
#else
		const u8 *p = (const u8 *)(xi + j);
		X = (long)GETU32(p);
#endif
#else /* BIG_ENDIAN */
		X = xi[j];
#endif

		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
			u64 M = (u64)(X >> (8*sizeof(long) - 1));
			Z.hi ^= V.hi & M;
			Z.lo ^= V.lo & M;

			REDUCE1BIT(V);
		}
	}

	Xi[0] = htobe64(Z.hi);
	Xi[1] = htobe64(Z.lo);
}
#define GCM_MUL(ctx,Xi)	gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

#if defined(GHASH_ASM) && \
    (defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#include "x86_arch.h"
#endif

#if TABLE_BITS==4 && defined(GHASH_ASM)
# if (defined(__i386) || defined(__i386__) || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
    size_t len);
#  endif
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
# undef GCM_MUL
# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef GHASH
#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

void
CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
	memset(ctx, 0, sizeof(*ctx));
	ctx->block = block;
	ctx->key = key;

	(*block)(ctx->H.c, ctx->H.c, key);

	/* H is stored in host byte order */
	ctx->H.u[0] = be64toh(ctx->H.u[0]);
	ctx->H.u[1] = be64toh(ctx->H.u[1]);

#if TABLE_BITS==8
	gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
	/* check FXSR and PCLMULQDQ bits */
	if ((crypto_cpu_caps_ia32() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
		gcm_init_clmul(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_clmul;
		ctx->ghash = gcm_ghash_clmul;
		return;
	}
#  endif
	gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86) /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_SSE) {	/* check SSE bit */
#   else
	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_MMX) {	/* check MMX bit */
#   endif
		ctx->gmult = gcm_gmult_4bit_mmx;
		ctx->ghash = gcm_ghash_4bit_mmx;
	} else {
		ctx->gmult = gcm_gmult_4bit_x86;
		ctx->ghash = gcm_ghash_4bit_x86;
	}
#  else
	ctx->gmult = gcm_gmult_4bit;
	ctx->ghash = gcm_ghash_4bit;
#  endif
# elif defined(GHASH_ASM_ARM)
	if (OPENSSL_armcap_P & ARMV7_NEON) {
		ctx->gmult = gcm_gmult_neon;
		ctx->ghash = gcm_ghash_neon;
	} else {
		gcm_init_4bit(ctx->Htable, ctx->H.u);
		ctx->gmult = gcm_gmult_4bit;
		ctx->ghash = gcm_ghash_4bit;
	}
# else
	gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
#endif
}
LCRYPTO_ALIAS(CRYPTO_gcm128_init);

void
CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
{
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	ctx->Yi.u[0] = 0;
	ctx->Yi.u[1] = 0;
	ctx->Xi.u[0] = 0;
	ctx->Xi.u[1] = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

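	/*
	 * Compute Y0: a 96-bit IV is used directly as IV || 0^31 || 1,
	 * while any other length is run through GHASH together with its
	 * bit length, as specified for GCM (NIST SP 800-38D).
	 */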
	if (len == 12) {
		memcpy(ctx->Yi.c, iv, 12);
		ctx->Yi.c[15] = 1;
		ctr = 1;
	} else {
		size_t i;
		u64 len0 = len;

		while (len >= 16) {
			for (i = 0; i < 16; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i = 0; i < len; ++i)
				ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx, Yi);
		}
		len0 <<= 3;
		ctx->Yi.u[1] ^= htobe64(len0);

		GCM_MUL(ctx, Yi);

		ctr = be32toh(ctx->Yi.d[3]);
	}

	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
	++ctr;
	ctx->Yi.d[3] = htobe32(ctr);
}
LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);

int
CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	if (ctx->len.u[1])
		return -2;

	alen += len;
	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	if ((i = (len & (size_t)-16))) {
		GHASH(ctx, aad, i);
		aad += i;
		len -= i;
	}
#else
	while (len >= 16) {
		for (i = 0; i < 16; ++i)
			ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx, Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		n = (unsigned int)len;
		for (i = 0; i < len; ++i)
			ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_aad);

int
CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
					    ctx->EKi.c[n];
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				size_t j = i;

				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
				GHASH(ctx, out - j, j);
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i)
					ctx->Xi.t[i] ^=
					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					ctx->Xi.c[n] ^= out[n] = in[n] ^
					    ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);

int
CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	block128_f block = ctx->block;
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16 % sizeof(size_t) == 0)
		do {	/* always true actually */
			if (n) {
				while (n && len) {
					u8 c = *(in++);
					*(out++) = c ^ ctx->EKi.c[n];
					ctx->Xi.c[n] ^= c;
					--len;
					n = (n + 1) % 16;
				}
				if (n == 0)
					GCM_MUL(ctx, Xi);
				else {
					ctx->mres = n;
					return 0;
				}
			}
#ifdef __STRICT_ALIGNMENT
			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
				break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
			while (len >= GHASH_CHUNK) {
				size_t j = GHASH_CHUNK;

				GHASH(ctx, in, GHASH_CHUNK);
				while (j) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					j -= 16;
				}
				len -= GHASH_CHUNK;
			}
			if ((i = (len & (size_t)-16))) {
				GHASH(ctx, in, i);
				while (len >= 16) {
					size_t *out_t = (size_t *)out;
					const size_t *in_t = (const size_t *)in;

					(*block)(ctx->Yi.c, ctx->EKi.c, key);
					++ctr;
					ctx->Yi.d[3] = htobe32(ctr);

					for (i = 0; i < 16/sizeof(size_t); ++i)
						out_t[i] = in_t[i] ^
						    ctx->EKi.t[i];
					out += 16;
					in += 16;
					len -= 16;
				}
			}
#else
			while (len >= 16) {
				size_t *out_t = (size_t *)out;
				const size_t *in_t = (const size_t *)in;

				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				for (i = 0; i < 16/sizeof(size_t); ++i) {
					size_t c = in_t[i];
					out_t[i] = c ^ ctx->EKi.t[i];
					ctx->Xi.t[i] ^= c;
				}
				GCM_MUL(ctx, Xi);
				out += 16;
				in += 16;
				len -= 16;
			}
#endif
			if (len) {
				(*block)(ctx->Yi.c, ctx->EKi.c, key);
				++ctr;
				ctx->Yi.d[3] = htobe32(ctr);

				while (len--) {
					u8 c = in[n];
					ctx->Xi.c[n] ^= c;
					out[n] = c ^ ctx->EKi.c[n];
					++n;
				}
			}

			ctx->mres = n;
			return 0;
		} while (0);
#endif
	for (i = 0; i < len; ++i) {
		u8 c;
		if (n == 0) {
			(*block)(ctx->Yi.c, ctx->EKi.c, key);
			++ctr;
			ctx->Yi.d[3] = htobe32(ctr);
		}
		c = in[i];
		out[i] = c ^ ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n + 1) % 16;
		if (n == 0)
			GCM_MUL(ctx, Xi);
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);

int
CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		GHASH(ctx, out, GHASH_CHUNK);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		in += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx, out, i);
		out += i;
#else
		while (j--) {
			for (i = 0; i < 16; ++i)
				ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx, Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);

int
CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
    const unsigned char *in, unsigned char *out,
    size_t len, ctr128_f stream)
{
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
	void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
	    const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx, Xi);
		ctx->ares = 0;
	}

	ctr = be32toh(ctx->Yi.d[3]);

	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c ^ ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n + 1) % 16;
		}
		if (n == 0)
			GCM_MUL(ctx, Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	while (len >= GHASH_CHUNK) {
		GHASH(ctx, in, GHASH_CHUNK);
		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		ctx->Yi.d[3] = htobe32(ctr);
		out += GHASH_CHUNK;
		in += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len & (size_t)-16))) {
		size_t j = i/16;

#if defined(GHASH)
		GHASH(ctx, in, i);
#else
		while (j--) {
			size_t k;
			for (k = 0; k < 16; ++k)
				ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx, Xi);
			in += 16;
		}
		j = i/16;
		in -= i;
#endif
		(*stream)(in, out, j, key, ctx->Yi.c);
		ctr += (unsigned int)j;
		ctx->Yi.d[3] = htobe32(ctr);
		out += i;
		in += i;
		len -= i;
	}
	if (len) {
		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
		++ctr;
		ctx->Yi.d[3] = htobe32(ctr);
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c ^ ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);

int
CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
    size_t len)
{
	u64 alen = ctx->len.u[0] << 3;
	u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

	if (ctx->mres || ctx->ares)
		GCM_MUL(ctx, Xi);

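	/*
	 * Xi now holds GHASH over the AAD and ciphertext; fold in the
	 * 64-bit bit lengths and XOR with E_K(Y0) to form the tag.
	 */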
	ctx->Xi.u[0] ^= htobe64(alen);
	ctx->Xi.u[1] ^= htobe64(clen);
	GCM_MUL(ctx, Xi);

	ctx->Xi.u[0] ^= ctx->EK0.u[0];
	ctx->Xi.u[1] ^= ctx->EK0.u[1];

	if (tag && len <= sizeof(ctx->Xi))
		return memcmp(ctx->Xi.c, tag, len);
	else
		return -1;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_finish);

void
CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
	CRYPTO_gcm128_finish(ctx, NULL, 0);
	memcpy(tag, ctx->Xi.c,
	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_tag);

GCM128_CONTEXT *
CRYPTO_gcm128_new(void *key, block128_f block)
{
	GCM128_CONTEXT *ret;

	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
		CRYPTO_gcm128_init(ret, key, block);

	return ret;
}
LCRYPTO_ALIAS(CRYPTO_gcm128_new);

void
CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
	freezero(ctx, sizeof(*ctx));
}
LCRYPTO_ALIAS(CRYPTO_gcm128_release);
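
/*
 * Informal usage sketch (not part of the library): the expected call
 * sequence for this API.  The AES key setup below is an assumption for
 * illustration; any block cipher matching block128_f works, and the
 * key/iv/aad/plaintext buffers are hypothetical caller-provided data.
 *
 *	#include <openssl/aes.h>
 *	#include <openssl/modes.h>
 *
 *	AES_KEY aes;
 *	GCM128_CONTEXT *gcm;
 *	unsigned char tag[16];
 *
 *	AES_set_encrypt_key(key, 128, &aes);
 *	gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(gcm, iv, iv_len);
 *	CRYPTO_gcm128_aad(gcm, aad, aad_len);
 *	CRYPTO_gcm128_encrypt(gcm, plaintext, ciphertext, pt_len);
 *	CRYPTO_gcm128_tag(gcm, tag, sizeof(tag));
 *	CRYPTO_gcm128_release(gcm);
 *
 * Decryption mirrors this with CRYPTO_gcm128_decrypt() and a final
 * CRYPTO_gcm128_finish(gcm, tag, sizeof(tag)), which returns 0 only if
 * the computed tag matches.  The *_ctr32() variants take a ctr128_f
 * that processes multiple counter blocks per call.
 */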