1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49 
50 #define OPENSSL_FIPSAPI
51 
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55 
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62 
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 # undef  GETU32
66 # define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 # undef  PUTU32
68 # define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70 
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
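
/*
 * PACK() places a 16-bit reduction constant in the most significant 16
 * bits of a size_t, e.g. on a 64-bit target PACK(0xE100) evaluates to
 * (size_t)0xE100<<48, so that the rem_4bit/rem_8bit entries below can be
 * XOR-ed straight into the top of Z.hi (or, on 32-bit targets, after an
 * additional <<32).
 */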
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
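
/*
 * REDUCE1BIT(V) multiplies V by x in GF(2^128) using GCM's bit-reflected
 * convention: the 128-bit value is shifted right by one bit and, if the
 * bit that dropped off the low end was set, the constant 0xE1...00
 * (i.e. 1 + x + x^2 + x^7, the reduction of x^128 modulo the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1) is XOR-ed into the top.
 */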
84 
85 /*-
86  * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8: 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89  * to as "Shoup's" in the GCM specification; in other words OpenSSL does
90  * not cover the whole spectrum of possible table-driven implementations.
91  * Why? In the non-"Shoup's" case the memory access pattern is segmented
92  * in such a manner that cache-timing information can plainly reveal a
93  * fair portion of the intermediate hash value. Given that the ciphertext
94  * is always available to an attacker, the attacker could attempt to
95  * deduce the secret parameter H and, if successful, tamper with messages
96  * [which is entirely trivial in CTR mode]. In the "Shoup's" case this is
97  * not as easy, but there is no reason to believe it resists cache-timing
98  * attacks either. The "8-bit" implementation also consumes 16 (sixteen)
99  * times more memory, 4KB per individual key + 1KB shared. On the plus
100  * side it should be roughly twice as fast as the "4-bit" version; for
101  * gcc-generated x86[_64] code the "8-bit" version was observed to run
102  * ~75% faster, closer to 100% with commercial compilers... Yet the
103  * "4-bit" procedure is preferred, because it is believed to provide a
104  * better security/performance balance and adequate all-round
105  * performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - a larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example, on Windows freeing a large
111  *   enough block triggers VM working-set trimming, so a subsequent
112  *   malloc immediately incurs working-set expansion);
113  * - a larger table has a larger cache footprint, which can affect the
114  *   performance of other code paths (not necessarily even in the same
115  *   thread in a Hyper-Threading world);
116  *
117  * A value of 1 is not appropriate, for performance reasons.
118  */
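
/*
 * For reference (sizes implied by the u128 entries used below): the
 * "4-bit" table is 16 entries of 16 bytes = 256 bytes per key, whereas
 * the "8-bit" table is 256 entries of 16 bytes = 4KB per key, which is
 * the 16x difference referred to above.
 */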
119 #if     TABLE_BITS==8
120 
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123     int i, j;
124     u128 V;
125 
126     Htable[0].hi = 0;
127     Htable[0].lo = 0;
128     V.hi = H[0];
129     V.lo = H[1];
130 
131     for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
132         REDUCE1BIT(V);
133         Htable[i] = V;
134     }
135 
136     for (i = 2; i < 256; i <<= 1) {
137         u128 *Hi = Htable + i, H0 = *Hi;
138         for (j = 1; j < i; ++j) {
139             Hi[j].hi = H0.hi ^ Htable[j].hi;
140             Hi[j].lo = H0.lo ^ Htable[j].lo;
141         }
142     }
143 }
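
/*
 * After gcm_init_8bit() returns, Htable[n] holds n*H in GF(2^128), with
 * the byte n read in GHASH bit order: the single-bit indices 128, 64,
 * ..., 1 receive H, H*x, ..., H*x^7, and every composite index is the
 * XOR of its power-of-two components, which is valid because
 * multiplication by H is linear over GF(2).
 */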
144 
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147     u128 Z = { 0, 0 };
148     const u8 *xi = (const u8 *)Xi + 15;
149     size_t rem, n = *xi;
150     const union {
151         long one;
152         char little;
153     } is_endian = {
154         1
155     };
156     static const size_t rem_8bit[256] = {
157         PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
158         PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
159         PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
160         PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
161         PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
162         PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
163         PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
164         PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
165         PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
166         PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
167         PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
168         PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
169         PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
170         PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
171         PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
172         PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
173         PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
174         PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
175         PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
176         PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
177         PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
178         PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
179         PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
180         PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
181         PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
182         PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
183         PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
184         PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
185         PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
186         PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
187         PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
188         PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
189         PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
190         PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
191         PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
192         PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
193         PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
194         PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
195         PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
196         PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
197         PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
198         PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
199         PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
200         PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
201         PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
202         PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
203         PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
204         PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
205         PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
206         PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
207         PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
208         PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
209         PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
210         PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
211         PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
212         PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
213         PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
214         PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
215         PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
216         PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
217         PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
218         PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
219         PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
220         PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
221     };
222 
223     while (1) {
224         Z.hi ^= Htable[n].hi;
225         Z.lo ^= Htable[n].lo;
226 
227         if ((u8 *)Xi == xi)
228             break;
229 
230         n = *(--xi);
231 
232         rem = (size_t)Z.lo & 0xff;
233         Z.lo = (Z.hi << 56) | (Z.lo >> 8);
234         Z.hi = (Z.hi >> 8);
235         if (sizeof(size_t) == 8)
236             Z.hi ^= rem_8bit[rem];
237         else
238             Z.hi ^= (u64)rem_8bit[rem] << 32;
239     }
240 
241     if (is_endian.little) {
242 # ifdef BSWAP8
243         Xi[0] = BSWAP8(Z.hi);
244         Xi[1] = BSWAP8(Z.lo);
245 # else
246         u8 *p = (u8 *)Xi;
247         u32 v;
248         v = (u32)(Z.hi >> 32);
249         PUTU32(p, v);
250         v = (u32)(Z.hi);
251         PUTU32(p + 4, v);
252         v = (u32)(Z.lo >> 32);
253         PUTU32(p + 8, v);
254         v = (u32)(Z.lo);
255         PUTU32(p + 12, v);
256 # endif
257     } else {
258         Xi[0] = Z.hi;
259         Xi[1] = Z.lo;
260     }
261 }
262 
263 # define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
264 
265 #elif   TABLE_BITS==4
266 
267 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
268 {
269     u128 V;
270 # if defined(OPENSSL_SMALL_FOOTPRINT)
271     int i;
272 # endif
273 
274     Htable[0].hi = 0;
275     Htable[0].lo = 0;
276     V.hi = H[0];
277     V.lo = H[1];
278 
279 # if defined(OPENSSL_SMALL_FOOTPRINT)
280     for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
281         REDUCE1BIT(V);
282         Htable[i] = V;
283     }
284 
285     for (i = 2; i < 16; i <<= 1) {
286         u128 *Hi = Htable + i;
287         int j;
288         for (V = *Hi, j = 1; j < i; ++j) {
289             Hi[j].hi = V.hi ^ Htable[j].hi;
290             Hi[j].lo = V.lo ^ Htable[j].lo;
291         }
292     }
293 # else
294     Htable[8] = V;
295     REDUCE1BIT(V);
296     Htable[4] = V;
297     REDUCE1BIT(V);
298     Htable[2] = V;
299     REDUCE1BIT(V);
300     Htable[1] = V;
301     Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
302     V = Htable[4];
303     Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
304     Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
305     Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
306     V = Htable[8];
307     Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
308     Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
309     Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
310     Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
311     Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
312     Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
313     Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
314 # endif
315 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
316     /*
317      * ARM assembler expects specific dword order in Htable.
318      */
319     {
320         int j;
321         const union {
322             long one;
323             char little;
324         } is_endian = {
325             1
326         };
327 
328         if (is_endian.little)
329             for (j = 0; j < 16; ++j) {
330                 V = Htable[j];
331                 Htable[j].hi = V.lo;
332                 Htable[j].lo = V.hi;
333         } else
334             for (j = 0; j < 16; ++j) {
335                 V = Htable[j];
336                 Htable[j].hi = V.lo << 32 | V.lo >> 32;
337                 Htable[j].lo = V.hi << 32 | V.hi >> 32;
338             }
339     }
340 # endif
341 }
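
/*
 * Like the 8-bit table above, Htable[n] ends up holding n*H for the
 * 4-bit index n read in GHASH bit order: Htable[8] = H, Htable[4] = H*x,
 * Htable[2] = H*x^2, Htable[1] = H*x^3, and the remaining entries are
 * XOR combinations of those four.
 */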
342 
343 # ifndef GHASH_ASM
344 static const size_t rem_4bit[16] = {
345     PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
346     PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
347     PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
348     PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
349 };
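
/*
 * rem_4bit[r] is the pre-reduced contribution of the four low bits r
 * that are shifted out of Z in each nibble step of gcm_gmult_4bit()
 * below; PACK() positions the constant so it can be XOR-ed directly
 * into the top of Z.hi.
 */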
350 
351 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
352 {
353     u128 Z;
354     int cnt = 15;
355     size_t rem, nlo, nhi;
356     const union {
357         long one;
358         char little;
359     } is_endian = {
360         1
361     };
362 
363     nlo = ((const u8 *)Xi)[15];
364     nhi = nlo >> 4;
365     nlo &= 0xf;
366 
367     Z.hi = Htable[nlo].hi;
368     Z.lo = Htable[nlo].lo;
369 
370     while (1) {
371         rem = (size_t)Z.lo & 0xf;
372         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
373         Z.hi = (Z.hi >> 4);
374         if (sizeof(size_t) == 8)
375             Z.hi ^= rem_4bit[rem];
376         else
377             Z.hi ^= (u64)rem_4bit[rem] << 32;
378 
379         Z.hi ^= Htable[nhi].hi;
380         Z.lo ^= Htable[nhi].lo;
381 
382         if (--cnt < 0)
383             break;
384 
385         nlo = ((const u8 *)Xi)[cnt];
386         nhi = nlo >> 4;
387         nlo &= 0xf;
388 
389         rem = (size_t)Z.lo & 0xf;
390         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
391         Z.hi = (Z.hi >> 4);
392         if (sizeof(size_t) == 8)
393             Z.hi ^= rem_4bit[rem];
394         else
395             Z.hi ^= (u64)rem_4bit[rem] << 32;
396 
397         Z.hi ^= Htable[nlo].hi;
398         Z.lo ^= Htable[nlo].lo;
399     }
400 
401     if (is_endian.little) {
402 #  ifdef BSWAP8
403         Xi[0] = BSWAP8(Z.hi);
404         Xi[1] = BSWAP8(Z.lo);
405 #  else
406         u8 *p = (u8 *)Xi;
407         u32 v;
408         v = (u32)(Z.hi >> 32);
409         PUTU32(p, v);
410         v = (u32)(Z.hi);
411         PUTU32(p + 4, v);
412         v = (u32)(Z.lo >> 32);
413         PUTU32(p + 8, v);
414         v = (u32)(Z.lo);
415         PUTU32(p + 12, v);
416 #  endif
417     } else {
418         Xi[0] = Z.hi;
419         Xi[1] = Z.lo;
420     }
421 }
422 
423 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
424 /*
425  * Streamed variant of gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt
426  * for details... Compiler-generated code doesn't seem to give any
427  * performance improvement, at least not on x86[_64]. It's here
428  * mostly as a reference and a placeholder for possible future
429  * non-trivial optimization[s]...
430  */
431 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
432                            const u8 *inp, size_t len)
433 {
434     u128 Z;
435     int cnt;
436     size_t rem, nlo, nhi;
437     const union {
438         long one;
439         char little;
440     } is_endian = {
441         1
442     };
443 
444 #   if 1
445     do {
446         cnt = 15;
447         nlo = ((const u8 *)Xi)[15];
448         nlo ^= inp[15];
449         nhi = nlo >> 4;
450         nlo &= 0xf;
451 
452         Z.hi = Htable[nlo].hi;
453         Z.lo = Htable[nlo].lo;
454 
455         while (1) {
456             rem = (size_t)Z.lo & 0xf;
457             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
458             Z.hi = (Z.hi >> 4);
459             if (sizeof(size_t) == 8)
460                 Z.hi ^= rem_4bit[rem];
461             else
462                 Z.hi ^= (u64)rem_4bit[rem] << 32;
463 
464             Z.hi ^= Htable[nhi].hi;
465             Z.lo ^= Htable[nhi].lo;
466 
467             if (--cnt < 0)
468                 break;
469 
470             nlo = ((const u8 *)Xi)[cnt];
471             nlo ^= inp[cnt];
472             nhi = nlo >> 4;
473             nlo &= 0xf;
474 
475             rem = (size_t)Z.lo & 0xf;
476             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
477             Z.hi = (Z.hi >> 4);
478             if (sizeof(size_t) == 8)
479                 Z.hi ^= rem_4bit[rem];
480             else
481                 Z.hi ^= (u64)rem_4bit[rem] << 32;
482 
483             Z.hi ^= Htable[nlo].hi;
484             Z.lo ^= Htable[nlo].lo;
485         }
486 #   else
487     /*
488      * Extra 256+16 bytes per-key plus 512 bytes shared tables
489      * [should] give ~50% improvement... One could have PACK()-ed
490      * the rem_8bit even here, but the priority is to minimize
491      * cache footprint...
492      */
493     u128 Hshr4[16];             /* Htable shifted right by 4 bits */
494     u8 Hshl4[16];               /* Htable shifted left by 4 bits */
495     static const unsigned short rem_8bit[256] = {
496         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
497         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
498         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
499         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
500         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
501         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
502         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
503         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
504         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
505         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
506         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
507         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
508         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
509         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
510         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
511         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
512         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
513         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
514         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
515         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
516         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
517         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
518         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
519         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
520         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
521         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
522         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
523         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
524         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
525         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
526         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
527         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
528     };
529     /*
530      * This pre-processing phase slows the procedure down by roughly as
531      * much as it speeds up each loop iteration. In other words, single
532      * block performance is approximately the same as the straightforward
533      * "4-bit" implementation, and from there on it only gets faster...
534      */
535     for (cnt = 0; cnt < 16; ++cnt) {
536         Z.hi = Htable[cnt].hi;
537         Z.lo = Htable[cnt].lo;
538         Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
539         Hshr4[cnt].hi = (Z.hi >> 4);
540         Hshl4[cnt] = (u8)(Z.lo << 4);
541     }
542 
543     do {
544         for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
545             nlo = ((const u8 *)Xi)[cnt];
546             nlo ^= inp[cnt];
547             nhi = nlo >> 4;
548             nlo &= 0xf;
549 
550             Z.hi ^= Htable[nlo].hi;
551             Z.lo ^= Htable[nlo].lo;
552 
553             rem = (size_t)Z.lo & 0xff;
554 
555             Z.lo = (Z.hi << 56) | (Z.lo >> 8);
556             Z.hi = (Z.hi >> 8);
557 
558             Z.hi ^= Hshr4[nhi].hi;
559             Z.lo ^= Hshr4[nhi].lo;
560             Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
561         }
562 
563         nlo = ((const u8 *)Xi)[0];
564         nlo ^= inp[0];
565         nhi = nlo >> 4;
566         nlo &= 0xf;
567 
568         Z.hi ^= Htable[nlo].hi;
569         Z.lo ^= Htable[nlo].lo;
570 
571         rem = (size_t)Z.lo & 0xf;
572 
573         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
574         Z.hi = (Z.hi >> 4);
575 
576         Z.hi ^= Htable[nhi].hi;
577         Z.lo ^= Htable[nhi].lo;
578         Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
579 #   endif
580 
581         if (is_endian.little) {
582 #   ifdef BSWAP8
583             Xi[0] = BSWAP8(Z.hi);
584             Xi[1] = BSWAP8(Z.lo);
585 #   else
586             u8 *p = (u8 *)Xi;
587             u32 v;
588             v = (u32)(Z.hi >> 32);
589             PUTU32(p, v);
590             v = (u32)(Z.hi);
591             PUTU32(p + 4, v);
592             v = (u32)(Z.lo >> 32);
593             PUTU32(p + 8, v);
594             v = (u32)(Z.lo);
595             PUTU32(p + 12, v);
596 #   endif
597         } else {
598             Xi[0] = Z.hi;
599             Xi[1] = Z.lo;
600         }
601     } while (inp += 16, len -= 16);
602 }
603 #  endif
604 # else
605 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
606 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
607                     size_t len);
608 # endif
609 
610 # define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
611 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
612 #  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
613 /*
614  * GHASH_CHUNK is a "stride parameter" intended to mitigate cache-thrashing
615  * effects. In other words, the idea is to hash data while it is still in
616  * the L1 cache after the encryption pass...
617  */
618 #  define GHASH_CHUNK       (3*1024)
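/*
 * 3*1024 bytes is 192 blocks, small enough to remain resident in a
 * typical 32KB L1 data cache alongside the 256-byte Htable.
 */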
619 # endif
620 
621 #else                           /* TABLE_BITS */
622 
623 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
624 {
625     u128 V, Z = { 0, 0 };
626     long X;
627     int i, j;
628     const long *xi = (const long *)Xi;
629     const union {
630         long one;
631         char little;
632     } is_endian = {
633         1
634     };
635 
636     V.hi = H[0];                /* H is in host byte order, no byte swapping */
637     V.lo = H[1];
638 
639     for (j = 0; j < 16 / sizeof(long); ++j) {
640         if (is_endian.little) {
641             if (sizeof(long) == 8) {
642 # ifdef BSWAP8
643                 X = (long)(BSWAP8(xi[j]));
644 # else
645                 const u8 *p = (const u8 *)(xi + j);
646                 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
647 # endif
648             } else {
649                 const u8 *p = (const u8 *)(xi + j);
650                 X = (long)GETU32(p);
651             }
652         } else
653             X = xi[j];
654 
655         for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
656             u64 M = (u64)(X >> (8 * sizeof(long) - 1));
657             Z.hi ^= V.hi & M;
658             Z.lo ^= V.lo & M;
659 
660             REDUCE1BIT(V);
661         }
662     }
663 
664     if (is_endian.little) {
665 # ifdef BSWAP8
666         Xi[0] = BSWAP8(Z.hi);
667         Xi[1] = BSWAP8(Z.lo);
668 # else
669         u8 *p = (u8 *)Xi;
670         u32 v;
671         v = (u32)(Z.hi >> 32);
672         PUTU32(p, v);
673         v = (u32)(Z.hi);
674         PUTU32(p + 4, v);
675         v = (u32)(Z.lo >> 32);
676         PUTU32(p + 8, v);
677         v = (u32)(Z.lo);
678         PUTU32(p + 12, v);
679 # endif
680     } else {
681         Xi[0] = Z.hi;
682         Xi[1] = Z.lo;
683     }
684 }
685 
686 # define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
687 
688 #endif
689 
690 #if     TABLE_BITS==4 && defined(GHASH_ASM)
691 # if    !defined(I386_ONLY) && \
692         (defined(__i386)        || defined(__i386__)    || \
693          defined(__x86_64)      || defined(__x86_64__)  || \
694          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
695 #  define GHASH_ASM_X86_OR_64
696 #  define GCM_FUNCREF_4BIT
697 extern unsigned int OPENSSL_ia32cap_P[2];
698 
699 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
700 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
701 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
702                      size_t len);
703 
704 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
705 #   define GHASH_ASM_X86
706 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
707 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
708                         size_t len);
709 
710 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
711 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
712                         size_t len);
713 #  endif
714 # elif defined(__arm__) || defined(__arm)
715 #  include "arm_arch.h"
716 #  if __ARM_ARCH__>=7
717 #   define GHASH_ASM_ARM
718 #   define GCM_FUNCREF_4BIT
719 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
720 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
721                     size_t len);
722 #  endif
723 # endif
724 #endif
725 
726 #ifdef GCM_FUNCREF_4BIT
727 # undef  GCM_MUL
728 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
729 # ifdef GHASH
730 #  undef  GHASH
731 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
732 # endif
733 #endif
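
/*
 * With GCM_FUNCREF_4BIT the macros above dispatch through the gmult/ghash
 * function pointers selected in CRYPTO_gcm128_init(): each public routine
 * captures ctx->gmult (and ctx->ghash) into the local gcm_gmult_p and
 * gcm_ghash_p variables that GCM_MUL/GHASH expand to.
 */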
734 
735 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
736 {
737     const union {
738         long one;
739         char little;
740     } is_endian = {
741         1
742     };
743 
744     memset(ctx, 0, sizeof(*ctx));
745     ctx->block = block;
746     ctx->key = key;
747 
748     (*block) (ctx->H.c, ctx->H.c, key);
749 
750     if (is_endian.little) {
751         /* H is stored in host byte order */
752 #ifdef BSWAP8
753         ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
754         ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
755 #else
756         u8 *p = ctx->H.c;
757         u64 hi, lo;
758         hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
759         lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
760         ctx->H.u[0] = hi;
761         ctx->H.u[1] = lo;
762 #endif
763     }
764 #if     TABLE_BITS==8
765     gcm_init_8bit(ctx->Htable, ctx->H.u);
766 #elif   TABLE_BITS==4
767 # if    defined(GHASH_ASM_X86_OR_64)
768 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
769     if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
770         OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
771         gcm_init_clmul(ctx->Htable, ctx->H.u);
772         ctx->gmult = gcm_gmult_clmul;
773         ctx->ghash = gcm_ghash_clmul;
774         return;
775     }
776 #  endif
777     gcm_init_4bit(ctx->Htable, ctx->H.u);
778 #  if   defined(GHASH_ASM_X86)  /* x86 only */
779 #   if  defined(OPENSSL_IA32_SSE2)
780     if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
781 #   else
782     if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
783 #   endif
784         ctx->gmult = gcm_gmult_4bit_mmx;
785         ctx->ghash = gcm_ghash_4bit_mmx;
786     } else {
787         ctx->gmult = gcm_gmult_4bit_x86;
788         ctx->ghash = gcm_ghash_4bit_x86;
789     }
790 #  else
791     ctx->gmult = gcm_gmult_4bit;
792     ctx->ghash = gcm_ghash_4bit;
793 #  endif
794 # elif  defined(GHASH_ASM_ARM)
795     if (OPENSSL_armcap_P & ARMV7_NEON) {
796         ctx->gmult = gcm_gmult_neon;
797         ctx->ghash = gcm_ghash_neon;
798     } else {
799         gcm_init_4bit(ctx->Htable, ctx->H.u);
800         ctx->gmult = gcm_gmult_4bit;
801         ctx->ghash = gcm_ghash_4bit;
802     }
803 # else
804     gcm_init_4bit(ctx->Htable, ctx->H.u);
805 # endif
806 #endif
807 }
808 
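/*
 * CRYPTO_gcm128_setiv() implements the two IV cases of the GCM spec: a
 * 96-bit IV is used directly as Y0 = IV || 0^31 || 1, while any other
 * length is folded through GHASH together with its 64-bit bit length to
 * derive Y0. EK0 = E(K, Y0) is computed here and kept for the final tag,
 * and the counter is advanced so that encryption starts from Y1.
 */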
809 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
810                          size_t len)
811 {
812     const union {
813         long one;
814         char little;
815     } is_endian = {
816         1
817     };
818     unsigned int ctr;
819 #ifdef GCM_FUNCREF_4BIT
820     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
821 #endif
822 
823     ctx->Yi.u[0] = 0;
824     ctx->Yi.u[1] = 0;
825     ctx->Xi.u[0] = 0;
826     ctx->Xi.u[1] = 0;
827     ctx->len.u[0] = 0;          /* AAD length */
828     ctx->len.u[1] = 0;          /* message length */
829     ctx->ares = 0;
830     ctx->mres = 0;
831 
832     if (len == 12) {
833         memcpy(ctx->Yi.c, iv, 12);
834         ctx->Yi.c[15] = 1;
835         ctr = 1;
836     } else {
837         size_t i;
838         u64 len0 = len;
839 
840         while (len >= 16) {
841             for (i = 0; i < 16; ++i)
842                 ctx->Yi.c[i] ^= iv[i];
843             GCM_MUL(ctx, Yi);
844             iv += 16;
845             len -= 16;
846         }
847         if (len) {
848             for (i = 0; i < len; ++i)
849                 ctx->Yi.c[i] ^= iv[i];
850             GCM_MUL(ctx, Yi);
851         }
852         len0 <<= 3;
853         if (is_endian.little) {
854 #ifdef BSWAP8
855             ctx->Yi.u[1] ^= BSWAP8(len0);
856 #else
857             ctx->Yi.c[8] ^= (u8)(len0 >> 56);
858             ctx->Yi.c[9] ^= (u8)(len0 >> 48);
859             ctx->Yi.c[10] ^= (u8)(len0 >> 40);
860             ctx->Yi.c[11] ^= (u8)(len0 >> 32);
861             ctx->Yi.c[12] ^= (u8)(len0 >> 24);
862             ctx->Yi.c[13] ^= (u8)(len0 >> 16);
863             ctx->Yi.c[14] ^= (u8)(len0 >> 8);
864             ctx->Yi.c[15] ^= (u8)(len0);
865 #endif
866         } else
867             ctx->Yi.u[1] ^= len0;
868 
869         GCM_MUL(ctx, Yi);
870 
871         if (is_endian.little)
872 #ifdef BSWAP4
873             ctr = BSWAP4(ctx->Yi.d[3]);
874 #else
875             ctr = GETU32(ctx->Yi.c + 12);
876 #endif
877         else
878             ctr = ctx->Yi.d[3];
879     }
880 
881     (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
882     ++ctr;
883     if (is_endian.little)
884 #ifdef BSWAP4
885         ctx->Yi.d[3] = BSWAP4(ctr);
886 #else
887         PUTU32(ctx->Yi.c + 12, ctr);
888 #endif
889     else
890         ctx->Yi.d[3] = ctr;
891 }
892 
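/*
 * AAD has to be supplied before any message data (once ctx->len.u[1] is
 * non-zero this function returns -2), and it may be fed incrementally:
 * ctx->ares remembers how many bytes of the current 16-byte GHASH block
 * have been filled so far.
 */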
893 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
894                       size_t len)
895 {
896     size_t i;
897     unsigned int n;
898     u64 alen = ctx->len.u[0];
899 #ifdef GCM_FUNCREF_4BIT
900     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
901 # ifdef GHASH
902     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
903                          const u8 *inp, size_t len) = ctx->ghash;
904 # endif
905 #endif
906 
907     if (ctx->len.u[1])
908         return -2;
909 
910     alen += len;
911     if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
912         return -1;
913     ctx->len.u[0] = alen;
914 
915     n = ctx->ares;
916     if (n) {
917         while (n && len) {
918             ctx->Xi.c[n] ^= *(aad++);
919             --len;
920             n = (n + 1) % 16;
921         }
922         if (n == 0)
923             GCM_MUL(ctx, Xi);
924         else {
925             ctx->ares = n;
926             return 0;
927         }
928     }
929 #ifdef GHASH
930     if ((i = (len & (size_t)-16))) {
931         GHASH(ctx, aad, i);
932         aad += i;
933         len -= i;
934     }
935 #else
936     while (len >= 16) {
937         for (i = 0; i < 16; ++i)
938             ctx->Xi.c[i] ^= aad[i];
939         GCM_MUL(ctx, Xi);
940         aad += 16;
941         len -= 16;
942     }
943 #endif
944     if (len) {
945         n = (unsigned int)len;
946         for (i = 0; i < len; ++i)
947             ctx->Xi.c[i] ^= aad[i];
948     }
949 
950     ctx->ares = n;
951     return 0;
952 }
953 
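/*
 * Encryption may likewise be called repeatedly with arbitrary lengths:
 * ctx->mres records how many bytes of the current keystream block EKi
 * have already been consumed, so a partial block left by one call is
 * finished by the next.
 */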
954 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
955                           const unsigned char *in, unsigned char *out,
956                           size_t len)
957 {
958     const union {
959         long one;
960         char little;
961     } is_endian = {
962         1
963     };
964     unsigned int n, ctr;
965     size_t i;
966     u64 mlen = ctx->len.u[1];
967     block128_f block = ctx->block;
968     void *key = ctx->key;
969 #ifdef GCM_FUNCREF_4BIT
970     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
971 # ifdef GHASH
972     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
973                          const u8 *inp, size_t len) = ctx->ghash;
974 # endif
975 #endif
976 
977 #if 0
978     n = (unsigned int)mlen % 16; /* alternative to ctx->mres */
979 #endif
980     mlen += len;
981     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
982         return -1;
983     ctx->len.u[1] = mlen;
984 
985     if (ctx->ares) {
986         /* First call to encrypt finalizes GHASH(AAD) */
987         GCM_MUL(ctx, Xi);
988         ctx->ares = 0;
989     }
990 
991     if (is_endian.little)
992 #ifdef BSWAP4
993         ctr = BSWAP4(ctx->Yi.d[3]);
994 #else
995         ctr = GETU32(ctx->Yi.c + 12);
996 #endif
997     else
998         ctr = ctx->Yi.d[3];
999 
1000     n = ctx->mres;
1001 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1002     if (16 % sizeof(size_t) == 0) { /* always true actually */
1003         do {
1004             if (n) {
1005                 while (n && len) {
1006                     ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1007                     --len;
1008                     n = (n + 1) % 16;
1009                 }
1010                 if (n == 0)
1011                     GCM_MUL(ctx, Xi);
1012                 else {
1013                     ctx->mres = n;
1014                     return 0;
1015                 }
1016             }
1017 # if defined(STRICT_ALIGNMENT)
1018             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1019                 break;
1020 # endif
1021 # if defined(GHASH) && defined(GHASH_CHUNK)
1022             while (len >= GHASH_CHUNK) {
1023                 size_t j = GHASH_CHUNK;
1024 
1025                 while (j) {
1026                     size_t *out_t = (size_t *)out;
1027                     const size_t *in_t = (const size_t *)in;
1028 
1029                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1030                     ++ctr;
1031                     if (is_endian.little)
1032 #  ifdef BSWAP4
1033                         ctx->Yi.d[3] = BSWAP4(ctr);
1034 #  else
1035                         PUTU32(ctx->Yi.c + 12, ctr);
1036 #  endif
1037                     else
1038                         ctx->Yi.d[3] = ctr;
1039                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1040                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1041                     out += 16;
1042                     in += 16;
1043                     j -= 16;
1044                 }
1045                 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1046                 len -= GHASH_CHUNK;
1047             }
1048             if ((i = (len & (size_t)-16))) {
1049                 size_t j = i;
1050 
1051                 while (len >= 16) {
1052                     size_t *out_t = (size_t *)out;
1053                     const size_t *in_t = (const size_t *)in;
1054 
1055                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1056                     ++ctr;
1057                     if (is_endian.little)
1058 #  ifdef BSWAP4
1059                         ctx->Yi.d[3] = BSWAP4(ctr);
1060 #  else
1061                         PUTU32(ctx->Yi.c + 12, ctr);
1062 #  endif
1063                     else
1064                         ctx->Yi.d[3] = ctr;
1065                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1066                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1067                     out += 16;
1068                     in += 16;
1069                     len -= 16;
1070                 }
1071                 GHASH(ctx, out - j, j);
1072             }
1073 # else
1074             while (len >= 16) {
1075                 size_t *out_t = (size_t *)out;
1076                 const size_t *in_t = (const size_t *)in;
1077 
1078                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1079                 ++ctr;
1080                 if (is_endian.little)
1081 #  ifdef BSWAP4
1082                     ctx->Yi.d[3] = BSWAP4(ctr);
1083 #  else
1084                     PUTU32(ctx->Yi.c + 12, ctr);
1085 #  endif
1086                 else
1087                     ctx->Yi.d[3] = ctr;
1088                 for (i = 0; i < 16 / sizeof(size_t); ++i)
1089                     ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1090                 GCM_MUL(ctx, Xi);
1091                 out += 16;
1092                 in += 16;
1093                 len -= 16;
1094             }
1095 # endif
1096             if (len) {
1097                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1098                 ++ctr;
1099                 if (is_endian.little)
1100 # ifdef BSWAP4
1101                     ctx->Yi.d[3] = BSWAP4(ctr);
1102 # else
1103                     PUTU32(ctx->Yi.c + 12, ctr);
1104 # endif
1105                 else
1106                     ctx->Yi.d[3] = ctr;
1107                 while (len--) {
1108                     ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1109                     ++n;
1110                 }
1111             }
1112 
1113             ctx->mres = n;
1114             return 0;
1115         } while (0);
1116     }
1117 #endif
1118     for (i = 0; i < len; ++i) {
1119         if (n == 0) {
1120             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1121             ++ctr;
1122             if (is_endian.little)
1123 #ifdef BSWAP4
1124                 ctx->Yi.d[3] = BSWAP4(ctr);
1125 #else
1126                 PUTU32(ctx->Yi.c + 12, ctr);
1127 #endif
1128             else
1129                 ctx->Yi.d[3] = ctr;
1130         }
1131         ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1132         n = (n + 1) % 16;
1133         if (n == 0)
1134             GCM_MUL(ctx, Xi);
1135     }
1136 
1137     ctx->mres = n;
1138     return 0;
1139 }
1140 
1141 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1142                           const unsigned char *in, unsigned char *out,
1143                           size_t len)
1144 {
1145     const union {
1146         long one;
1147         char little;
1148     } is_endian = {
1149         1
1150     };
1151     unsigned int n, ctr;
1152     size_t i;
1153     u64 mlen = ctx->len.u[1];
1154     block128_f block = ctx->block;
1155     void *key = ctx->key;
1156 #ifdef GCM_FUNCREF_4BIT
1157     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1158 # ifdef GHASH
1159     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1160                          const u8 *inp, size_t len) = ctx->ghash;
1161 # endif
1162 #endif
1163 
1164     mlen += len;
1165     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1166         return -1;
1167     ctx->len.u[1] = mlen;
1168 
1169     if (ctx->ares) {
1170         /* First call to decrypt finalizes GHASH(AAD) */
1171         GCM_MUL(ctx, Xi);
1172         ctx->ares = 0;
1173     }
1174 
1175     if (is_endian.little)
1176 #ifdef BSWAP4
1177         ctr = BSWAP4(ctx->Yi.d[3]);
1178 #else
1179         ctr = GETU32(ctx->Yi.c + 12);
1180 #endif
1181     else
1182         ctr = ctx->Yi.d[3];
1183 
1184     n = ctx->mres;
1185 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1186     if (16 % sizeof(size_t) == 0) { /* always true actually */
1187         do {
1188             if (n) {
1189                 while (n && len) {
1190                     u8 c = *(in++);
1191                     *(out++) = c ^ ctx->EKi.c[n];
1192                     ctx->Xi.c[n] ^= c;
1193                     --len;
1194                     n = (n + 1) % 16;
1195                 }
1196                 if (n == 0)
1197                     GCM_MUL(ctx, Xi);
1198                 else {
1199                     ctx->mres = n;
1200                     return 0;
1201                 }
1202             }
1203 # if defined(STRICT_ALIGNMENT)
1204             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1205                 break;
1206 # endif
1207 # if defined(GHASH) && defined(GHASH_CHUNK)
1208             while (len >= GHASH_CHUNK) {
1209                 size_t j = GHASH_CHUNK;
1210 
1211                 GHASH(ctx, in, GHASH_CHUNK);
1212                 while (j) {
1213                     size_t *out_t = (size_t *)out;
1214                     const size_t *in_t = (const size_t *)in;
1215 
1216                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1217                     ++ctr;
1218                     if (is_endian.little)
1219 #  ifdef BSWAP4
1220                         ctx->Yi.d[3] = BSWAP4(ctr);
1221 #  else
1222                         PUTU32(ctx->Yi.c + 12, ctr);
1223 #  endif
1224                     else
1225                         ctx->Yi.d[3] = ctr;
1226                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1227                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1228                     out += 16;
1229                     in += 16;
1230                     j -= 16;
1231                 }
1232                 len -= GHASH_CHUNK;
1233             }
1234             if ((i = (len & (size_t)-16))) {
1235                 GHASH(ctx, in, i);
1236                 while (len >= 16) {
1237                     size_t *out_t = (size_t *)out;
1238                     const size_t *in_t = (const size_t *)in;
1239 
1240                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1241                     ++ctr;
1242                     if (is_endian.little)
1243 #  ifdef BSWAP4
1244                         ctx->Yi.d[3] = BSWAP4(ctr);
1245 #  else
1246                         PUTU32(ctx->Yi.c + 12, ctr);
1247 #  endif
1248                     else
1249                         ctx->Yi.d[3] = ctr;
1250                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1251                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1252                     out += 16;
1253                     in += 16;
1254                     len -= 16;
1255                 }
1256             }
1257 # else
1258             while (len >= 16) {
1259                 size_t *out_t = (size_t *)out;
1260                 const size_t *in_t = (const size_t *)in;
1261 
1262                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1263                 ++ctr;
1264                 if (is_endian.little)
1265 #  ifdef BSWAP4
1266                     ctx->Yi.d[3] = BSWAP4(ctr);
1267 #  else
1268                     PUTU32(ctx->Yi.c + 12, ctr);
1269 #  endif
1270                 else
1271                     ctx->Yi.d[3] = ctr;
1272                 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1273                     size_t c = in[i];
1274                     out[i] = c ^ ctx->EKi.t[i];
1275                     ctx->Xi.t[i] ^= c;
1276                 }
1277                 GCM_MUL(ctx, Xi);
1278                 out += 16;
1279                 in += 16;
1280                 len -= 16;
1281             }
1282 # endif
1283             if (len) {
1284                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1285                 ++ctr;
1286                 if (is_endian.little)
1287 # ifdef BSWAP4
1288                     ctx->Yi.d[3] = BSWAP4(ctr);
1289 # else
1290                     PUTU32(ctx->Yi.c + 12, ctr);
1291 # endif
1292                 else
1293                     ctx->Yi.d[3] = ctr;
1294                 while (len--) {
1295                     u8 c = in[n];
1296                     ctx->Xi.c[n] ^= c;
1297                     out[n] = c ^ ctx->EKi.c[n];
1298                     ++n;
1299                 }
1300             }
1301 
1302             ctx->mres = n;
1303             return 0;
1304         } while (0);
1305     }
1306 #endif
1307     for (i = 0; i < len; ++i) {
1308         u8 c;
1309         if (n == 0) {
1310             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1311             ++ctr;
1312             if (is_endian.little)
1313 #ifdef BSWAP4
1314                 ctx->Yi.d[3] = BSWAP4(ctr);
1315 #else
1316                 PUTU32(ctx->Yi.c + 12, ctr);
1317 #endif
1318             else
1319                 ctx->Yi.d[3] = ctr;
1320         }
1321         c = in[i];
1322         out[i] = c ^ ctx->EKi.c[n];
1323         ctx->Xi.c[n] ^= c;
1324         n = (n + 1) % 16;
1325         if (n == 0)
1326             GCM_MUL(ctx, Xi);
1327     }
1328 
1329     ctx->mres = n;
1330     return 0;
1331 }
1332 
1333 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1334                                 const unsigned char *in, unsigned char *out,
1335                                 size_t len, ctr128_f stream)
1336 {
1337     const union {
1338         long one;
1339         char little;
1340     } is_endian = {
1341         1
1342     };
1343     unsigned int n, ctr;
1344     size_t i;
1345     u64 mlen = ctx->len.u[1];
1346     void *key = ctx->key;
1347 #ifdef GCM_FUNCREF_4BIT
1348     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1349 # ifdef GHASH
1350     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1351                          const u8 *inp, size_t len) = ctx->ghash;
1352 # endif
1353 #endif
1354 
1355     mlen += len;
1356     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1357         return -1;
1358     ctx->len.u[1] = mlen;
1359 
1360     if (ctx->ares) {
1361         /* First call to encrypt finalizes GHASH(AAD) */
1362         GCM_MUL(ctx, Xi);
1363         ctx->ares = 0;
1364     }
1365 
1366     if (is_endian.little)
1367 #ifdef BSWAP4
1368         ctr = BSWAP4(ctx->Yi.d[3]);
1369 #else
1370         ctr = GETU32(ctx->Yi.c + 12);
1371 #endif
1372     else
1373         ctr = ctx->Yi.d[3];
1374 
1375     n = ctx->mres;
1376     if (n) {
1377         while (n && len) {
1378             ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1379             --len;
1380             n = (n + 1) % 16;
1381         }
1382         if (n == 0)
1383             GCM_MUL(ctx, Xi);
1384         else {
1385             ctx->mres = n;
1386             return 0;
1387         }
1388     }
1389 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1390     while (len >= GHASH_CHUNK) {
1391         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1392         ctr += GHASH_CHUNK / 16;
1393         if (is_endian.little)
1394 # ifdef BSWAP4
1395             ctx->Yi.d[3] = BSWAP4(ctr);
1396 # else
1397             PUTU32(ctx->Yi.c + 12, ctr);
1398 # endif
1399         else
1400             ctx->Yi.d[3] = ctr;
1401         GHASH(ctx, out, GHASH_CHUNK);
1402         out += GHASH_CHUNK;
1403         in += GHASH_CHUNK;
1404         len -= GHASH_CHUNK;
1405     }
1406 #endif
1407     if ((i = (len & (size_t)-16))) {
1408         size_t j = i / 16;
1409 
1410         (*stream) (in, out, j, key, ctx->Yi.c);
1411         ctr += (unsigned int)j;
1412         if (is_endian.little)
1413 #ifdef BSWAP4
1414             ctx->Yi.d[3] = BSWAP4(ctr);
1415 #else
1416             PUTU32(ctx->Yi.c + 12, ctr);
1417 #endif
1418         else
1419             ctx->Yi.d[3] = ctr;
1420         in += i;
1421         len -= i;
1422 #if defined(GHASH)
1423         GHASH(ctx, out, i);
1424         out += i;
1425 #else
1426         while (j--) {
1427             for (i = 0; i < 16; ++i)
1428                 ctx->Xi.c[i] ^= out[i];
1429             GCM_MUL(ctx, Xi);
1430             out += 16;
1431         }
1432 #endif
1433     }
1434     if (len) {
1435         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1436         ++ctr;
1437         if (is_endian.little)
1438 #ifdef BSWAP4
1439             ctx->Yi.d[3] = BSWAP4(ctr);
1440 #else
1441             PUTU32(ctx->Yi.c + 12, ctr);
1442 #endif
1443         else
1444             ctx->Yi.d[3] = ctr;
1445         while (len--) {
1446             ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1447             ++n;
1448         }
1449     }
1450 
1451     ctx->mres = n;
1452     return 0;
1453 }
1454 
1455 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1456                                 const unsigned char *in, unsigned char *out,
1457                                 size_t len, ctr128_f stream)
1458 {
1459     const union {
1460         long one;
1461         char little;
1462     } is_endian = {
1463         1
1464     };
1465     unsigned int n, ctr;
1466     size_t i;
1467     u64 mlen = ctx->len.u[1];
1468     void *key = ctx->key;
1469 #ifdef GCM_FUNCREF_4BIT
1470     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1471 # ifdef GHASH
1472     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1473                          const u8 *inp, size_t len) = ctx->ghash;
1474 # endif
1475 #endif
1476 
1477     mlen += len;
1478     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1479         return -1;
1480     ctx->len.u[1] = mlen;
1481 
1482     if (ctx->ares) {
1483         /* First call to decrypt finalizes GHASH(AAD) */
1484         GCM_MUL(ctx, Xi);
1485         ctx->ares = 0;
1486     }
1487 
1488     if (is_endian.little)
1489 #ifdef BSWAP4
1490         ctr = BSWAP4(ctx->Yi.d[3]);
1491 #else
1492         ctr = GETU32(ctx->Yi.c + 12);
1493 #endif
1494     else
1495         ctr = ctx->Yi.d[3];
1496 
1497     n = ctx->mres;
1498     if (n) {
1499         while (n && len) {
1500             u8 c = *(in++);
1501             *(out++) = c ^ ctx->EKi.c[n];
1502             ctx->Xi.c[n] ^= c;
1503             --len;
1504             n = (n + 1) % 16;
1505         }
1506         if (n == 0)
1507             GCM_MUL(ctx, Xi);
1508         else {
1509             ctx->mres = n;
1510             return 0;
1511         }
1512     }
1513 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1514     while (len >= GHASH_CHUNK) {
1515         GHASH(ctx, in, GHASH_CHUNK);
1516         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1517         ctr += GHASH_CHUNK / 16;
1518         if (is_endian.little)
1519 # ifdef BSWAP4
1520             ctx->Yi.d[3] = BSWAP4(ctr);
1521 # else
1522             PUTU32(ctx->Yi.c + 12, ctr);
1523 # endif
1524         else
1525             ctx->Yi.d[3] = ctr;
1526         out += GHASH_CHUNK;
1527         in += GHASH_CHUNK;
1528         len -= GHASH_CHUNK;
1529     }
1530 #endif
1531     if ((i = (len & (size_t)-16))) {
1532         size_t j = i / 16;
1533 
1534 #if defined(GHASH)
1535         GHASH(ctx, in, i);
1536 #else
1537         while (j--) {
1538             size_t k;
1539             for (k = 0; k < 16; ++k)
1540                 ctx->Xi.c[k] ^= in[k];
1541             GCM_MUL(ctx, Xi);
1542             in += 16;
1543         }
1544         j = i / 16;
1545         in -= i;
1546 #endif
1547         (*stream) (in, out, j, key, ctx->Yi.c);
1548         ctr += (unsigned int)j;
1549         if (is_endian.little)
1550 #ifdef BSWAP4
1551             ctx->Yi.d[3] = BSWAP4(ctr);
1552 #else
1553             PUTU32(ctx->Yi.c + 12, ctr);
1554 #endif
1555         else
1556             ctx->Yi.d[3] = ctr;
1557         out += i;
1558         in += i;
1559         len -= i;
1560     }
1561     if (len) {
1562         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1563         ++ctr;
1564         if (is_endian.little)
1565 #ifdef BSWAP4
1566             ctx->Yi.d[3] = BSWAP4(ctr);
1567 #else
1568             PUTU32(ctx->Yi.c + 12, ctr);
1569 #endif
1570         else
1571             ctx->Yi.d[3] = ctr;
1572         while (len--) {
1573             u8 c = in[n];
1574             ctx->Xi.c[n] ^= c;
1575             out[n] = c ^ ctx->EKi.c[n];
1576             ++n;
1577         }
1578     }
1579 
1580     ctx->mres = n;
1581     return 0;
1582 }
1583 
1584 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1585                          size_t len)
1586 {
1587     const union {
1588         long one;
1589         char little;
1590     } is_endian = {
1591         1
1592     };
1593     u64 alen = ctx->len.u[0] << 3;
1594     u64 clen = ctx->len.u[1] << 3;
1595 #ifdef GCM_FUNCREF_4BIT
1596     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1597 #endif
1598 
1599     if (ctx->mres || ctx->ares)
1600         GCM_MUL(ctx, Xi);
1601 
1602     if (is_endian.little) {
1603 #ifdef BSWAP8
1604         alen = BSWAP8(alen);
1605         clen = BSWAP8(clen);
1606 #else
1607         u8 *p = ctx->len.c;
1608 
1609         ctx->len.u[0] = alen;
1610         ctx->len.u[1] = clen;
1611 
1612         alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1613         clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1614 #endif
1615     }
1616 
1617     ctx->Xi.u[0] ^= alen;
1618     ctx->Xi.u[1] ^= clen;
1619     GCM_MUL(ctx, Xi);
1620 
1621     ctx->Xi.u[0] ^= ctx->EK0.u[0];
1622     ctx->Xi.u[1] ^= ctx->EK0.u[1];
1623 
1624     if (tag && len <= sizeof(ctx->Xi))
1625         return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1626     else
1627         return -1;
1628 }
1629 
1630 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1631 {
1632     CRYPTO_gcm128_finish(ctx, NULL, 0);
1633     memcpy(tag, ctx->Xi.c,
1634            len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1635 }
1636 
1637 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1638 {
1639     GCM128_CONTEXT *ret;
1640 
1641     if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1642         CRYPTO_gcm128_init(ret, key, block);
1643 
1644     return ret;
1645 }
1646 
1647 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1648 {
1649     if (ctx) {
1650         OPENSSL_cleanse(ctx, sizeof(*ctx));
1651         OPENSSL_free(ctx);
1652     }
1653 }
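
/*-
 * Typical call sequence (a minimal sketch, not part of this file's build;
 * the buffer names are placeholders, and AES is used via <openssl/aes.h>
 * just as the SELFTEST code below does):
 *
 *      AES_KEY aes;
 *      GCM128_CONTEXT ctx;
 *      unsigned char tag[16];
 *
 *      AES_set_encrypt_key(key, 128, &aes);
 *      CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&ctx, iv, sizeof(iv));
 *      CRYPTO_gcm128_aad(&ctx, aad, sizeof(aad));
 *      CRYPTO_gcm128_encrypt(&ctx, plaintext, ciphertext, sizeof(plaintext));
 *      CRYPTO_gcm128_tag(&ctx, tag, sizeof(tag));
 *
 * Decryption mirrors this with CRYPTO_gcm128_decrypt() followed by
 * CRYPTO_gcm128_finish(&ctx, expected_tag, 16), which returns 0 only if
 * the computed tag matches.
 */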
1654 
1655 #if defined(SELFTEST)
1656 # include <stdio.h>
1657 # include <openssl/aes.h>
1658 
1659 /* Test Case 1 */
static const u8 K1[16], *P1 = NULL, *A1 = NULL, IV1[12], *C1 = NULL;
static const u8 T1[] = {
    0x58, 0xe2, 0xfc, 0xce, 0xfa, 0x7e, 0x30, 0x61,
    0x36, 0x7f, 0x1d, 0x57, 0xa4, 0xe7, 0x45, 0x5a
};

/* Test Case 2 */
# define K2 K1
# define A2 A1
# define IV2 IV1
static const u8 P2[16];
static const u8 C2[] = {
    0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92,
    0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78
};

static const u8 T2[] = {
    0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd,
    0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf
};

/* Test Case 3 */
# define A3 A2
static const u8 K3[] = {
    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
};

static const u8 P3[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
};

static const u8 IV3[] = {
    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
    0xde, 0xca, 0xf8, 0x88
};

static const u8 C3[] = {
    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
    0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85
};

static const u8 T3[] = {
    0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6,
    0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4
};

/* Test Case 4 */
# define K4 K3
# define IV4 IV3
static const u8 P4[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39
};

static const u8 A4[] = {
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xab, 0xad, 0xda, 0xd2
};

static const u8 C4[] = {
    0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24,
    0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c,
    0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0,
    0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e,
    0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c,
    0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05,
    0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97,
    0x3d, 0x58, 0xe0, 0x91
};

static const u8 T4[] = {
    0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb,
    0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47
};

/* Test Case 5 */
# define K5 K4
# define P5 P4
# define A5 A4
static const u8 IV5[] = {
    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad
};

static const u8 C5[] = {
    0x61, 0x35, 0x3b, 0x4c, 0x28, 0x06, 0x93, 0x4a,
    0x77, 0x7f, 0xf5, 0x1f, 0xa2, 0x2a, 0x47, 0x55,
    0x69, 0x9b, 0x2a, 0x71, 0x4f, 0xcd, 0xc6, 0xf8,
    0x37, 0x66, 0xe5, 0xf9, 0x7b, 0x6c, 0x74, 0x23,
    0x73, 0x80, 0x69, 0x00, 0xe4, 0x9f, 0x24, 0xb2,
    0x2b, 0x09, 0x75, 0x44, 0xd4, 0x89, 0x6b, 0x42,
    0x49, 0x89, 0xb5, 0xe1, 0xeb, 0xac, 0x0f, 0x07,
    0xc2, 0x3f, 0x45, 0x98
};

static const u8 T5[] = {
    0x36, 0x12, 0xd2, 0xe7, 0x9e, 0x3b, 0x07, 0x85,
    0x56, 0x1b, 0xe1, 0x4a, 0xac, 0xa2, 0xfc, 0xcb
};

/* Test Case 6 */
# define K6 K5
# define P6 P5
# define A6 A5
static const u8 IV6[] = {
    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
    0xa6, 0x37, 0xb3, 0x9b
};

static const u8 C6[] = {
    0x8c, 0xe2, 0x49, 0x98, 0x62, 0x56, 0x15, 0xb6,
    0x03, 0xa0, 0x33, 0xac, 0xa1, 0x3f, 0xb8, 0x94,
    0xbe, 0x91, 0x12, 0xa5, 0xc3, 0xa2, 0x11, 0xa8,
    0xba, 0x26, 0x2a, 0x3c, 0xca, 0x7e, 0x2c, 0xa7,
    0x01, 0xe4, 0xa9, 0xa4, 0xfb, 0xa4, 0x3c, 0x90,
    0xcc, 0xdc, 0xb2, 0x81, 0xd4, 0x8c, 0x7c, 0x6f,
    0xd6, 0x28, 0x75, 0xd2, 0xac, 0xa4, 0x17, 0x03,
    0x4c, 0x34, 0xae, 0xe5
};

static const u8 T6[] = {
    0x61, 0x9c, 0xc5, 0xae, 0xff, 0xfe, 0x0b, 0xfa,
    0x46, 0x2a, 0xf4, 0x3c, 0x16, 0x99, 0xd0, 0x50
};

/* Test Case 7 */
static const u8 K7[24], *P7 = NULL, *A7 = NULL, IV7[12], *C7 = NULL;
static const u8 T7[] = {
    0xcd, 0x33, 0xb2, 0x8a, 0xc7, 0x73, 0xf7, 0x4b,
    0xa0, 0x0e, 0xd1, 0xf3, 0x12, 0x57, 0x24, 0x35
};

/* Test Case 8 */
# define K8 K7
# define IV8 IV7
# define A8 A7
static const u8 P8[16];
static const u8 C8[] = {
    0x98, 0xe7, 0x24, 0x7c, 0x07, 0xf0, 0xfe, 0x41,
    0x1c, 0x26, 0x7e, 0x43, 0x84, 0xb0, 0xf6, 0x00
};

static const u8 T8[] = {
    0x2f, 0xf5, 0x8d, 0x80, 0x03, 0x39, 0x27, 0xab,
    0x8e, 0xf4, 0xd4, 0x58, 0x75, 0x14, 0xf0, 0xfb
};

/* Test Case 9 */
# define A9 A8
static const u8 K9[] = {
    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c
};

static const u8 P9[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
};

static const u8 IV9[] = {
    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
    0xde, 0xca, 0xf8, 0x88
};

static const u8 C9[] = {
    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
    0xcc, 0xda, 0x27, 0x10, 0xac, 0xad, 0xe2, 0x56
};

static const u8 T9[] = {
    0x99, 0x24, 0xa7, 0xc8, 0x58, 0x73, 0x36, 0xbf,
    0xb1, 0x18, 0x02, 0x4d, 0xb8, 0x67, 0x4a, 0x14
};

/* Test Case 10 */
# define K10 K9
# define IV10 IV9
static const u8 P10[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39
};

static const u8 A10[] = {
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xab, 0xad, 0xda, 0xd2
};

static const u8 C10[] = {
    0x39, 0x80, 0xca, 0x0b, 0x3c, 0x00, 0xe8, 0x41,
    0xeb, 0x06, 0xfa, 0xc4, 0x87, 0x2a, 0x27, 0x57,
    0x85, 0x9e, 0x1c, 0xea, 0xa6, 0xef, 0xd9, 0x84,
    0x62, 0x85, 0x93, 0xb4, 0x0c, 0xa1, 0xe1, 0x9c,
    0x7d, 0x77, 0x3d, 0x00, 0xc1, 0x44, 0xc5, 0x25,
    0xac, 0x61, 0x9d, 0x18, 0xc8, 0x4a, 0x3f, 0x47,
    0x18, 0xe2, 0x44, 0x8b, 0x2f, 0xe3, 0x24, 0xd9,
    0xcc, 0xda, 0x27, 0x10
};

static const u8 T10[] = {
    0x25, 0x19, 0x49, 0x8e, 0x80, 0xf1, 0x47, 0x8f,
    0x37, 0xba, 0x55, 0xbd, 0x6d, 0x27, 0x61, 0x8c
};

/* Test Case 11 */
# define K11 K10
# define P11 P10
# define A11 A10
static const u8 IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };

static const u8 C11[] = {
    0x0f, 0x10, 0xf5, 0x99, 0xae, 0x14, 0xa1, 0x54,
    0xed, 0x24, 0xb3, 0x6e, 0x25, 0x32, 0x4d, 0xb8,
    0xc5, 0x66, 0x63, 0x2e, 0xf2, 0xbb, 0xb3, 0x4f,
    0x83, 0x47, 0x28, 0x0f, 0xc4, 0x50, 0x70, 0x57,
    0xfd, 0xdc, 0x29, 0xdf, 0x9a, 0x47, 0x1f, 0x75,
    0xc6, 0x65, 0x41, 0xd4, 0xd4, 0xda, 0xd1, 0xc9,
    0xe9, 0x3a, 0x19, 0xa5, 0x8e, 0x8b, 0x47, 0x3f,
    0xa0, 0xf0, 0x62, 0xf7
};

static const u8 T11[] = {
    0x65, 0xdc, 0xc5, 0x7f, 0xcf, 0x62, 0x3a, 0x24,
    0x09, 0x4f, 0xcc, 0xa4, 0x0d, 0x35, 0x33, 0xf8
};

/* Test Case 12 */
# define K12 K11
# define P12 P11
# define A12 A11
static const u8 IV12[] = {
    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
    0xa6, 0x37, 0xb3, 0x9b
};

static const u8 C12[] = {
    0xd2, 0x7e, 0x88, 0x68, 0x1c, 0xe3, 0x24, 0x3c,
    0x48, 0x30, 0x16, 0x5a, 0x8f, 0xdc, 0xf9, 0xff,
    0x1d, 0xe9, 0xa1, 0xd8, 0xe6, 0xb4, 0x47, 0xef,
    0x6e, 0xf7, 0xb7, 0x98, 0x28, 0x66, 0x6e, 0x45,
    0x81, 0xe7, 0x90, 0x12, 0xaf, 0x34, 0xdd, 0xd9,
    0xe2, 0xf0, 0x37, 0x58, 0x9b, 0x29, 0x2d, 0xb3,
    0xe6, 0x7c, 0x03, 0x67, 0x45, 0xfa, 0x22, 0xe7,
    0xe9, 0xb7, 0x37, 0x3b
};

static const u8 T12[] = {
    0xdc, 0xf5, 0x66, 0xff, 0x29, 0x1c, 0x25, 0xbb,
    0xb8, 0x56, 0x8f, 0xc3, 0xd3, 0x76, 0xa6, 0xd9
};

/* Test Case 13 */
static const u8 K13[32], *P13 = NULL, *A13 = NULL, IV13[12], *C13 = NULL;
static const u8 T13[] = {
    0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9,
    0xa9, 0x63, 0xb4, 0xf1, 0xc4, 0xcb, 0x73, 0x8b
};

/* Test Case 14 */
# define K14 K13
# define A14 A13
static const u8 P14[16], IV14[12];
static const u8 C14[] = {
    0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
    0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
};

static const u8 T14[] = {
    0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0,
    0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19
};

/* Test Case 15 */
# define A15 A14
static const u8 K15[] = {
    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
    0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
    0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08
};

static const u8 P15[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
};

static const u8 IV15[] = {
    0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad,
    0xde, 0xca, 0xf8, 0x88
};

static const u8 C15[] = {
    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
};

static const u8 T15[] = {
    0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd,
    0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c
};

/* Test Case 16 */
# define K16 K15
# define IV16 IV15
static const u8 P16[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39
};

static const u8 A16[] = {
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
    0xab, 0xad, 0xda, 0xd2
};

static const u8 C16[] = {
    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
    0xbc, 0xc9, 0xf6, 0x62
};

static const u8 T16[] = {
    0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
    0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b
};

/* Test Case 17 */
# define K17 K16
# define P17 P16
# define A17 A16
static const u8 IV17[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad };

static const u8 C17[] = {
    0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32,
    0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
    0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa,
    0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
    0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0,
    0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
    0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99,
    0xf4, 0x7c, 0x9b, 0x1f
};

static const u8 T17[] = {
    0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4,
    0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2
};

/* Test Case 18 */
# define K18 K17
# define P18 P17
# define A18 A17
static const u8 IV18[] = {
    0x93, 0x13, 0x22, 0x5d, 0xf8, 0x84, 0x06, 0xe5,
    0x55, 0x90, 0x9c, 0x5a, 0xff, 0x52, 0x69, 0xaa,
    0x6a, 0x7a, 0x95, 0x38, 0x53, 0x4f, 0x7d, 0xa1,
    0xe4, 0xc3, 0x03, 0xd2, 0xa3, 0x18, 0xa7, 0x28,
    0xc3, 0xc0, 0xc9, 0x51, 0x56, 0x80, 0x95, 0x39,
    0xfc, 0xf0, 0xe2, 0x42, 0x9a, 0x6b, 0x52, 0x54,
    0x16, 0xae, 0xdb, 0xf5, 0xa0, 0xde, 0x6a, 0x57,
    0xa6, 0x37, 0xb3, 0x9b
};

static const u8 C18[] = {
    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
    0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e,
    0x44, 0xae, 0x7e, 0x3f
};

static const u8 T18[] = {
    0xa4, 0x4a, 0x82, 0x66, 0xee, 0x1c, 0x8e, 0xb0,
    0xc8, 0xb5, 0xd4, 0xcf, 0x5a, 0xe9, 0xf1, 0x9a
};

/* Test Case 19 */
# define K19 K1
# define P19 P1
# define IV19 IV1
# define C19 C1
static const u8 A19[] = {
    0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5,
    0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
    0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda,
    0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
    0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53,
    0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
    0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57,
    0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55,
    0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07,
    0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
    0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9,
    0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
    0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d,
    0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
    0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a,
    0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
};

static const u8 T19[] = {
    0x5f, 0xea, 0x79, 0x3a, 0x2d, 0x6f, 0x97, 0x4d,
    0x37, 0xe6, 0x8e, 0x0c, 0xb8, 0xff, 0x94, 0x92
};

/* Test Case 20 */
# define K20 K1
# define A20 A1
/* this results in 0xff in counter LSB */
static const u8 IV20[64] = { 0xff, 0xff, 0xff, 0xff };

static const u8 P20[288];
static const u8 C20[] = {
    0x56, 0xb3, 0x37, 0x3c, 0xa9, 0xef, 0x6e, 0x4a,
    0x2b, 0x64, 0xfe, 0x1e, 0x9a, 0x17, 0xb6, 0x14,
    0x25, 0xf1, 0x0d, 0x47, 0xa7, 0x5a, 0x5f, 0xce,
    0x13, 0xef, 0xc6, 0xbc, 0x78, 0x4a, 0xf2, 0x4f,
    0x41, 0x41, 0xbd, 0xd4, 0x8c, 0xf7, 0xc7, 0x70,
    0x88, 0x7a, 0xfd, 0x57, 0x3c, 0xca, 0x54, 0x18,
    0xa9, 0xae, 0xff, 0xcd, 0x7c, 0x5c, 0xed, 0xdf,
    0xc6, 0xa7, 0x83, 0x97, 0xb9, 0xa8, 0x5b, 0x49,
    0x9d, 0xa5, 0x58, 0x25, 0x72, 0x67, 0xca, 0xab,
    0x2a, 0xd0, 0xb2, 0x3c, 0xa4, 0x76, 0xa5, 0x3c,
    0xb1, 0x7f, 0xb4, 0x1c, 0x4b, 0x8b, 0x47, 0x5c,
    0xb4, 0xf3, 0xf7, 0x16, 0x50, 0x94, 0xc2, 0x29,
    0xc9, 0xe8, 0xc4, 0xdc, 0x0a, 0x2a, 0x5f, 0xf1,
    0x90, 0x3e, 0x50, 0x15, 0x11, 0x22, 0x13, 0x76,
    0xa1, 0xcd, 0xb8, 0x36, 0x4c, 0x50, 0x61, 0xa2,
    0x0c, 0xae, 0x74, 0xbc, 0x4a, 0xcd, 0x76, 0xce,
    0xb0, 0xab, 0xc9, 0xfd, 0x32, 0x17, 0xef, 0x9f,
    0x8c, 0x90, 0xbe, 0x40, 0x2d, 0xdf, 0x6d, 0x86,
    0x97, 0xf4, 0xf8, 0x80, 0xdf, 0xf1, 0x5b, 0xfb,
    0x7a, 0x6b, 0x28, 0x24, 0x1e, 0xc8, 0xfe, 0x18,
    0x3c, 0x2d, 0x59, 0xe3, 0xf9, 0xdf, 0xff, 0x65,
    0x3c, 0x71, 0x26, 0xf0, 0xac, 0xb9, 0xe6, 0x42,
    0x11, 0xf4, 0x2b, 0xae, 0x12, 0xaf, 0x46, 0x2b,
    0x10, 0x70, 0xbe, 0xf1, 0xab, 0x5e, 0x36, 0x06,
    0x87, 0x2c, 0xa1, 0x0d, 0xee, 0x15, 0xb3, 0x24,
    0x9b, 0x1a, 0x1b, 0x95, 0x8f, 0x23, 0x13, 0x4c,
    0x4b, 0xcc, 0xb7, 0xd0, 0x32, 0x00, 0xbc, 0xe4,
    0x20, 0xa2, 0xf8, 0xeb, 0x66, 0xdc, 0xf3, 0x64,
    0x4d, 0x14, 0x23, 0xc1, 0xb5, 0x69, 0x90, 0x03,
    0xc1, 0x3e, 0xce, 0xf4, 0xbf, 0x38, 0xa3, 0xb6,
    0x0e, 0xed, 0xc3, 0x40, 0x33, 0xba, 0xc1, 0x90,
    0x27, 0x83, 0xdc, 0x6d, 0x89, 0xe2, 0xe7, 0x74,
    0x18, 0x8a, 0x43, 0x9c, 0x7e, 0xbc, 0xc0, 0x67,
    0x2d, 0xbd, 0xa4, 0xdd, 0xcf, 0xb2, 0x79, 0x46,
    0x13, 0xb0, 0xbe, 0x41, 0x31, 0x5e, 0xf7, 0x78,
    0x70, 0x8a, 0x70, 0xee, 0x7d, 0x75, 0x16, 0x5c
};

static const u8 T20[] = {
    0x8b, 0x30, 0x7f, 0x6b, 0x33, 0x28, 0x6d, 0x0a,
    0xb0, 0x26, 0xa9, 0xed, 0x3f, 0xe1, 0xe8, 0x5f
};

/*
 * Run test case |n| in both directions: encrypt P##n and compare the
 * result against C##n/T##n, then decrypt C##n and compare the recovered
 * plaintext and tag against P##n/T##n.
 */
# define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)

int main()
{
    GCM128_CONTEXT ctx;
    AES_KEY key;
    int ret = 0;

    TEST_CASE(1);
    TEST_CASE(2);
    TEST_CASE(3);
    TEST_CASE(4);
    TEST_CASE(5);
    TEST_CASE(6);
    TEST_CASE(7);
    TEST_CASE(8);
    TEST_CASE(9);
    TEST_CASE(10);
    TEST_CASE(11);
    TEST_CASE(12);
    TEST_CASE(13);
    TEST_CASE(14);
    TEST_CASE(15);
    TEST_CASE(16);
    TEST_CASE(17);
    TEST_CASE(18);
    TEST_CASE(19);
    TEST_CASE(20);

# ifdef OPENSSL_CPUID_OBJ
    /*
     * Rough cycle-count comparison: per-byte cost of GCM versus raw CTR
     * mode and, when GHASH is available, of the hash function alone.
     */
    {
        size_t start, stop, gcm_t, ctr_t, OPENSSL_rdtsc();
        union {
            u64 u;
            u8 c[1024];
        } buf;
        int i;

        AES_set_encrypt_key(K1, sizeof(K1) * 8, &key);
        CRYPTO_gcm128_init(&ctx, &key, (block128_f) AES_encrypt);
        CRYPTO_gcm128_setiv(&ctx, IV1, sizeof(IV1));

        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
        start = OPENSSL_rdtsc();
        CRYPTO_gcm128_encrypt(&ctx, buf.c, buf.c, sizeof(buf));
        gcm_t = OPENSSL_rdtsc() - start;

        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
                              (block128_f) AES_encrypt);
        start = OPENSSL_rdtsc();
        CRYPTO_ctr128_encrypt(buf.c, buf.c, sizeof(buf),
                              &key, ctx.Yi.c, ctx.EKi.c, &ctx.mres,
                              (block128_f) AES_encrypt);
        ctr_t = OPENSSL_rdtsc() - start;

        printf("%.2f-%.2f=%.2f\n",
               gcm_t / (double)sizeof(buf),
               ctr_t / (double)sizeof(buf),
               (gcm_t - ctr_t) / (double)sizeof(buf));
#  ifdef GHASH
        {
            void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                                 const u8 *inp, size_t len) = ctx.ghash;

            GHASH((&ctx), buf.c, sizeof(buf));
            start = OPENSSL_rdtsc();
            for (i = 0; i < 100; ++i)
                GHASH((&ctx), buf.c, sizeof(buf));
            gcm_t = OPENSSL_rdtsc() - start;
            printf("%.2f\n", gcm_t / (double)sizeof(buf) / (double)i);
        }
#  endif
    }
# endif

    return ret;
}
#endif