1 /* Twofish for GPG
2  * Copyright (C) 1998, 2002, 2003 Free Software Foundation, Inc.
3  * Written by Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
4  * 256-bit key length added March 20, 1999
5  * Some modifications to reduce the text size by Werner Koch, April, 1998
6  *
7  * This file is part of Libgcrypt.
8  *
9  * Libgcrypt is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as
11  * published by the Free Software Foundation; either version 2.1 of
12  * the License, or (at your option) any later version.
13  *
14  * Libgcrypt is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
22  ********************************************************************
23  *
24  * This code is a "clean room" implementation, written from the paper
25  * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
26  * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
27  * through http://www.counterpane.com/twofish.html
28  *
29  * For background information on multiplication in finite fields, used for
30  * the matrix operations in the key schedule, see the book _Contemporary
31  * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
32  * Third Edition.
33  *
34  * Only the 128- and 256-bit key sizes are supported.  This code is intended
35  * for GNU C on a 32-bit system, but it should work almost anywhere.  Loops
36  * are unrolled, precomputation tables are used, etc., for maximum speed at
37  * some cost in memory consumption. */
38 
39 #include <config.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h> /* for memcmp() */
43 
44 #include "types.h"  /* for byte and u32 typedefs */
45 #include "g10lib.h"
46 #include "cipher.h"
47 #include "bufhelp.h"
48 #include "cipher-internal.h"
49 #include "cipher-selftest.h"
50 
51 
/* Size of one Twofish block in bytes (16 bytes = 128 bits). */
#define TWOFISH_BLOCKSIZE 16
53 
54 
/* USE_AMD64_ASM indicates whether to use AMD64 assembly code.  It is
 * defined only on x86-64 targets for which configure found a compatible
 * assembler (either the native AMD64 flavour or the Win64 one). */
#undef USE_AMD64_ASM
#if defined(__x86_64__)
# if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
#  define USE_AMD64_ASM 1
# endif
#endif
61 
/* USE_ARM_ASM indicates whether to use ARM assembly code.  It is defined
 * for __ARMEL__ targets and for __AARCH64EL__ targets, in each case only
 * when the matching HAVE_COMPATIBLE_GCC_*_PLATFORM_AS macro is defined. */
#undef USE_ARM_ASM
#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
# define USE_ARM_ASM 1
#endif
#if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS)
# define USE_ARM_ASM 1
#endif
74 
/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code.  It needs
 * an x86-64 target, ENABLE_AVX2_SUPPORT, and a compatible assembler
 * (AMD64 or Win64 flavour). */
#undef USE_AVX2
#if defined(__x86_64__) && defined(ENABLE_AVX2_SUPPORT)
# if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
#  define USE_AVX2 1
# endif
#endif
83 
84 
/* Prototype for the self-test function.  It takes no arguments and
 * returns a string.
 * NOTE(review): presumably NULL means success and a non-NULL result
 * describes the failure -- confirm against the definition. */
static const char *selftest(void);
87 
88 
89 /* Prototypes for the bulk functions. */
90 static void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
91 				   void *outbuf_arg, const void *inbuf_arg,
92 				   size_t nblocks);
93 static void _gcry_twofish_cbc_dec (void *context, unsigned char *iv,
94 				   void *outbuf_arg, const void *inbuf_arg,
95 				   size_t nblocks);
96 static void _gcry_twofish_cfb_dec (void *context, unsigned char *iv,
97 				   void *outbuf_arg, const void *inbuf_arg,
98 				   size_t nblocks);
99 static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
100 				       const void *inbuf_arg, size_t nblocks,
101 				       int encrypt);
102 static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
103 				      size_t nblocks);
104 
105 
106 /* Structure for an expanded Twofish key.  s contains the key-dependent
107  * S-boxes composed with the MDS matrix; w contains the eight "whitening"
108  * subkeys, K[0] through K[7].	k holds the remaining, "round" subkeys.  Note
109  * that k[i] corresponds to what the Twofish paper calls K[i+8]. */
110 typedef struct {
111    u32 s[4][256], w[8], k[32];
112 
113 #ifdef USE_AVX2
114   int use_avx2;
115 #endif
116 } TWOFISH_context;
117 
118 
/* Assembly implementations use SystemV ABI, ABI conversion and additional
 * stack to store XMM6-XMM15 needed on Win64.
 *
 * ASM_FUNC_ABI is defined only when USE_AVX2 is defined: it expands to the
 * sysv_abi function attribute when the Win64 assembler flavour is in use
 * and to nothing otherwise. */
#undef ASM_FUNC_ABI
#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#elif defined(USE_AVX2)
# define ASM_FUNC_ABI
#endif
129 
130 
/* These two tables are the q0 and q1 permutations, exactly as described in
 * the Twofish paper.  Each one is a fixed table of 256 bytes, indexed by
 * the input byte value. */

static const byte q0[256] = {
   0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78,
   0xE4, 0xDD, 0xD1, 0x38, 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C,
   0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, 0xF2, 0xD0, 0x8B, 0x30,
   0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82,
   0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE,
   0x16, 0x0C, 0xE3, 0x61, 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B,
   0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, 0xE1, 0xE6, 0xBD, 0x45,
   0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7,
   0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF,
   0x33, 0xC9, 0x62, 0x71, 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8,
   0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, 0xA1, 0x1D, 0xAA, 0xED,
   0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90,
   0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B,
   0x5F, 0x93, 0x0A, 0xEF, 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B,
   0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, 0x2A, 0xCE, 0xCB, 0x2F,
   0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A,
   0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17,
   0x55, 0x1F, 0x8A, 0x7D, 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72,
   0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, 0x6E, 0x50, 0xDE, 0x68,
   0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4,
   0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42,
   0x4A, 0x5E, 0xC1, 0xE0
};
158 
/* The q1 permutation from the Twofish paper: a fixed table of 256 bytes,
 * indexed by the input byte value. */
static const byte q1[256] = {
   0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B,
   0x45, 0x7D, 0xE8, 0x4B, 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1,
   0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, 0x5E, 0xBA, 0xAE, 0x5B,
   0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5,
   0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54,
   0x92, 0x74, 0x36, 0x51, 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96,
   0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, 0x13, 0x95, 0x9C, 0xC7,
   0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8,
   0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF,
   0x40, 0xE7, 0x2B, 0xE2, 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9,
   0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, 0x66, 0x94, 0xA1, 0x1D,
   0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E,
   0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21,
   0xC4, 0x1A, 0xEB, 0xD9, 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01,
   0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, 0x4F, 0xF2, 0x65, 0x8E,
   0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64,
   0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44,
   0xE0, 0x4D, 0x43, 0x69, 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E,
   0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, 0x22, 0xC9, 0xC0, 0x9B,
   0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9,
   0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56,
   0x55, 0x09, 0xBE, 0x91
};
183 
/* These MDS tables are actually tables of MDS composed with q0 and q1,
 * because it is only ever used that way and we can save some time by
 * precomputing.  Of course the main saving comes from precomputing the
 * GF(2^8) multiplication involved in the MDS matrix multiply; by looking
 * things up in these tables we reduce the matrix multiply to four lookups
 * and three XORs.  Semi-formally, the definition of these tables is:
 * mds[0][i] = MDS (q1[i] 0 0 0)^T  mds[1][i] = MDS (0 q0[i] 0 0)^T
 * mds[2][i] = MDS (0 0 q1[i] 0)^T  mds[3][i] = MDS (0 0 0 q0[i])^T
 * where ^T means "transpose", the matrix multiply is performed in GF(2^8)
 * represented as GF(2)[x]/v(x) where v(x)=x^8+x^6+x^5+x^3+1 as described
 * by Schneier et al, and I'm casually glossing over the byte/word
 * conversion issues.
 *
 * For orientation when reading the entries: the byte that equals the
 * plain q value is the least significant byte in mds[0], the most
 * significant byte in mds[1], the second most significant byte in mds[2]
 * and the second least significant byte in mds[3]. */

static const u32 mds[4][256] = {
   /* mds[0][i] = MDS (q1[i] 0 0 0)^T */
   {0xBCBC3275, 0xECEC21F3, 0x202043C6, 0xB3B3C9F4, 0xDADA03DB, 0x02028B7B,
    0xE2E22BFB, 0x9E9EFAC8, 0xC9C9EC4A, 0xD4D409D3, 0x18186BE6, 0x1E1E9F6B,
    0x98980E45, 0xB2B2387D, 0xA6A6D2E8, 0x2626B74B, 0x3C3C57D6, 0x93938A32,
    0x8282EED8, 0x525298FD, 0x7B7BD437, 0xBBBB3771, 0x5B5B97F1, 0x474783E1,
    0x24243C30, 0x5151E20F, 0xBABAC6F8, 0x4A4AF31B, 0xBFBF4887, 0x0D0D70FA,
    0xB0B0B306, 0x7575DE3F, 0xD2D2FD5E, 0x7D7D20BA, 0x666631AE, 0x3A3AA35B,
    0x59591C8A, 0x00000000, 0xCDCD93BC, 0x1A1AE09D, 0xAEAE2C6D, 0x7F7FABC1,
    0x2B2BC7B1, 0xBEBEB90E, 0xE0E0A080, 0x8A8A105D, 0x3B3B52D2, 0x6464BAD5,
    0xD8D888A0, 0xE7E7A584, 0x5F5FE807, 0x1B1B1114, 0x2C2CC2B5, 0xFCFCB490,
    0x3131272C, 0x808065A3, 0x73732AB2, 0x0C0C8173, 0x79795F4C, 0x6B6B4154,
    0x4B4B0292, 0x53536974, 0x94948F36, 0x83831F51, 0x2A2A3638, 0xC4C49CB0,
    0x2222C8BD, 0xD5D5F85A, 0xBDBDC3FC, 0x48487860, 0xFFFFCE62, 0x4C4C0796,
    0x4141776C, 0xC7C7E642, 0xEBEB24F7, 0x1C1C1410, 0x5D5D637C, 0x36362228,
    0x6767C027, 0xE9E9AF8C, 0x4444F913, 0x1414EA95, 0xF5F5BB9C, 0xCFCF18C7,
    0x3F3F2D24, 0xC0C0E346, 0x7272DB3B, 0x54546C70, 0x29294CCA, 0xF0F035E3,
    0x0808FE85, 0xC6C617CB, 0xF3F34F11, 0x8C8CE4D0, 0xA4A45993, 0xCACA96B8,
    0x68683BA6, 0xB8B84D83, 0x38382820, 0xE5E52EFF, 0xADAD569F, 0x0B0B8477,
    0xC8C81DC3, 0x9999FFCC, 0x5858ED03, 0x19199A6F, 0x0E0E0A08, 0x95957EBF,
    0x70705040, 0xF7F730E7, 0x6E6ECF2B, 0x1F1F6EE2, 0xB5B53D79, 0x09090F0C,
    0x616134AA, 0x57571682, 0x9F9F0B41, 0x9D9D803A, 0x111164EA, 0x2525CDB9,
    0xAFAFDDE4, 0x4545089A, 0xDFDF8DA4, 0xA3A35C97, 0xEAEAD57E, 0x353558DA,
    0xEDEDD07A, 0x4343FC17, 0xF8F8CB66, 0xFBFBB194, 0x3737D3A1, 0xFAFA401D,
    0xC2C2683D, 0xB4B4CCF0, 0x32325DDE, 0x9C9C71B3, 0x5656E70B, 0xE3E3DA72,
    0x878760A7, 0x15151B1C, 0xF9F93AEF, 0x6363BFD1, 0x3434A953, 0x9A9A853E,
    0xB1B1428F, 0x7C7CD133, 0x88889B26, 0x3D3DA65F, 0xA1A1D7EC, 0xE4E4DF76,
    0x8181942A, 0x91910149, 0x0F0FFB81, 0xEEEEAA88, 0x161661EE, 0xD7D77321,
    0x9797F5C4, 0xA5A5A81A, 0xFEFE3FEB, 0x6D6DB5D9, 0x7878AEC5, 0xC5C56D39,
    0x1D1DE599, 0x7676A4CD, 0x3E3EDCAD, 0xCBCB6731, 0xB6B6478B, 0xEFEF5B01,
    0x12121E18, 0x6060C523, 0x6A6AB0DD, 0x4D4DF61F, 0xCECEE94E, 0xDEDE7C2D,
    0x55559DF9, 0x7E7E5A48, 0x2121B24F, 0x03037AF2, 0xA0A02665, 0x5E5E198E,
    0x5A5A6678, 0x65654B5C, 0x62624E58, 0xFDFD4519, 0x0606F48D, 0x404086E5,
    0xF2F2BE98, 0x3333AC57, 0x17179067, 0x05058E7F, 0xE8E85E05, 0x4F4F7D64,
    0x89896AAF, 0x10109563, 0x74742FB6, 0x0A0A75FE, 0x5C5C92F5, 0x9B9B74B7,
    0x2D2D333C, 0x3030D6A5, 0x2E2E49CE, 0x494989E9, 0x46467268, 0x77775544,
    0xA8A8D8E0, 0x9696044D, 0x2828BD43, 0xA9A92969, 0xD9D97929, 0x8686912E,
    0xD1D187AC, 0xF4F44A15, 0x8D8D1559, 0xD6D682A8, 0xB9B9BC0A, 0x42420D9E,
    0xF6F6C16E, 0x2F2FB847, 0xDDDD06DF, 0x23233934, 0xCCCC6235, 0xF1F1C46A,
    0xC1C112CF, 0x8585EBDC, 0x8F8F9E22, 0x7171A1C9, 0x9090F0C0, 0xAAAA539B,
    0x0101F189, 0x8B8BE1D4, 0x4E4E8CED, 0x8E8E6FAB, 0xABABA212, 0x6F6F3EA2,
    0xE6E6540D, 0xDBDBF252, 0x92927BBB, 0xB7B7B602, 0x6969CA2F, 0x3939D9A9,
    0xD3D30CD7, 0xA7A72361, 0xA2A2AD1E, 0xC3C399B4, 0x6C6C4450, 0x07070504,
    0x04047FF6, 0x272746C2, 0xACACA716, 0xD0D07625, 0x50501386, 0xDCDCF756,
    0x84841A55, 0xE1E15109, 0x7A7A25BE, 0x1313EF91},

   /* mds[1][i] = MDS (0 q0[i] 0 0)^T */
   {0xA9D93939, 0x67901717, 0xB3719C9C, 0xE8D2A6A6, 0x04050707, 0xFD985252,
    0xA3658080, 0x76DFE4E4, 0x9A084545, 0x92024B4B, 0x80A0E0E0, 0x78665A5A,
    0xE4DDAFAF, 0xDDB06A6A, 0xD1BF6363, 0x38362A2A, 0x0D54E6E6, 0xC6432020,
    0x3562CCCC, 0x98BEF2F2, 0x181E1212, 0xF724EBEB, 0xECD7A1A1, 0x6C774141,
    0x43BD2828, 0x7532BCBC, 0x37D47B7B, 0x269B8888, 0xFA700D0D, 0x13F94444,
    0x94B1FBFB, 0x485A7E7E, 0xF27A0303, 0xD0E48C8C, 0x8B47B6B6, 0x303C2424,
    0x84A5E7E7, 0x54416B6B, 0xDF06DDDD, 0x23C56060, 0x1945FDFD, 0x5BA33A3A,
    0x3D68C2C2, 0x59158D8D, 0xF321ECEC, 0xAE316666, 0xA23E6F6F, 0x82165757,
    0x63951010, 0x015BEFEF, 0x834DB8B8, 0x2E918686, 0xD9B56D6D, 0x511F8383,
    0x9B53AAAA, 0x7C635D5D, 0xA63B6868, 0xEB3FFEFE, 0xA5D63030, 0xBE257A7A,
    0x16A7ACAC, 0x0C0F0909, 0xE335F0F0, 0x6123A7A7, 0xC0F09090, 0x8CAFE9E9,
    0x3A809D9D, 0xF5925C5C, 0x73810C0C, 0x2C273131, 0x2576D0D0, 0x0BE75656,
    0xBB7B9292, 0x4EE9CECE, 0x89F10101, 0x6B9F1E1E, 0x53A93434, 0x6AC4F1F1,
    0xB499C3C3, 0xF1975B5B, 0xE1834747, 0xE66B1818, 0xBDC82222, 0x450E9898,
    0xE26E1F1F, 0xF4C9B3B3, 0xB62F7474, 0x66CBF8F8, 0xCCFF9999, 0x95EA1414,
    0x03ED5858, 0x56F7DCDC, 0xD4E18B8B, 0x1C1B1515, 0x1EADA2A2, 0xD70CD3D3,
    0xFB2BE2E2, 0xC31DC8C8, 0x8E195E5E, 0xB5C22C2C, 0xE9894949, 0xCF12C1C1,
    0xBF7E9595, 0xBA207D7D, 0xEA641111, 0x77840B0B, 0x396DC5C5, 0xAF6A8989,
    0x33D17C7C, 0xC9A17171, 0x62CEFFFF, 0x7137BBBB, 0x81FB0F0F, 0x793DB5B5,
    0x0951E1E1, 0xADDC3E3E, 0x242D3F3F, 0xCDA47676, 0xF99D5555, 0xD8EE8282,
    0xE5864040, 0xC5AE7878, 0xB9CD2525, 0x4D049696, 0x44557777, 0x080A0E0E,
    0x86135050, 0xE730F7F7, 0xA1D33737, 0x1D40FAFA, 0xAA346161, 0xED8C4E4E,
    0x06B3B0B0, 0x706C5454, 0xB22A7373, 0xD2523B3B, 0x410B9F9F, 0x7B8B0202,
    0xA088D8D8, 0x114FF3F3, 0x3167CBCB, 0xC2462727, 0x27C06767, 0x90B4FCFC,
    0x20283838, 0xF67F0404, 0x60784848, 0xFF2EE5E5, 0x96074C4C, 0x5C4B6565,
    0xB1C72B2B, 0xAB6F8E8E, 0x9E0D4242, 0x9CBBF5F5, 0x52F2DBDB, 0x1BF34A4A,
    0x5FA63D3D, 0x9359A4A4, 0x0ABCB9B9, 0xEF3AF9F9, 0x91EF1313, 0x85FE0808,
    0x49019191, 0xEE611616, 0x2D7CDEDE, 0x4FB22121, 0x8F42B1B1, 0x3BDB7272,
    0x47B82F2F, 0x8748BFBF, 0x6D2CAEAE, 0x46E3C0C0, 0xD6573C3C, 0x3E859A9A,
    0x6929A9A9, 0x647D4F4F, 0x2A948181, 0xCE492E2E, 0xCB17C6C6, 0x2FCA6969,
    0xFCC3BDBD, 0x975CA3A3, 0x055EE8E8, 0x7AD0EDED, 0xAC87D1D1, 0x7F8E0505,
    0xD5BA6464, 0x1AA8A5A5, 0x4BB72626, 0x0EB9BEBE, 0xA7608787, 0x5AF8D5D5,
    0x28223636, 0x14111B1B, 0x3FDE7575, 0x2979D9D9, 0x88AAEEEE, 0x3C332D2D,
    0x4C5F7979, 0x02B6B7B7, 0xB896CACA, 0xDA583535, 0xB09CC4C4, 0x17FC4343,
    0x551A8484, 0x1FF64D4D, 0x8A1C5959, 0x7D38B2B2, 0x57AC3333, 0xC718CFCF,
    0x8DF40606, 0x74695353, 0xB7749B9B, 0xC4F59797, 0x9F56ADAD, 0x72DAE3E3,
    0x7ED5EAEA, 0x154AF4F4, 0x229E8F8F, 0x12A2ABAB, 0x584E6262, 0x07E85F5F,
    0x99E51D1D, 0x34392323, 0x6EC1F6F6, 0x50446C6C, 0xDE5D3232, 0x68724646,
    0x6526A0A0, 0xBC93CDCD, 0xDB03DADA, 0xF8C6BABA, 0xC8FA9E9E, 0xA882D6D6,
    0x2BCF6E6E, 0x40507070, 0xDCEB8585, 0xFE750A0A, 0x328A9393, 0xA48DDFDF,
    0xCA4C2929, 0x10141C1C, 0x2173D7D7, 0xF0CCB4B4, 0xD309D4D4, 0x5D108A8A,
    0x0FE25151, 0x00000000, 0x6F9A1919, 0x9DE01A1A, 0x368F9494, 0x42E6C7C7,
    0x4AECC9C9, 0x5EFDD2D2, 0xC1AB7F7F, 0xE0D8A8A8},

   /* mds[2][i] = MDS (0 0 q1[i] 0)^T */
   {0xBC75BC32, 0xECF3EC21, 0x20C62043, 0xB3F4B3C9, 0xDADBDA03, 0x027B028B,
    0xE2FBE22B, 0x9EC89EFA, 0xC94AC9EC, 0xD4D3D409, 0x18E6186B, 0x1E6B1E9F,
    0x9845980E, 0xB27DB238, 0xA6E8A6D2, 0x264B26B7, 0x3CD63C57, 0x9332938A,
    0x82D882EE, 0x52FD5298, 0x7B377BD4, 0xBB71BB37, 0x5BF15B97, 0x47E14783,
    0x2430243C, 0x510F51E2, 0xBAF8BAC6, 0x4A1B4AF3, 0xBF87BF48, 0x0DFA0D70,
    0xB006B0B3, 0x753F75DE, 0xD25ED2FD, 0x7DBA7D20, 0x66AE6631, 0x3A5B3AA3,
    0x598A591C, 0x00000000, 0xCDBCCD93, 0x1A9D1AE0, 0xAE6DAE2C, 0x7FC17FAB,
    0x2BB12BC7, 0xBE0EBEB9, 0xE080E0A0, 0x8A5D8A10, 0x3BD23B52, 0x64D564BA,
    0xD8A0D888, 0xE784E7A5, 0x5F075FE8, 0x1B141B11, 0x2CB52CC2, 0xFC90FCB4,
    0x312C3127, 0x80A38065, 0x73B2732A, 0x0C730C81, 0x794C795F, 0x6B546B41,
    0x4B924B02, 0x53745369, 0x9436948F, 0x8351831F, 0x2A382A36, 0xC4B0C49C,
    0x22BD22C8, 0xD55AD5F8, 0xBDFCBDC3, 0x48604878, 0xFF62FFCE, 0x4C964C07,
    0x416C4177, 0xC742C7E6, 0xEBF7EB24, 0x1C101C14, 0x5D7C5D63, 0x36283622,
    0x672767C0, 0xE98CE9AF, 0x441344F9, 0x149514EA, 0xF59CF5BB, 0xCFC7CF18,
    0x3F243F2D, 0xC046C0E3, 0x723B72DB, 0x5470546C, 0x29CA294C, 0xF0E3F035,
    0x088508FE, 0xC6CBC617, 0xF311F34F, 0x8CD08CE4, 0xA493A459, 0xCAB8CA96,
    0x68A6683B, 0xB883B84D, 0x38203828, 0xE5FFE52E, 0xAD9FAD56, 0x0B770B84,
    0xC8C3C81D, 0x99CC99FF, 0x580358ED, 0x196F199A, 0x0E080E0A, 0x95BF957E,
    0x70407050, 0xF7E7F730, 0x6E2B6ECF, 0x1FE21F6E, 0xB579B53D, 0x090C090F,
    0x61AA6134, 0x57825716, 0x9F419F0B, 0x9D3A9D80, 0x11EA1164, 0x25B925CD,
    0xAFE4AFDD, 0x459A4508, 0xDFA4DF8D, 0xA397A35C, 0xEA7EEAD5, 0x35DA3558,
    0xED7AEDD0, 0x431743FC, 0xF866F8CB, 0xFB94FBB1, 0x37A137D3, 0xFA1DFA40,
    0xC23DC268, 0xB4F0B4CC, 0x32DE325D, 0x9CB39C71, 0x560B56E7, 0xE372E3DA,
    0x87A78760, 0x151C151B, 0xF9EFF93A, 0x63D163BF, 0x345334A9, 0x9A3E9A85,
    0xB18FB142, 0x7C337CD1, 0x8826889B, 0x3D5F3DA6, 0xA1ECA1D7, 0xE476E4DF,
    0x812A8194, 0x91499101, 0x0F810FFB, 0xEE88EEAA, 0x16EE1661, 0xD721D773,
    0x97C497F5, 0xA51AA5A8, 0xFEEBFE3F, 0x6DD96DB5, 0x78C578AE, 0xC539C56D,
    0x1D991DE5, 0x76CD76A4, 0x3EAD3EDC, 0xCB31CB67, 0xB68BB647, 0xEF01EF5B,
    0x1218121E, 0x602360C5, 0x6ADD6AB0, 0x4D1F4DF6, 0xCE4ECEE9, 0xDE2DDE7C,
    0x55F9559D, 0x7E487E5A, 0x214F21B2, 0x03F2037A, 0xA065A026, 0x5E8E5E19,
    0x5A785A66, 0x655C654B, 0x6258624E, 0xFD19FD45, 0x068D06F4, 0x40E54086,
    0xF298F2BE, 0x335733AC, 0x17671790, 0x057F058E, 0xE805E85E, 0x4F644F7D,
    0x89AF896A, 0x10631095, 0x74B6742F, 0x0AFE0A75, 0x5CF55C92, 0x9BB79B74,
    0x2D3C2D33, 0x30A530D6, 0x2ECE2E49, 0x49E94989, 0x46684672, 0x77447755,
    0xA8E0A8D8, 0x964D9604, 0x284328BD, 0xA969A929, 0xD929D979, 0x862E8691,
    0xD1ACD187, 0xF415F44A, 0x8D598D15, 0xD6A8D682, 0xB90AB9BC, 0x429E420D,
    0xF66EF6C1, 0x2F472FB8, 0xDDDFDD06, 0x23342339, 0xCC35CC62, 0xF16AF1C4,
    0xC1CFC112, 0x85DC85EB, 0x8F228F9E, 0x71C971A1, 0x90C090F0, 0xAA9BAA53,
    0x018901F1, 0x8BD48BE1, 0x4EED4E8C, 0x8EAB8E6F, 0xAB12ABA2, 0x6FA26F3E,
    0xE60DE654, 0xDB52DBF2, 0x92BB927B, 0xB702B7B6, 0x692F69CA, 0x39A939D9,
    0xD3D7D30C, 0xA761A723, 0xA21EA2AD, 0xC3B4C399, 0x6C506C44, 0x07040705,
    0x04F6047F, 0x27C22746, 0xAC16ACA7, 0xD025D076, 0x50865013, 0xDC56DCF7,
    0x8455841A, 0xE109E151, 0x7ABE7A25, 0x139113EF},

   /* mds[3][i] = MDS (0 0 0 q0[i])^T */
   {0xD939A9D9, 0x90176790, 0x719CB371, 0xD2A6E8D2, 0x05070405, 0x9852FD98,
    0x6580A365, 0xDFE476DF, 0x08459A08, 0x024B9202, 0xA0E080A0, 0x665A7866,
    0xDDAFE4DD, 0xB06ADDB0, 0xBF63D1BF, 0x362A3836, 0x54E60D54, 0x4320C643,
    0x62CC3562, 0xBEF298BE, 0x1E12181E, 0x24EBF724, 0xD7A1ECD7, 0x77416C77,
    0xBD2843BD, 0x32BC7532, 0xD47B37D4, 0x9B88269B, 0x700DFA70, 0xF94413F9,
    0xB1FB94B1, 0x5A7E485A, 0x7A03F27A, 0xE48CD0E4, 0x47B68B47, 0x3C24303C,
    0xA5E784A5, 0x416B5441, 0x06DDDF06, 0xC56023C5, 0x45FD1945, 0xA33A5BA3,
    0x68C23D68, 0x158D5915, 0x21ECF321, 0x3166AE31, 0x3E6FA23E, 0x16578216,
    0x95106395, 0x5BEF015B, 0x4DB8834D, 0x91862E91, 0xB56DD9B5, 0x1F83511F,
    0x53AA9B53, 0x635D7C63, 0x3B68A63B, 0x3FFEEB3F, 0xD630A5D6, 0x257ABE25,
    0xA7AC16A7, 0x0F090C0F, 0x35F0E335, 0x23A76123, 0xF090C0F0, 0xAFE98CAF,
    0x809D3A80, 0x925CF592, 0x810C7381, 0x27312C27, 0x76D02576, 0xE7560BE7,
    0x7B92BB7B, 0xE9CE4EE9, 0xF10189F1, 0x9F1E6B9F, 0xA93453A9, 0xC4F16AC4,
    0x99C3B499, 0x975BF197, 0x8347E183, 0x6B18E66B, 0xC822BDC8, 0x0E98450E,
    0x6E1FE26E, 0xC9B3F4C9, 0x2F74B62F, 0xCBF866CB, 0xFF99CCFF, 0xEA1495EA,
    0xED5803ED, 0xF7DC56F7, 0xE18BD4E1, 0x1B151C1B, 0xADA21EAD, 0x0CD3D70C,
    0x2BE2FB2B, 0x1DC8C31D, 0x195E8E19, 0xC22CB5C2, 0x8949E989, 0x12C1CF12,
    0x7E95BF7E, 0x207DBA20, 0x6411EA64, 0x840B7784, 0x6DC5396D, 0x6A89AF6A,
    0xD17C33D1, 0xA171C9A1, 0xCEFF62CE, 0x37BB7137, 0xFB0F81FB, 0x3DB5793D,
    0x51E10951, 0xDC3EADDC, 0x2D3F242D, 0xA476CDA4, 0x9D55F99D, 0xEE82D8EE,
    0x8640E586, 0xAE78C5AE, 0xCD25B9CD, 0x04964D04, 0x55774455, 0x0A0E080A,
    0x13508613, 0x30F7E730, 0xD337A1D3, 0x40FA1D40, 0x3461AA34, 0x8C4EED8C,
    0xB3B006B3, 0x6C54706C, 0x2A73B22A, 0x523BD252, 0x0B9F410B, 0x8B027B8B,
    0x88D8A088, 0x4FF3114F, 0x67CB3167, 0x4627C246, 0xC06727C0, 0xB4FC90B4,
    0x28382028, 0x7F04F67F, 0x78486078, 0x2EE5FF2E, 0x074C9607, 0x4B655C4B,
    0xC72BB1C7, 0x6F8EAB6F, 0x0D429E0D, 0xBBF59CBB, 0xF2DB52F2, 0xF34A1BF3,
    0xA63D5FA6, 0x59A49359, 0xBCB90ABC, 0x3AF9EF3A, 0xEF1391EF, 0xFE0885FE,
    0x01914901, 0x6116EE61, 0x7CDE2D7C, 0xB2214FB2, 0x42B18F42, 0xDB723BDB,
    0xB82F47B8, 0x48BF8748, 0x2CAE6D2C, 0xE3C046E3, 0x573CD657, 0x859A3E85,
    0x29A96929, 0x7D4F647D, 0x94812A94, 0x492ECE49, 0x17C6CB17, 0xCA692FCA,
    0xC3BDFCC3, 0x5CA3975C, 0x5EE8055E, 0xD0ED7AD0, 0x87D1AC87, 0x8E057F8E,
    0xBA64D5BA, 0xA8A51AA8, 0xB7264BB7, 0xB9BE0EB9, 0x6087A760, 0xF8D55AF8,
    0x22362822, 0x111B1411, 0xDE753FDE, 0x79D92979, 0xAAEE88AA, 0x332D3C33,
    0x5F794C5F, 0xB6B702B6, 0x96CAB896, 0x5835DA58, 0x9CC4B09C, 0xFC4317FC,
    0x1A84551A, 0xF64D1FF6, 0x1C598A1C, 0x38B27D38, 0xAC3357AC, 0x18CFC718,
    0xF4068DF4, 0x69537469, 0x749BB774, 0xF597C4F5, 0x56AD9F56, 0xDAE372DA,
    0xD5EA7ED5, 0x4AF4154A, 0x9E8F229E, 0xA2AB12A2, 0x4E62584E, 0xE85F07E8,
    0xE51D99E5, 0x39233439, 0xC1F66EC1, 0x446C5044, 0x5D32DE5D, 0x72466872,
    0x26A06526, 0x93CDBC93, 0x03DADB03, 0xC6BAF8C6, 0xFA9EC8FA, 0x82D6A882,
    0xCF6E2BCF, 0x50704050, 0xEB85DCEB, 0x750AFE75, 0x8A93328A, 0x8DDFA48D,
    0x4C29CA4C, 0x141C1014, 0x73D72173, 0xCCB4F0CC, 0x09D4D309, 0x108A5D10,
    0xE2510FE2, 0x00000000, 0x9A196F9A, 0xE01A9DE0, 0x8F94368F, 0xE6C742E6,
    0xECC94AEC, 0xFDD25EFD, 0xAB7FC1AB, 0xD8A8E0D8}
};
374 
/* The exp_to_poly and poly_to_exp tables are used to perform efficient
 * operations in GF(2^8) represented as GF(2)[x]/w(x) where
 * w(x)=x^8+x^6+x^3+x^2+1.  We care about doing that because it's part of the
 * definition of the RS matrix in the key schedule.  Elements of that field
 * are polynomials of degree not greater than 7 and all coefficients 0 or 1,
 * which can be represented naturally by bytes (just substitute x=2).  In that
 * form, GF(2^8) addition is the same as bitwise XOR, but GF(2^8)
 * multiplication is inefficient without hardware support.  To multiply
 * faster, I make use of the fact x is a generator for the nonzero elements,
 * so that every element p of GF(2)[x]/w(x) is either 0 or equal to (x)^n for
 * some n in 0..254.  Note that that caret is exponentiation in GF(2^8),
 * *not* polynomial notation.  So if I want to compute pq where p and q are
 * in GF(2^8), I can just say:
 *    1. if p=0 or q=0 then pq=0
 *    2. otherwise, find m and n such that p=x^m and q=x^n
 *    3. pq=(x^m)(x^n)=x^(m+n), so add m and n and find pq
 * The translations in steps 2 and 3 are looked up in the tables
 * poly_to_exp (for step 2) and exp_to_poly (for step 3).  To see this
 * in action, look at the CALC_S macro.  As additional wrinkles, note that
 * one of my operands is always a constant, so the poly_to_exp lookup on it
 * is done in advance; I included the original values in the comments so
 * readers can have some chance of recognizing that this *is* the RS matrix
 * from the Twofish paper.  I've only included the table entries I actually
 * need; I never do a lookup on a variable input of zero and the biggest
 * exponents I'll ever see are 254 (variable) and 237 (constant), so they'll
 * never sum to more than 491.  I'm repeating part of the exp_to_poly table
 * so that I don't have to do mod-255 reduction in the exponent arithmetic.
 * Since I know my constant operands are never zero, I only have to worry
 * about zero values in the variable operand, and I do it with a simple
 * conditional branch.  I know conditionals are expensive, but I couldn't
 * see a non-horrible way of avoiding them, and I did manage to group the
 * statements so that each if covers four group multiplications.
 *
 * NOTE(review): the last part of the paragraph above no longer matches
 * the tables as they appear here.  poly_to_exp does have an entry for a
 * zero input, namely 492, and exp_to_poly is declared with 492 + 256
 * elements of which only the first 492 have initializers, so the rest are
 * zero.  An exponent of 492 plus any constant exponent up to 237 therefore
 * indexes that all-zero tail and yields 0.  Presumably this is what lets
 * CALC_S handle a zero operand without the conditional branch described
 * above -- confirm against the macro. */

static const u16 poly_to_exp[256] = {
   /* Entry for input 0: not a real exponent, see the note above. */
   492,
   /* Entries for inputs 1..255: n such that x^n equals the input. */
   0x00, 0x01, 0x17, 0x02, 0x2E, 0x18, 0x53, 0x03, 0x6A, 0x2F, 0x93, 0x19,
   0x34, 0x54, 0x45, 0x04, 0x5C, 0x6B, 0xB6, 0x30, 0xA6, 0x94, 0x4B, 0x1A,
   0x8C, 0x35, 0x81, 0x55, 0xAA, 0x46, 0x0D, 0x05, 0x24, 0x5D, 0x87, 0x6C,
   0x9B, 0xB7, 0xC1, 0x31, 0x2B, 0xA7, 0xA3, 0x95, 0x98, 0x4C, 0xCA, 0x1B,
   0xE6, 0x8D, 0x73, 0x36, 0xCD, 0x82, 0x12, 0x56, 0x62, 0xAB, 0xF0, 0x47,
   0x4F, 0x0E, 0xBD, 0x06, 0xD4, 0x25, 0xD2, 0x5E, 0x27, 0x88, 0x66, 0x6D,
   0xD6, 0x9C, 0x79, 0xB8, 0x08, 0xC2, 0xDF, 0x32, 0x68, 0x2C, 0xFD, 0xA8,
   0x8A, 0xA4, 0x5A, 0x96, 0x29, 0x99, 0x22, 0x4D, 0x60, 0xCB, 0xE4, 0x1C,
   0x7B, 0xE7, 0x3B, 0x8E, 0x9E, 0x74, 0xF4, 0x37, 0xD8, 0xCE, 0xF9, 0x83,
   0x6F, 0x13, 0xB2, 0x57, 0xE1, 0x63, 0xDC, 0xAC, 0xC4, 0xF1, 0xAF, 0x48,
   0x0A, 0x50, 0x42, 0x0F, 0xBA, 0xBE, 0xC7, 0x07, 0xDE, 0xD5, 0x78, 0x26,
   0x65, 0xD3, 0xD1, 0x5F, 0xE3, 0x28, 0x21, 0x89, 0x59, 0x67, 0xFC, 0x6E,
   0xB1, 0xD7, 0xF8, 0x9D, 0xF3, 0x7A, 0x3A, 0xB9, 0xC6, 0x09, 0x41, 0xC3,
   0xAE, 0xE0, 0xDB, 0x33, 0x44, 0x69, 0x92, 0x2D, 0x52, 0xFE, 0x16, 0xA9,
   0x0C, 0x8B, 0x80, 0xA5, 0x4A, 0x5B, 0xB5, 0x97, 0xC9, 0x2A, 0xA2, 0x9A,
   0xC0, 0x23, 0x86, 0x4E, 0xBC, 0x61, 0xEF, 0xCC, 0x11, 0xE5, 0x72, 0x1D,
   0x3D, 0x7C, 0xEB, 0xE8, 0xE9, 0x3C, 0xEA, 0x8F, 0x7D, 0x9F, 0xEC, 0x75,
   0x1E, 0xF5, 0x3E, 0x38, 0xF6, 0xD9, 0x3F, 0xCF, 0x76, 0xFA, 0x1F, 0x84,
   0xA0, 0x70, 0xED, 0x14, 0x90, 0xB3, 0x7E, 0x58, 0xFB, 0xE2, 0x20, 0x64,
   0xD0, 0xDD, 0x77, 0xAD, 0xDA, 0xC5, 0x40, 0xF2, 0x39, 0xB0, 0xF7, 0x49,
   0xB4, 0x0B, 0x7F, 0x51, 0x15, 0x43, 0x91, 0x10, 0x71, 0xBB, 0xEE, 0xBF,
   0x85, 0xC8, 0xA1
};
433 
/* exp_to_poly[n] is the byte representation of x^n in GF(2^8).  The
 * 255-entry cycle is written out for n = 0..254 and then repeated for
 * n = 255..491, so a sum of two exponents can be used as an index without
 * reducing it mod 255.  The array is declared with 492 + 256 elements but
 * only the first 492 have initializers; the remaining 256 are therefore
 * zero.  That zero tail is where indices based on poly_to_exp[0] (492)
 * land. */
static const byte exp_to_poly[492 + 256] = {
   0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2,
   0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03,
   0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6,
   0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52, 0xA4, 0x05, 0x0A,
   0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0, 0xED, 0x97, 0x63,
   0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1, 0x0F, 0x1E, 0x3C,
   0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A, 0xF4, 0xA5, 0x07,
   0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11, 0x22, 0x44, 0x88,
   0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51, 0xA2, 0x09, 0x12,
   0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66, 0xCC, 0xD5, 0xE7,
   0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB, 0x1B, 0x36, 0x6C,
   0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19, 0x32, 0x64, 0xC8,
   0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D, 0x5A, 0xB4, 0x25,
   0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56, 0xAC, 0x15, 0x2A,
   0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE, 0x91, 0x6F, 0xDE,
   0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9, 0x3F, 0x7E, 0xFC,
   0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE, 0xB1, 0x2F, 0x5E,
   0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41, 0x82, 0x49, 0x92,
   0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E, 0x71, 0xE2, 0x89,
   0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB, 0xDB, 0xFB, 0xBB,
   0x3B, 0x76, 0xEC, 0x95, 0x67, 0xCE, 0xD1, 0xEF, 0x93, 0x6B, 0xD6, 0xE1,
   0x8F, 0x53, 0xA6, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D,
   0x9A, 0x79, 0xF2, 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC,
   0xF5, 0xA7, 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3,
   0x8B, 0x5B, 0xB6, 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52,
   0xA4, 0x05, 0x0A, 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0,
   0xED, 0x97, 0x63, 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1,
   0x0F, 0x1E, 0x3C, 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A,
   0xF4, 0xA5, 0x07, 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11,
   0x22, 0x44, 0x88, 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51,
   0xA2, 0x09, 0x12, 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66,
   0xCC, 0xD5, 0xE7, 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB,
   0x1B, 0x36, 0x6C, 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19,
   0x32, 0x64, 0xC8, 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D,
   0x5A, 0xB4, 0x25, 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56,
   0xAC, 0x15, 0x2A, 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE,
   0x91, 0x6F, 0xDE, 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9,
   0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE,
   0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41,
   0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E,
   0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB,
};
477 
478 
479 /* The table constants are indices of
480  * S-box entries, preprocessed through q0 and q1. */
481 static byte calc_sb_tbl[512] = {
482     0xA9, 0x75, 0x67, 0xF3, 0xB3, 0xC6, 0xE8, 0xF4,
483     0x04, 0xDB, 0xFD, 0x7B, 0xA3, 0xFB, 0x76, 0xC8,
484     0x9A, 0x4A, 0x92, 0xD3, 0x80, 0xE6, 0x78, 0x6B,
485     0xE4, 0x45, 0xDD, 0x7D, 0xD1, 0xE8, 0x38, 0x4B,
486     0x0D, 0xD6, 0xC6, 0x32, 0x35, 0xD8, 0x98, 0xFD,
487     0x18, 0x37, 0xF7, 0x71, 0xEC, 0xF1, 0x6C, 0xE1,
488     0x43, 0x30, 0x75, 0x0F, 0x37, 0xF8, 0x26, 0x1B,
489     0xFA, 0x87, 0x13, 0xFA, 0x94, 0x06, 0x48, 0x3F,
490     0xF2, 0x5E, 0xD0, 0xBA, 0x8B, 0xAE, 0x30, 0x5B,
491     0x84, 0x8A, 0x54, 0x00, 0xDF, 0xBC, 0x23, 0x9D,
492     0x19, 0x6D, 0x5B, 0xC1, 0x3D, 0xB1, 0x59, 0x0E,
493     0xF3, 0x80, 0xAE, 0x5D, 0xA2, 0xD2, 0x82, 0xD5,
494     0x63, 0xA0, 0x01, 0x84, 0x83, 0x07, 0x2E, 0x14,
495     0xD9, 0xB5, 0x51, 0x90, 0x9B, 0x2C, 0x7C, 0xA3,
496     0xA6, 0xB2, 0xEB, 0x73, 0xA5, 0x4C, 0xBE, 0x54,
497     0x16, 0x92, 0x0C, 0x74, 0xE3, 0x36, 0x61, 0x51,
498     0xC0, 0x38, 0x8C, 0xB0, 0x3A, 0xBD, 0xF5, 0x5A,
499     0x73, 0xFC, 0x2C, 0x60, 0x25, 0x62, 0x0B, 0x96,
500     0xBB, 0x6C, 0x4E, 0x42, 0x89, 0xF7, 0x6B, 0x10,
501     0x53, 0x7C, 0x6A, 0x28, 0xB4, 0x27, 0xF1, 0x8C,
502     0xE1, 0x13, 0xE6, 0x95, 0xBD, 0x9C, 0x45, 0xC7,
503     0xE2, 0x24, 0xF4, 0x46, 0xB6, 0x3B, 0x66, 0x70,
504     0xCC, 0xCA, 0x95, 0xE3, 0x03, 0x85, 0x56, 0xCB,
505     0xD4, 0x11, 0x1C, 0xD0, 0x1E, 0x93, 0xD7, 0xB8,
506     0xFB, 0xA6, 0xC3, 0x83, 0x8E, 0x20, 0xB5, 0xFF,
507     0xE9, 0x9F, 0xCF, 0x77, 0xBF, 0xC3, 0xBA, 0xCC,
508     0xEA, 0x03, 0x77, 0x6F, 0x39, 0x08, 0xAF, 0xBF,
509     0x33, 0x40, 0xC9, 0xE7, 0x62, 0x2B, 0x71, 0xE2,
510     0x81, 0x79, 0x79, 0x0C, 0x09, 0xAA, 0xAD, 0x82,
511     0x24, 0x41, 0xCD, 0x3A, 0xF9, 0xEA, 0xD8, 0xB9,
512     0xE5, 0xE4, 0xC5, 0x9A, 0xB9, 0xA4, 0x4D, 0x97,
513     0x44, 0x7E, 0x08, 0xDA, 0x86, 0x7A, 0xE7, 0x17,
514     0xA1, 0x66, 0x1D, 0x94, 0xAA, 0xA1, 0xED, 0x1D,
515     0x06, 0x3D, 0x70, 0xF0, 0xB2, 0xDE, 0xD2, 0xB3,
516     0x41, 0x0B, 0x7B, 0x72, 0xA0, 0xA7, 0x11, 0x1C,
517     0x31, 0xEF, 0xC2, 0xD1, 0x27, 0x53, 0x90, 0x3E,
518     0x20, 0x8F, 0xF6, 0x33, 0x60, 0x26, 0xFF, 0x5F,
519     0x96, 0xEC, 0x5C, 0x76, 0xB1, 0x2A, 0xAB, 0x49,
520     0x9E, 0x81, 0x9C, 0x88, 0x52, 0xEE, 0x1B, 0x21,
521     0x5F, 0xC4, 0x93, 0x1A, 0x0A, 0xEB, 0xEF, 0xD9,
522     0x91, 0xC5, 0x85, 0x39, 0x49, 0x99, 0xEE, 0xCD,
523     0x2D, 0xAD, 0x4F, 0x31, 0x8F, 0x8B, 0x3B, 0x01,
524     0x47, 0x18, 0x87, 0x23, 0x6D, 0xDD, 0x46, 0x1F,
525     0xD6, 0x4E, 0x3E, 0x2D, 0x69, 0xF9, 0x64, 0x48,
526     0x2A, 0x4F, 0xCE, 0xF2, 0xCB, 0x65, 0x2F, 0x8E,
527     0xFC, 0x78, 0x97, 0x5C, 0x05, 0x58, 0x7A, 0x19,
528     0xAC, 0x8D, 0x7F, 0xE5, 0xD5, 0x98, 0x1A, 0x57,
529     0x4B, 0x67, 0x0E, 0x7F, 0xA7, 0x05, 0x5A, 0x64,
530     0x28, 0xAF, 0x14, 0x63, 0x3F, 0xB6, 0x29, 0xFE,
531     0x88, 0xF5, 0x3C, 0xB7, 0x4C, 0x3C, 0x02, 0xA5,
532     0xB8, 0xCE, 0xDA, 0xE9, 0xB0, 0x68, 0x17, 0x44,
533     0x55, 0xE0, 0x1F, 0x4D, 0x8A, 0x43, 0x7D, 0x69,
534     0x57, 0x29, 0xC7, 0x2E, 0x8D, 0xAC, 0x74, 0x15,
535     0xB7, 0x59, 0xC4, 0xA8, 0x9F, 0x0A, 0x72, 0x9E,
536     0x7E, 0x6E, 0x15, 0x47, 0x22, 0xDF, 0x12, 0x34,
537     0x58, 0x35, 0x07, 0x6A, 0x99, 0xCF, 0x34, 0xDC,
538     0x6E, 0x22, 0x50, 0xC9, 0xDE, 0xC0, 0x68, 0x9B,
539     0x65, 0x89, 0xBC, 0xD4, 0xDB, 0xED, 0xF8, 0xAB,
540     0xC8, 0x12, 0xA8, 0xA2, 0x2B, 0x0D, 0x40, 0x52,
541     0xDC, 0xBB, 0xFE, 0x02, 0x32, 0x2F, 0xA4, 0xA9,
542     0xCA, 0xD7, 0x10, 0x61, 0x21, 0x1E, 0xF0, 0xB4,
543     0xD3, 0x50, 0x5D, 0x04, 0x0F, 0xF6, 0x00, 0xC2,
544     0x6F, 0x16, 0x9D, 0x25, 0x36, 0x86, 0x42, 0x56,
545     0x4A, 0x55, 0x5E, 0x09, 0xC1, 0xBE, 0xE0, 0x91
546 };
547 
/* Macro to perform one column of the RS matrix multiplication.  The
 * parameters a, b, c, and d are the four bytes of output; i is the index
 * of the key byte, and w, x, y, and z are the column of constants from
 * the RS matrix, preprocessed through the poly_to_exp table.
 *
 * The macro uses names from the caller's scope: the key byte array `key',
 * the scratch variable `tmp', and the poly_to_exp / exp_to_poly lookup
 * tables.  It is wrapped in do { } while (0) so that it expands to exactly
 * one statement and may be followed by a semicolon anywhere a statement
 * is allowed, including an unbraced if/else. */

#define CALC_S(a, b, c, d, i, w, x, y, z) \
   do { \
      tmp = poly_to_exp[key[(i)]]; \
      (a) ^= exp_to_poly[tmp + (w)]; \
      (b) ^= exp_to_poly[tmp + (x)]; \
      (c) ^= exp_to_poly[tmp + (y)]; \
      (d) ^= exp_to_poly[tmp + (z)]; \
   } while (0)
561 
/* Macros to calculate the key-dependent S-boxes for a 128-bit key using
 * the S vector from CALC_S.  CALC_SB_2 computes a single entry in all
 * four S-boxes, where i is the index of the entry to compute, and a and b
 * are the index numbers preprocessed through the q0 and q1 tables
 * respectively.  CALC_SB is simply a convenience to make the code shorter;
 * it calls CALC_SB_2 four times with consecutive indices from i to i+3,
 * using the remaining parameters two by two.
 *
 * CALC_SB_2 reads ctx, mds, q0, q1 and the S-vector bytes sa..sh from the
 * caller's scope.  It is wrapped in do { } while (0) so that its four
 * assignments always act as a single statement. */

#define CALC_SB_2(i, a, b) \
   do { \
      ctx->s[0][(i)] = mds[0][q0[(a) ^ sa] ^ se]; \
      ctx->s[1][(i)] = mds[1][q0[(b) ^ sb] ^ sf]; \
      ctx->s[2][(i)] = mds[2][q1[(a) ^ sc] ^ sg]; \
      ctx->s[3][(i)] = mds[3][q1[(b) ^ sd] ^ sh]; \
   } while (0)
575 
/* Compute four consecutive S-box entries i .. i+3 by invoking CALC_SB_2
 * with the argument pairs (a, b), (c, d), (e, f) and (g, h).  Wrapped in
 * do { } while (0) so the four invocations form a single statement. */
#define CALC_SB(i, a, b, c, d, e, f, g, h) \
   do { \
      CALC_SB_2 (i, a, b); CALC_SB_2 ((i)+1, c, d); \
      CALC_SB_2 ((i)+2, e, f); CALC_SB_2 ((i)+3, g, h); \
   } while (0)
579 
/* Counterparts of CALC_SB and CALC_SB_2 for 256-bit keys.  Each S-box
 * entry passes through two additional q0/q1 lookup stages, which are
 * keyed with the S-vector bytes si..sl and sm..sp in addition to the
 * sa..sh used for 128-bit keys.
 *
 * CALC_SB256_2 reads ctx, mds, q0, q1 and sa..sp from the caller's scope.
 * Like CALC_SB_2 it does not supply the terminating semicolon itself; it
 * is wrapped in do { } while (0) so that it is a single statement. */

#define CALC_SB256_2(i, a, b) \
   do { \
      ctx->s[0][(i)] = mds[0][q0[q0[q1[(b) ^ sa] ^ se] ^ si] ^ sm]; \
      ctx->s[1][(i)] = mds[1][q0[q1[q1[(a) ^ sb] ^ sf] ^ sj] ^ sn]; \
      ctx->s[2][(i)] = mds[2][q1[q0[q0[(a) ^ sc] ^ sg] ^ sk] ^ so]; \
      ctx->s[3][(i)] = mds[3][q1[q1[q0[(b) ^ sd] ^ sh] ^ sl] ^ sp]; \
   } while (0)
587 
/* Compute four consecutive 256-bit-key S-box entries i .. i+3 by invoking
 * CALC_SB256_2 with the argument pairs (a, b), (c, d), (e, f) and (g, h).
 * Wrapped in do { } while (0) so the invocations form a single statement. */
#define CALC_SB256(i, a, b, c, d, e, f, g, h) \
   do { \
      CALC_SB256_2 (i, a, b); CALC_SB256_2 ((i)+1, c, d); \
      CALC_SB256_2 ((i)+2, e, f); CALC_SB256_2 ((i)+3, g, h); \
   } while (0)
591 
/* Macros to calculate the whitening and round subkeys.  CALC_K_2 computes the
 * last two stages of the h() function for a given index (either 2i or 2i+1).
 * a, b, c, and d are the four bytes going into the last two stages.  For
 * 128-bit keys, this is the entire h() function and a and c are the index
 * preprocessed through q0 and q1 respectively; for longer keys they are the
 * output of previous stages.  j is the index of the first key byte to use.
 * CALC_K computes a pair of subkeys for 128-bit Twofish, by calling CALC_K_2
 * twice, doing the Pseudo-Hadamard Transform, and doing the necessary
 * rotations.  Its parameters are: a, the array to write the results into,
 * j, the index of the first output entry, k and l, the preprocessed indices
 * for index 2i, and m and n, the preprocessed indices for index 2i+1.
 * CALC_K256_2 expands CALC_K_2 to handle 256-bit keys, by doing two
 * additional lookup-and-XOR stages.  The parameters a and b are the index
 * preprocessed through q0 and q1 respectively; j is the index of the first
 * key byte to use.  CALC_K256 is identical to CALC_K but for using the
 * CALC_K256_2 macro instead of CALC_K_2.
 *
 * CALC_K_2 is an expression macro that reads key, mds, q0 and q1 from the
 * caller's scope.  Every argument and the whole replacement list are
 * parenthesized so that operator precedence at the call site cannot change
 * its meaning. */

#define CALC_K_2(a, b, c, d, j) \
   (  mds[0][q0[(a) ^ key[(j) + 8]] ^ key[(j)]] \
    ^ mds[1][q0[(b) ^ key[(j) + 9]] ^ key[(j) + 1]] \
    ^ mds[2][q1[(c) ^ key[(j) + 10]] ^ key[(j) + 2]] \
    ^ mds[3][q1[(d) ^ key[(j) + 11]] ^ key[(j) + 3]])
614 
/* Compute one pair of subkeys for a 128-bit key and store them in
 * ctx->a[j] and ctx->a[j + 1], where a names the destination member of
 * the context.  k, l are the preprocessed indices for the even word and
 * m, n those for the odd word.  The odd word is rotated left by 8 bits,
 * the two words are combined (x += y; y += x), and the second result is
 * rotated left by 9 bits before being stored.
 *
 * Uses the u32 temporaries x and y and the pointer ctx from the caller's
 * scope.  Wrapped in do { } while (0) so that it is a single statement. */
#define CALC_K(a, j, k, l, m, n) \
   do { \
      x = CALC_K_2 (k, l, k, l, 0); \
      y = CALC_K_2 (m, n, m, n, 4); \
      y = (y << 8) + (y >> 24); \
      x += y; y += x; ctx->a[(j)] = x; \
      ctx->a[(j) + 1] = (y << 9) + (y >> 23); \
   } while (0)
621 
/* 256-bit-key variant of CALC_K_2: runs the index bytes a and b through
 * two additional lookup-and-XOR stages, keyed with key bytes j+16..j+19
 * and j+24..j+27, and feeds the results to CALC_K_2.  Expression macro;
 * reads key, q0 and q1 (and, through CALC_K_2, mds) from the caller's
 * scope.  The arguments and the whole expression are parenthesized so
 * that call-site precedence cannot change its meaning. */
#define CALC_K256_2(a, b, j) \
   (CALC_K_2 (q0[q1[(b) ^ key[(j) + 24]] ^ key[(j) + 16]], \
	      q1[q1[(a) ^ key[(j) + 25]] ^ key[(j) + 17]], \
	      q0[q0[(a) ^ key[(j) + 26]] ^ key[(j) + 18]], \
	      q1[q0[(b) ^ key[(j) + 27]] ^ key[(j) + 19]], (j)))
627 
/* Compute one pair of subkeys for a 256-bit key and store them in
 * ctx->a[j] and ctx->a[j + 1].  Same combination and rotation steps as
 * CALC_K, but the two input words come from CALC_K256_2.
 *
 * Uses the u32 temporaries x and y and the pointer ctx from the caller's
 * scope.  Wrapped in do { } while (0) so that it is a single statement. */
#define CALC_K256(a, j, k, l, m, n) \
   do { \
      x = CALC_K256_2 (k, l, 0); \
      y = CALC_K256_2 (m, n, 4); \
      y = (y << 8) + (y >> 24); \
      x += y; y += x; ctx->a[(j)] = x; \
      ctx->a[(j) + 1] = (y << 9) + (y >> 23); \
   } while (0)
634 
635 
636 
637 /* Perform the key setup.  Note that this works only with 128- and 256-bit
638  * keys, despite the API that looks like it might support other sizes. */
639 
640 static gcry_err_code_t
do_twofish_setkey(TWOFISH_context * ctx,const byte * key,const unsigned keylen)641 do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
642 {
643   int i, j, k;
644 
645   /* Temporaries for CALC_K. */
646   u32 x, y;
647 
648   /* The S vector used to key the S-boxes, split up into individual bytes.
649    * 128-bit keys use only sa through sh; 256-bit use all of them. */
650   byte sa = 0, sb = 0, sc = 0, sd = 0, se = 0, sf = 0, sg = 0, sh = 0;
651   byte si = 0, sj = 0, sk = 0, sl = 0, sm = 0, sn = 0, so = 0, sp = 0;
652 
653   /* Temporary for CALC_S. */
654   unsigned int tmp;
655 
656   /* Flags for self-test. */
657   static int initialized = 0;
658   static const char *selftest_failed=0;
659 
660   /* Check key length. */
661   if( ( ( keylen - 16 ) | 16 ) != 16 )
662     return GPG_ERR_INV_KEYLEN;
663 
664   /* Do self-test if necessary. */
665   if (!initialized)
666     {
667       initialized = 1;
668       selftest_failed = selftest ();
669       if( selftest_failed )
670         log_error("%s\n", selftest_failed );
671     }
672   if( selftest_failed )
673     return GPG_ERR_SELFTEST_FAILED;
674 
675   /* Compute the first two words of the S vector.  The magic numbers are
676    * the entries of the RS matrix, preprocessed through poly_to_exp.	The
677    * numbers in the comments are the original (polynomial form) matrix
678    * entries. */
679   CALC_S (sa, sb, sc, sd, 0, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
680   CALC_S (sa, sb, sc, sd, 1, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
681   CALC_S (sa, sb, sc, sd, 2, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
682   CALC_S (sa, sb, sc, sd, 3, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
683   CALC_S (sa, sb, sc, sd, 4, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
684   CALC_S (sa, sb, sc, sd, 5, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
685   CALC_S (sa, sb, sc, sd, 6, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
686   CALC_S (sa, sb, sc, sd, 7, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
687   CALC_S (se, sf, sg, sh, 8, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
688   CALC_S (se, sf, sg, sh, 9, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
689   CALC_S (se, sf, sg, sh, 10, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
690   CALC_S (se, sf, sg, sh, 11, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
691   CALC_S (se, sf, sg, sh, 12, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
692   CALC_S (se, sf, sg, sh, 13, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
693   CALC_S (se, sf, sg, sh, 14, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
694   CALC_S (se, sf, sg, sh, 15, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
695 
696   if (keylen == 32)  /* 256-bit key */
697     {
698       /* Calculate the remaining two words of the S vector */
699       CALC_S (si, sj, sk, sl, 16, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
700       CALC_S (si, sj, sk, sl, 17, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
701       CALC_S (si, sj, sk, sl, 18, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
702       CALC_S (si, sj, sk, sl, 19, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
703       CALC_S (si, sj, sk, sl, 20, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
704       CALC_S (si, sj, sk, sl, 21, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
705       CALC_S (si, sj, sk, sl, 22, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
706       CALC_S (si, sj, sk, sl, 23, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
707       CALC_S (sm, sn, so, sp, 24, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
708       CALC_S (sm, sn, so, sp, 25, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
709       CALC_S (sm, sn, so, sp, 26, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
710       CALC_S (sm, sn, so, sp, 27, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
711       CALC_S (sm, sn, so, sp, 28, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
712       CALC_S (sm, sn, so, sp, 29, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
713       CALC_S (sm, sn, so, sp, 30, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
714       CALC_S (sm, sn, so, sp, 31, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
715 
716       /* Compute the S-boxes. */
717       for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
718         {
719           CALC_SB256_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
720 	}
721 
722       /* Calculate whitening and round subkeys. */
723       for (i = 0; i < 8; i += 2)
724 	{
725 	  CALC_K256 ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
726 	}
727       for (j = 0; j < 32; j += 2, i += 2)
728 	{
729 	  CALC_K256 ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
730 	}
731     }
732   else
733     {
734       /* Compute the S-boxes. */
735       for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
736         {
737           CALC_SB_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
738         }
739 
740       /* Calculate whitening and round subkeys. */
741       for (i = 0; i < 8; i += 2)
742 	{
743 	  CALC_K ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
744 	}
745       for (j = 0; j < 32; j += 2, i += 2)
746 	{
747 	  CALC_K ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
748 	}
749     }
750 
751   return 0;
752 }
753 
754 static gcry_err_code_t
twofish_setkey(void * context,const byte * key,unsigned int keylen,cipher_bulk_ops_t * bulk_ops)755 twofish_setkey (void *context, const byte *key, unsigned int keylen,
756                 cipher_bulk_ops_t *bulk_ops)
757 {
758   TWOFISH_context *ctx = context;
759   unsigned int hwfeatures = _gcry_get_hw_features ();
760   int rc;
761 
762   rc = do_twofish_setkey (ctx, key, keylen);
763 
764 #ifdef USE_AVX2
765   ctx->use_avx2 = 0;
766   if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
767     {
768       ctx->use_avx2 = 1;
769     }
770 #endif
771 
772   /* Setup bulk encryption routines.  */
773   memset (bulk_ops, 0, sizeof(*bulk_ops));
774   bulk_ops->cbc_dec = _gcry_twofish_cbc_dec;
775   bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
776   bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
777   bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
778   bulk_ops->ocb_auth  = _gcry_twofish_ocb_auth;
779 
780   (void)hwfeatures;
781 
782   _gcry_burn_stack (23+6*sizeof(void*));
783   return rc;
784 }
785 
786 
787 #ifdef USE_AVX2
/* Assembler implementations of Twofish using AVX2.  Process 16 blocks in
   parallel.  The functions are defined outside this file; only their
   prototypes appear here.
 */
/* CTR mode: takes the counter block CTR. */
extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
				       unsigned char *out,
				       const unsigned char *in,
				       unsigned char *ctr) ASM_FUNC_ABI;

/* CBC decryption: takes the chaining value IV. */
extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
				       unsigned char *out,
				       const unsigned char *in,
				       unsigned char *iv) ASM_FUNC_ABI;

/* CFB decryption: takes the chaining value IV. */
extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
				       unsigned char *out,
				       const unsigned char *in,
				       unsigned char *iv) ASM_FUNC_ABI;

/* OCB encryption: takes OFFSET, CHECKSUM and 16 L values passed as u64. */
extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
				       unsigned char *out,
				       const unsigned char *in,
				       unsigned char *offset,
				       unsigned char *checksum,
				       const u64 Ls[16]) ASM_FUNC_ABI;

/* OCB decryption: same parameters as the encryption routine. */
extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
				       unsigned char *out,
				       const unsigned char *in,
				       unsigned char *offset,
				       unsigned char *checksum,
				       const u64 Ls[16]) ASM_FUNC_ABI;

/* OCB authentication: reads ABUF only; there is no output buffer. */
extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
					const unsigned char *abuf,
					unsigned char *offset,
					unsigned char *checksum,
					const u64 Ls[16]) ASM_FUNC_ABI;
825 #endif
826 
827 
828 #ifdef USE_AMD64_ASM
829 
/* Assembly implementations of Twofish.  The functions are defined outside
   this file; only their prototypes appear here. */
/* Single-block encryption. */
extern void _gcry_twofish_amd64_encrypt_block(const TWOFISH_context *c,
					      byte *out, const byte *in);

/* Single-block decryption. */
extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
					      byte *out, const byte *in);

/* These assembly implementations process three blocks in parallel. */
extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
					const byte *in, byte *ctr);

extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
					const byte *in, byte *iv);

extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
					const byte *in, byte *iv);

/* OCB variants: take OFFSET, CHECKSUM and three L values passed as u64. */
extern void _gcry_twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out,
					const byte *in, byte *offset,
					byte *checksum, const u64 Ls[3]);

extern void _gcry_twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out,
					const byte *in, byte *offset,
					byte *checksum, const u64 Ls[3]);

/* OCB authentication: reads ABUF only; there is no output buffer. */
extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx,
					 const byte *abuf, byte *offset,
					 byte *checksum, const u64 Ls[3]);
858 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_encrypt_block. */
static inline void
twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
{
  _gcry_twofish_amd64_encrypt_block(c, out, in);
}
864 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_decrypt_block. */
static inline void
twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
{
  _gcry_twofish_amd64_decrypt_block(c, out, in);
}
870 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_ctr_enc. */
static inline void
twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in,
                      byte *ctr)
{
  _gcry_twofish_amd64_ctr_enc(c, out, in, ctr);
}
877 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_cbc_dec. */
static inline void
twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in,
                      byte *iv)
{
  _gcry_twofish_amd64_cbc_dec(c, out, in, iv);
}
884 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_cfb_dec. */
static inline void
twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in,
                      byte *iv)
{
  _gcry_twofish_amd64_cfb_dec(c, out, in, iv);
}
891 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_ocb_enc. */
static inline void
twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in,
		      byte *offset, byte *checksum, const u64 Ls[3])
{
  _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls);
}
898 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_ocb_dec. */
static inline void
twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in,
		      byte *offset, byte *checksum, const u64 Ls[3])
{
  _gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls);
}
905 
/* Inline wrapper: forwards its arguments unchanged to the assembly
   routine _gcry_twofish_amd64_ocb_auth. */
static inline void
twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf,
		       byte *offset, byte *checksum, const u64 Ls[3])
{
  _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls);
}
912 
913 #elif defined(USE_ARM_ASM)
914 
/* Assembly implementations of Twofish.  The functions are defined outside
   this file; only their prototypes appear here. */
/* Single-block encryption. */
extern void _gcry_twofish_arm_encrypt_block(const TWOFISH_context *c,
					      byte *out, const byte *in);

/* Single-block decryption. */
extern void _gcry_twofish_arm_decrypt_block(const TWOFISH_context *c,
					      byte *out, const byte *in);
921 
922 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
923 
/* Macros to compute the g() function in the encryption and decryption
 * rounds.  G1 is the straight g() function; G2 includes the 8-bit
 * rotation for the high 32-bit word.
 *
 * G1 XORs together one entry from each of the four key-dependent S-boxes
 * in ctx->s, selected by the four bytes of a (least significant byte
 * first).  The top byte is used without masking, so a is expected to be
 * an unsigned 32-bit value.  The replacement list is parenthesized so the
 * macro can be used inside larger expressions. */

#define G1(a) \
   (  (ctx->s[0][(a) & 0xFF]) ^ (ctx->s[1][((a) >> 8) & 0xFF]) \
    ^ (ctx->s[2][((a) >> 16) & 0xFF]) ^ (ctx->s[3][(a) >> 24]))
931 
/* g() with the input rotated by one byte: the bytes of b (least
 * significant first) index S-boxes 1, 2, 3 and 0 of ctx->s and the four
 * entries are XORed together.  The top byte is used without masking, so b
 * is expected to be an unsigned 32-bit value.  The replacement list is
 * parenthesized so the macro can be used inside larger expressions. */
#define G2(b) \
   (  (ctx->s[1][(b) & 0xFF]) ^ (ctx->s[2][((b) >> 8) & 0xFF]) \
    ^ (ctx->s[3][((b) >> 16) & 0xFF]) ^ (ctx->s[0][(b) >> 24]))
935 
/* Encryption and decryption Feistel rounds.  Each one calls the two g()
 * macros, does the PHT, and performs the XOR and the appropriate bit
 * rotations.  The parameters are the round number (used to select subkeys),
 * and the four 32-bit chunks of the text.
 *
 * ENCROUND uses the u32 temporaries x and y and the pointer ctx from the
 * caller's scope; it updates c and d and leaves a and b unchanged.  It is
 * wrapped in do { } while (0) so that it is a single statement. */

#define ENCROUND(n, a, b, c, d) \
   do { \
      x = G1 (a); y = G2 (b); \
      x += y; y += x + ctx->k[2 * (n) + 1]; \
      (c) ^= x + ctx->k[2 * (n)]; \
      (c) = ((c) >> 1) + ((c) << 31); \
      (d) = (((d) << 1)+((d) >> 31)) ^ y; \
   } while (0)
947 
/* Decryption Feistel round n on the 32-bit chunks a, b, c, d.  Computes
 * the two g() values and combines them (x += y; y += x), then XORs the
 * subkey-adjusted y into d and rotates d right by one bit, and rotates c
 * left by one bit before XORing the subkey-adjusted x into it.
 *
 * Uses the u32 temporaries x and y and the pointer ctx from the caller's
 * scope; a and b are left unchanged.  Wrapped in do { } while (0) so that
 * it is a single statement. */
#define DECROUND(n, a, b, c, d) \
   do { \
      x = G1 (a); y = G2 (b); \
      x += y; y += x; \
      (d) ^= y + ctx->k[2 * (n) + 1]; \
      (d) = ((d) >> 1) + ((d) << 31); \
      (c) = (((c) << 1)+((c) >> 31)); \
      (c) ^= (x + ctx->k[2 * (n)]); \
   } while (0)
955 
/* Encryption and decryption cycles; each one is simply two Feistel rounds
 * with the 32-bit chunks re-ordered to simulate the "swap".
 *
 * ENCCYCLE (n) runs rounds 2n and 2n+1 on the caller's variables a, b, c
 * and d.  Wrapped in do { } while (0) so that it is a single statement. */

#define ENCCYCLE(n) \
   do { \
      ENCROUND (2 * (n), a, b, c, d); \
      ENCROUND (2 * (n) + 1, c, d, a, b); \
   } while (0)
962 
/* Decryption cycle n: runs round 2n+1 (with the chunk pairs swapped) and
 * then round 2n on the caller's variables a, b, c and d.  Wrapped in
 * do { } while (0) so that it is a single statement. */
#define DECCYCLE(n) \
   do { \
      DECROUND (2 * (n) + 1, c, d, a, b); \
      DECROUND (2 * (n), a, b, c, d); \
   } while (0)
966 
/* Macros to convert the input and output bytes into 32-bit words,
 * and simultaneously perform the whitening step.  INPACK packs word
 * number n into the variable named by x, using whitening subkey number m.
 * OUTUNPACK unpacks word number n from the variable named by x, using
 * whitening subkey number m.
 *
 * INPACK reads the word with buf_get_le32 from the caller's `in' pointer
 * and XORs it with ctx->w[m].  Wrapped in do { } while (0) so that it is
 * a single statement. */

#define INPACK(n, x, m) \
   do { \
      (x) = buf_get_le32(in + (n) * 4); \
      (x) ^= ctx->w[(m)]; \
   } while (0)
976 
/* XOR the variable named by x with whitening subkey ctx->w[m] and store
 * the result as output word number n through buf_put_le32 at the caller's
 * `out' pointer.  Note that x itself is modified.  Wrapped in
 * do { } while (0) so that it is a single statement. */
#define OUTUNPACK(n, x, m) \
   do { \
      (x) ^= ctx->w[(m)]; \
      buf_put_le32(out + (n) * 4, (x)); \
   } while (0)
980 
981 #endif /*!USE_AMD64_ASM*/
982 
983 
984 /* Encrypt one block.  in and out may be the same. */
985 
986 #ifdef USE_AMD64_ASM
987 
988 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)989 twofish_encrypt (void *context, byte *out, const byte *in)
990 {
991   TWOFISH_context *ctx = context;
992   twofish_amd64_encrypt_block(ctx, out, in);
993   return /*burn_stack*/ (4*sizeof (void*));
994 }
995 
996 #elif defined(USE_ARM_ASM)
997 
998 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)999 twofish_encrypt (void *context, byte *out, const byte *in)
1000 {
1001   TWOFISH_context *ctx = context;
1002   _gcry_twofish_arm_encrypt_block(ctx, out, in);
1003   return /*burn_stack*/ (4*sizeof (void*));
1004 }
1005 
1006 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1007 
/* Generic C implementation: encrypt the 16 bytes at IN into OUT using the
   whitening subkeys, round subkeys and S-boxes stored in CTX.  All four
   input words are loaded before any output word is stored. */
static void
do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
{
  /* The four 32-bit chunks of the text. */
  u32 a, b, c, d;

  /* Temporaries used by the round function. */
  u32 x, y;

  /* Input whitening and packing. */
  INPACK (0, a, 0);
  INPACK (1, b, 1);
  INPACK (2, c, 2);
  INPACK (3, d, 3);

  /* Encryption Feistel cycles. */
  ENCCYCLE (0);
  ENCCYCLE (1);
  ENCCYCLE (2);
  ENCCYCLE (3);
  ENCCYCLE (4);
  ENCCYCLE (5);
  ENCCYCLE (6);
  ENCCYCLE (7);

  /* Output whitening and unpacking.  The word pairs are written in
     swapped order (c, d, a, b) and use whitening subkeys 4..7. */
  OUTUNPACK (0, c, 4);
  OUTUNPACK (1, d, 5);
  OUTUNPACK (2, a, 6);
  OUTUNPACK (3, b, 7);
}
1039 
1040 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)1041 twofish_encrypt (void *context, byte *out, const byte *in)
1042 {
1043   TWOFISH_context *ctx = context;
1044   do_twofish_encrypt (ctx, out, in);
1045   return /*burn_stack*/ (24+3*sizeof (void*));
1046 }
1047 
1048 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1049 
1050 
1051 /* Decrypt one block.  in and out may be the same. */
1052 
1053 #ifdef USE_AMD64_ASM
1054 
1055 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1056 twofish_decrypt (void *context, byte *out, const byte *in)
1057 {
1058   TWOFISH_context *ctx = context;
1059   twofish_amd64_decrypt_block(ctx, out, in);
1060   return /*burn_stack*/ (4*sizeof (void*));
1061 }
1062 
1063 #elif defined(USE_ARM_ASM)
1064 
1065 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1066 twofish_decrypt (void *context, byte *out, const byte *in)
1067 {
1068   TWOFISH_context *ctx = context;
1069   _gcry_twofish_arm_decrypt_block(ctx, out, in);
1070   return /*burn_stack*/ (4*sizeof (void*));
1071 }
1072 
1073 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1074 
/* Generic C implementation: decrypt the 16 bytes at IN into OUT using the
   whitening subkeys, round subkeys and S-boxes stored in CTX.  All four
   input words are loaded before any output word is stored. */
static void
do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
{
  /* The four 32-bit chunks of the text. */
  u32 a, b, c, d;

  /* Temporaries used by the round function. */
  u32 x, y;

  /* Input whitening and packing.  The words are loaded in the order the
     encryption routine stores them (c, d, a, b) with subkeys 4..7. */
  INPACK (0, c, 4);
  INPACK (1, d, 5);
  INPACK (2, a, 6);
  INPACK (3, b, 7);

  /* Decryption Feistel cycles, run from cycle 7 down to cycle 0. */
  DECCYCLE (7);
  DECCYCLE (6);
  DECCYCLE (5);
  DECCYCLE (4);
  DECCYCLE (3);
  DECCYCLE (2);
  DECCYCLE (1);
  DECCYCLE (0);

  /* Output whitening and unpacking. */
  OUTUNPACK (0, a, 0);
  OUTUNPACK (1, b, 1);
  OUTUNPACK (2, c, 2);
  OUTUNPACK (3, d, 3);
}
1106 
1107 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1108 twofish_decrypt (void *context, byte *out, const byte *in)
1109 {
1110   TWOFISH_context *ctx = context;
1111 
1112   do_twofish_decrypt (ctx, out, in);
1113   return /*burn_stack*/ (24+3*sizeof (void*));
1114 }
1115 
1116 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1117 
1118 
1119 
1120 /* Bulk encryption of complete blocks in CTR mode.  This function is only
1121    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
1122    of size TWOFISH_BLOCKSIZE. */
1123 static void
_gcry_twofish_ctr_enc(void * context,unsigned char * ctr,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1124 _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
1125 		      const void *inbuf_arg, size_t nblocks)
1126 {
1127   TWOFISH_context *ctx = context;
1128   unsigned char *outbuf = outbuf_arg;
1129   const unsigned char *inbuf = inbuf_arg;
1130   unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
1131   unsigned int burn, burn_stack_depth = 0;
1132 
1133 #ifdef USE_AVX2
1134   if (ctx->use_avx2)
1135     {
1136       int did_use_avx2 = 0;
1137 
1138       /* Process data in 16 block chunks. */
1139       while (nblocks >= 16)
1140         {
1141           _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
1142 
1143           nblocks -= 16;
1144           outbuf += 16 * TWOFISH_BLOCKSIZE;
1145           inbuf  += 16 * TWOFISH_BLOCKSIZE;
1146           did_use_avx2 = 1;
1147         }
1148 
1149       if (did_use_avx2)
1150         {
1151           /* twofish-avx2 assembly code does not use stack */
1152           if (nblocks == 0)
1153             burn_stack_depth = 0;
1154         }
1155     }
1156 #endif
1157 
1158 #ifdef USE_AMD64_ASM
1159   {
1160     /* Process data in 3 block chunks. */
1161     while (nblocks >= 3)
1162       {
1163         twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
1164 
1165         nblocks -= 3;
1166         outbuf += 3 * TWOFISH_BLOCKSIZE;
1167         inbuf += 3 * TWOFISH_BLOCKSIZE;
1168 
1169         burn = 8 * sizeof(void*);
1170         if (burn > burn_stack_depth)
1171           burn_stack_depth = burn;
1172       }
1173 
1174     /* Use generic code to handle smaller chunks... */
1175     /* TODO: use caching instead? */
1176   }
1177 #endif
1178 
1179   for ( ;nblocks; nblocks-- )
1180     {
1181       /* Encrypt the counter. */
1182       burn = twofish_encrypt(ctx, tmpbuf, ctr);
1183       if (burn > burn_stack_depth)
1184         burn_stack_depth = burn;
1185 
1186       /* XOR the input with the encrypted counter and store in output.  */
1187       cipher_block_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
1188       outbuf += TWOFISH_BLOCKSIZE;
1189       inbuf  += TWOFISH_BLOCKSIZE;
1190       /* Increment the counter.  */
1191       cipher_block_add(ctr, 1, TWOFISH_BLOCKSIZE);
1192     }
1193 
1194   wipememory(tmpbuf, sizeof(tmpbuf));
1195   _gcry_burn_stack(burn_stack_depth);
1196 }
1197 
1198 
1199 /* Bulk decryption of complete blocks in CBC mode.  This function is only
1200    intended for the bulk encryption feature of cipher.c. */
1201 static void
_gcry_twofish_cbc_dec(void * context,unsigned char * iv,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1202 _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
1203 		      const void *inbuf_arg, size_t nblocks)
1204 {
1205   TWOFISH_context *ctx = context;
1206   unsigned char *outbuf = outbuf_arg;
1207   const unsigned char *inbuf = inbuf_arg;
1208   unsigned char savebuf[TWOFISH_BLOCKSIZE];
1209   unsigned int burn, burn_stack_depth = 0;
1210 
1211 #ifdef USE_AVX2
1212   if (ctx->use_avx2)
1213     {
1214       int did_use_avx2 = 0;
1215 
1216       /* Process data in 16 block chunks. */
1217       while (nblocks >= 16)
1218         {
1219           _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
1220 
1221           nblocks -= 16;
1222           outbuf += 16 * TWOFISH_BLOCKSIZE;
1223           inbuf  += 16 * TWOFISH_BLOCKSIZE;
1224           did_use_avx2 = 1;
1225         }
1226 
1227       if (did_use_avx2)
1228         {
1229           /* twofish-avx2 assembly code does not use stack */
1230           if (nblocks == 0)
1231             burn_stack_depth = 0;
1232         }
1233     }
1234 #endif
1235 
1236 #ifdef USE_AMD64_ASM
1237   {
1238     /* Process data in 3 block chunks. */
1239     while (nblocks >= 3)
1240       {
1241         twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
1242 
1243         nblocks -= 3;
1244         outbuf += 3 * TWOFISH_BLOCKSIZE;
1245         inbuf += 3 * TWOFISH_BLOCKSIZE;
1246 
1247         burn = 9 * sizeof(void*);
1248         if (burn > burn_stack_depth)
1249           burn_stack_depth = burn;
1250       }
1251 
1252     /* Use generic code to handle smaller chunks... */
1253   }
1254 #endif
1255 
1256   for ( ;nblocks; nblocks-- )
1257     {
1258       /* INBUF is needed later and it may be identical to OUTBUF, so store
1259          the intermediate result to SAVEBUF.  */
1260       burn = twofish_decrypt (ctx, savebuf, inbuf);
1261       if (burn > burn_stack_depth)
1262         burn_stack_depth = burn;
1263 
1264       cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, TWOFISH_BLOCKSIZE);
1265       inbuf += TWOFISH_BLOCKSIZE;
1266       outbuf += TWOFISH_BLOCKSIZE;
1267     }
1268 
1269   wipememory(savebuf, sizeof(savebuf));
1270   _gcry_burn_stack(burn_stack_depth);
1271 }
1272 
1273 
1274 /* Bulk decryption of complete blocks in CFB mode.  This function is only
1275    intended for the bulk encryption feature of cipher.c. */
1276 static void
_gcry_twofish_cfb_dec(void * context,unsigned char * iv,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1277 _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
1278 		    const void *inbuf_arg, size_t nblocks)
1279 {
1280   TWOFISH_context *ctx = context;
1281   unsigned char *outbuf = outbuf_arg;
1282   const unsigned char *inbuf = inbuf_arg;
1283   unsigned int burn, burn_stack_depth = 0;
1284 
1285 #ifdef USE_AVX2
1286   if (ctx->use_avx2)
1287     {
1288       int did_use_avx2 = 0;
1289 
1290       /* Process data in 16 block chunks. */
1291       while (nblocks >= 16)
1292         {
1293           _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
1294 
1295           nblocks -= 16;
1296           outbuf += 16 * TWOFISH_BLOCKSIZE;
1297           inbuf  += 16 * TWOFISH_BLOCKSIZE;
1298           did_use_avx2 = 1;
1299         }
1300 
1301       if (did_use_avx2)
1302         {
1303           /* twofish-avx2 assembly code does not use stack */
1304           if (nblocks == 0)
1305             burn_stack_depth = 0;
1306         }
1307     }
1308 #endif
1309 
1310 #ifdef USE_AMD64_ASM
1311   {
1312     /* Process data in 3 block chunks. */
1313     while (nblocks >= 3)
1314       {
1315         twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
1316 
1317         nblocks -= 3;
1318         outbuf += 3 * TWOFISH_BLOCKSIZE;
1319         inbuf += 3 * TWOFISH_BLOCKSIZE;
1320 
1321         burn = 8 * sizeof(void*);
1322         if (burn > burn_stack_depth)
1323           burn_stack_depth = burn;
1324       }
1325 
1326     /* Use generic code to handle smaller chunks... */
1327   }
1328 #endif
1329 
1330   for ( ;nblocks; nblocks-- )
1331     {
1332       burn = twofish_encrypt(ctx, iv, iv);
1333       if (burn > burn_stack_depth)
1334         burn_stack_depth = burn;
1335 
1336       cipher_block_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
1337       outbuf += TWOFISH_BLOCKSIZE;
1338       inbuf += TWOFISH_BLOCKSIZE;
1339     }
1340 
1341   _gcry_burn_stack(burn_stack_depth);
1342 }
1343 
1344 /* Bulk encryption/decryption of complete blocks in OCB mode. */
1345 static size_t
_gcry_twofish_ocb_crypt(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks,int encrypt)1346 _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
1347 			const void *inbuf_arg, size_t nblocks, int encrypt)
1348 {
1349 #ifdef USE_AMD64_ASM
1350   TWOFISH_context *ctx = (void *)&c->context.c;
1351   unsigned char *outbuf = outbuf_arg;
1352   const unsigned char *inbuf = inbuf_arg;
1353   unsigned int burn, burn_stack_depth = 0;
1354   u64 blkn = c->u_mode.ocb.data_nblocks;
1355 
1356 #ifdef USE_AVX2
1357   if (ctx->use_avx2)
1358     {
1359       int did_use_avx2 = 0;
1360       u64 Ls[16];
1361       unsigned int n = 16 - (blkn % 16);
1362       u64 *l;
1363       int i;
1364 
1365       if (nblocks >= 16)
1366 	{
1367 	  for (i = 0; i < 16; i += 8)
1368 	    {
1369 	      /* Use u64 to store pointers for x32 support (assembly function
1370 	       * assumes 64-bit pointers). */
1371 	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1372 	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1373 	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1374 	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
1375 	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1376 	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1377 	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1378 	    }
1379 
1380 	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
1381 	  l = &Ls[(15 + n) % 16];
1382 
1383 	  /* Process data in 16 block chunks. */
1384 	  while (nblocks >= 16)
1385 	    {
1386 	      blkn += 16;
1387 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
1388 
1389 	      if (encrypt)
1390 		_gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
1391 					  c->u_ctr.ctr, Ls);
1392 	      else
1393 		_gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
1394 					  c->u_ctr.ctr, Ls);
1395 
1396 	      nblocks -= 16;
1397 	      outbuf += 16 * TWOFISH_BLOCKSIZE;
1398 	      inbuf  += 16 * TWOFISH_BLOCKSIZE;
1399 	      did_use_avx2 = 1;
1400 	    }
1401 	}
1402 
1403       if (did_use_avx2)
1404 	{
1405 	  /* twofish-avx2 assembly code does not use stack */
1406 	  if (nblocks == 0)
1407 	    burn_stack_depth = 0;
1408 	}
1409     }
1410 #endif
1411 
1412   {
1413     /* Use u64 to store pointers for x32 support (assembly function
1414       * assumes 64-bit pointers). */
1415     u64 Ls[3];
1416 
1417     /* Process data in 3 block chunks. */
1418     while (nblocks >= 3)
1419       {
1420 	Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
1421 	Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
1422 	Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
1423 	blkn += 3;
1424 
1425 	if (encrypt)
1426 	  twofish_amd64_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
1427 				Ls);
1428 	else
1429 	  twofish_amd64_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
1430 				Ls);
1431 
1432 	nblocks -= 3;
1433 	outbuf += 3 * TWOFISH_BLOCKSIZE;
1434 	inbuf  += 3 * TWOFISH_BLOCKSIZE;
1435 
1436 	burn = 8 * sizeof(void*);
1437 	if (burn > burn_stack_depth)
1438 	  burn_stack_depth = burn;
1439       }
1440 
1441     /* Use generic code to handle smaller chunks... */
1442   }
1443 
1444   c->u_mode.ocb.data_nblocks = blkn;
1445 
1446   if (burn_stack_depth)
1447     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
1448 #else
1449   (void)c;
1450   (void)outbuf_arg;
1451   (void)inbuf_arg;
1452   (void)encrypt;
1453 #endif
1454 
1455   return nblocks;
1456 }
1457 
1458 /* Bulk authentication of complete blocks in OCB mode. */
1459 static size_t
_gcry_twofish_ocb_auth(gcry_cipher_hd_t c,const void * abuf_arg,size_t nblocks)1460 _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
1461 			size_t nblocks)
1462 {
1463 #ifdef USE_AMD64_ASM
1464   TWOFISH_context *ctx = (void *)&c->context.c;
1465   const unsigned char *abuf = abuf_arg;
1466   unsigned int burn, burn_stack_depth = 0;
1467   u64 blkn = c->u_mode.ocb.aad_nblocks;
1468 
1469 #ifdef USE_AVX2
1470   if (ctx->use_avx2)
1471     {
1472       int did_use_avx2 = 0;
1473       u64 Ls[16];
1474       unsigned int n = 16 - (blkn % 16);
1475       u64 *l;
1476       int i;
1477 
1478       if (nblocks >= 16)
1479 	{
1480 	  for (i = 0; i < 16; i += 8)
1481 	    {
1482 	      /* Use u64 to store pointers for x32 support (assembly function
1483 	       * assumes 64-bit pointers). */
1484 	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1485 	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1486 	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1487 	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
1488 	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1489 	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1490 	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1491 	    }
1492 
1493 	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
1494 	  l = &Ls[(15 + n) % 16];
1495 
1496 	  /* Process data in 16 block chunks. */
1497 	  while (nblocks >= 16)
1498 	    {
1499 	      blkn += 16;
1500 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
1501 
1502 	      _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
1503 					  c->u_mode.ocb.aad_sum, Ls);
1504 
1505 	      nblocks -= 16;
1506 	      abuf += 16 * TWOFISH_BLOCKSIZE;
1507 	      did_use_avx2 = 1;
1508 	    }
1509 	}
1510 
1511       if (did_use_avx2)
1512 	{
1513 	  /* twofish-avx2 assembly code does not use stack */
1514 	  if (nblocks == 0)
1515 	    burn_stack_depth = 0;
1516 	}
1517 
1518       /* Use generic code to handle smaller chunks... */
1519     }
1520 #endif
1521 
1522   {
1523     /* Use u64 to store pointers for x32 support (assembly function
1524       * assumes 64-bit pointers). */
1525     u64 Ls[3];
1526 
1527     /* Process data in 3 block chunks. */
1528     while (nblocks >= 3)
1529       {
1530 	Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
1531 	Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
1532 	Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
1533 	blkn += 3;
1534 
1535 	twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
1536 			      c->u_mode.ocb.aad_sum, Ls);
1537 
1538 	nblocks -= 3;
1539 	abuf += 3 * TWOFISH_BLOCKSIZE;
1540 
1541 	burn = 8 * sizeof(void*);
1542 	if (burn > burn_stack_depth)
1543 	  burn_stack_depth = burn;
1544       }
1545 
1546     /* Use generic code to handle smaller chunks... */
1547   }
1548 
1549   c->u_mode.ocb.aad_nblocks = blkn;
1550 
1551   if (burn_stack_depth)
1552     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
1553 #else
1554   (void)c;
1555   (void)abuf_arg;
1556 #endif
1557 
1558   return nblocks;
1559 }
1560 
1561 
1562 
1563 /* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
1564    encryption.  Returns NULL on success. */
1565 static const char *
selftest_ctr(void)1566 selftest_ctr (void)
1567 {
1568   const int nblocks = 16+1;
1569   const int blocksize = TWOFISH_BLOCKSIZE;
1570   const int context_size = sizeof(TWOFISH_context);
1571 
1572   return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey,
1573            &twofish_encrypt, nblocks, blocksize, context_size);
1574 }
1575 
1576 /* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption.
1577    Returns NULL on success. */
1578 static const char *
selftest_cbc(void)1579 selftest_cbc (void)
1580 {
1581   const int nblocks = 16+2;
1582   const int blocksize = TWOFISH_BLOCKSIZE;
1583   const int context_size = sizeof(TWOFISH_context);
1584 
1585   return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey,
1586            &twofish_encrypt, nblocks, blocksize, context_size);
1587 }
1588 
1589 /* Run the self-tests for TWOFISH-CFB, tests bulk CBC decryption.
1590    Returns NULL on success. */
1591 static const char *
selftest_cfb(void)1592 selftest_cfb (void)
1593 {
1594   const int nblocks = 16+2;
1595   const int blocksize = TWOFISH_BLOCKSIZE;
1596   const int context_size = sizeof(TWOFISH_context);
1597 
1598   return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey,
1599            &twofish_encrypt, nblocks, blocksize, context_size);
1600 }
1601 
1602 
1603 /* Test a single encryption and decryption with each key size. */
1604 
1605 static const char*
selftest(void)1606 selftest (void)
1607 {
1608   TWOFISH_context ctx; /* Expanded key. */
1609   byte scratch[16];    /* Encryption/decryption result buffer. */
1610   cipher_bulk_ops_t bulk_ops;
1611   const char *r;
1612 
1613   /* Test vectors for single encryption/decryption.  Note that I am using
1614    * the vectors from the Twofish paper's "known answer test", I=3 for
1615    * 128-bit and I=4 for 256-bit, instead of the all-0 vectors from the
1616    * "intermediate value test", because an all-0 key would trigger all the
1617    * special cases in the RS matrix multiply, leaving the math untested. */
1618   static  byte plaintext[16] = {
1619     0xD4, 0x91, 0xDB, 0x16, 0xE7, 0xB1, 0xC3, 0x9E,
1620     0x86, 0xCB, 0x08, 0x6B, 0x78, 0x9F, 0x54, 0x19
1621   };
1622   static byte key[16] = {
1623     0x9F, 0x58, 0x9F, 0x5C, 0xF6, 0x12, 0x2C, 0x32,
1624     0xB6, 0xBF, 0xEC, 0x2F, 0x2A, 0xE8, 0xC3, 0x5A
1625   };
1626   static const byte ciphertext[16] = {
1627     0x01, 0x9F, 0x98, 0x09, 0xDE, 0x17, 0x11, 0x85,
1628     0x8F, 0xAA, 0xC3, 0xA3, 0xBA, 0x20, 0xFB, 0xC3
1629   };
1630   static byte plaintext_256[16] = {
1631     0x90, 0xAF, 0xE9, 0x1B, 0xB2, 0x88, 0x54, 0x4F,
1632     0x2C, 0x32, 0xDC, 0x23, 0x9B, 0x26, 0x35, 0xE6
1633   };
1634   static byte key_256[32] = {
1635     0xD4, 0x3B, 0xB7, 0x55, 0x6E, 0xA3, 0x2E, 0x46,
1636     0xF2, 0xA2, 0x82, 0xB7, 0xD4, 0x5B, 0x4E, 0x0D,
1637     0x57, 0xFF, 0x73, 0x9D, 0x4D, 0xC9, 0x2C, 0x1B,
1638     0xD7, 0xFC, 0x01, 0x70, 0x0C, 0xC8, 0x21, 0x6F
1639   };
1640   static const byte ciphertext_256[16] = {
1641     0x6C, 0xB4, 0x56, 0x1C, 0x40, 0xBF, 0x0A, 0x97,
1642     0x05, 0x93, 0x1C, 0xB6, 0xD4, 0x08, 0xE7, 0xFA
1643   };
1644 
1645   twofish_setkey (&ctx, key, sizeof(key), &bulk_ops);
1646   twofish_encrypt (&ctx, scratch, plaintext);
1647   if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
1648     return "Twofish-128 test encryption failed.";
1649   twofish_decrypt (&ctx, scratch, scratch);
1650   if (memcmp (scratch, plaintext, sizeof (plaintext)))
1651     return "Twofish-128 test decryption failed.";
1652 
1653   twofish_setkey (&ctx, key_256, sizeof(key_256), &bulk_ops);
1654   twofish_encrypt (&ctx, scratch, plaintext_256);
1655   if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
1656     return "Twofish-256 test encryption failed.";
1657   twofish_decrypt (&ctx, scratch, scratch);
1658   if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
1659     return "Twofish-256 test decryption failed.";
1660 
1661   if ((r = selftest_ctr()) != NULL)
1662     return r;
1663   if ((r = selftest_cbc()) != NULL)
1664     return r;
1665   if ((r = selftest_cfb()) != NULL)
1666     return r;
1667 
1668   return NULL;
1669 }
1670 
1671 /* More complete test program.	This does 1000 encryptions and decryptions
1672  * with each of 250 128-bit keys and 2000 encryptions and decryptions with
1673  * each of 125 256-bit keys, using a feedback scheme similar to a Feistel
1674  * cipher, so as to be sure of testing all the table entries pretty
1675  * thoroughly.	We keep changing the keys so as to get a more meaningful
1676  * performance number, since the key setup is non-trivial for Twofish. */
1677 
1678 #ifdef TEST
1679 
1680 #include <stdio.h>
1681 #include <string.h>
1682 #include <time.h>
1683 
1684 int
main()1685 main()
1686 {
1687   TWOFISH_context ctx;     /* Expanded key. */
1688   int i, j;                /* Loop counters. */
1689   cipher_bulk_ops_t bulk_ops;
1690 
1691   const char *encrypt_msg; /* Message to print regarding encryption test;
1692                             * the printf is done outside the loop to avoid
1693                             * stuffing up the timing. */
1694   clock_t timer; /* For computing elapsed time. */
1695 
1696   /* Test buffer. */
1697   byte buffer[4][16] = {
1698     {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
1699      0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
1700     {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
1701      0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
1702     {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
1703      0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
1704     {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
1705      0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
1706   };
1707 
1708   /* Expected outputs for the million-operation test */
1709   static const byte test_encrypt[4][16] = {
1710     {0xC8, 0x23, 0xB8, 0xB7, 0x6B, 0xFE, 0x91, 0x13,
1711      0x2F, 0xA7, 0x5E, 0xE6, 0x94, 0x77, 0x6F, 0x6B},
1712     {0x90, 0x36, 0xD8, 0x29, 0xD5, 0x96, 0xC2, 0x8E,
1713      0xE4, 0xFF, 0x76, 0xBC, 0xE5, 0x77, 0x88, 0x27},
1714     {0xB8, 0x78, 0x69, 0xAF, 0x42, 0x8B, 0x48, 0x64,
1715      0xF7, 0xE9, 0xF3, 0x9C, 0x42, 0x18, 0x7B, 0x73},
1716     {0x7A, 0x88, 0xFB, 0xEB, 0x90, 0xA4, 0xB4, 0xA8,
1717      0x43, 0xA3, 0x1D, 0xF1, 0x26, 0xC4, 0x53, 0x57}
1718   };
1719   static const byte test_decrypt[4][16] = {
1720     {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
1721      0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
1722     {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
1723      0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
1724     {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
1725      0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
1726     {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
1727      0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
1728   };
1729 
1730   /* Start the timer ticking. */
1731   timer = clock ();
1732 
1733   /* Encryption test. */
1734   for (i = 0; i < 125; i++)
1735     {
1736       twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
1737       for (j = 0; j < 1000; j++)
1738         twofish_encrypt (&ctx, buffer[2], buffer[2]);
1739       twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
1740       for (j = 0; j < 1000; j++)
1741         twofish_encrypt (&ctx, buffer[3], buffer[3]);
1742       twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
1743       for (j = 0; j < 1000; j++) {
1744         twofish_encrypt (&ctx, buffer[0], buffer[0]);
1745         twofish_encrypt (&ctx, buffer[1], buffer[1]);
1746       }
1747     }
1748   encrypt_msg = memcmp (buffer, test_encrypt, sizeof (test_encrypt)) ?
1749     "encryption failure!\n" : "encryption OK!\n";
1750 
1751   /* Decryption test. */
1752   for (i = 0; i < 125; i++)
1753     {
1754       twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
1755       for (j = 0; j < 1000; j++) {
1756         twofish_decrypt (&ctx, buffer[0], buffer[0]);
1757         twofish_decrypt (&ctx, buffer[1], buffer[1]);
1758       }
1759       twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
1760       for (j = 0; j < 1000; j++)
1761         twofish_decrypt (&ctx, buffer[3], buffer[3]);
1762       twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
1763       for (j = 0; j < 1000; j++)
1764         twofish_decrypt (&ctx, buffer[2], buffer[2]);
1765     }
1766 
1767   /* Stop the timer, and print results. */
1768   timer = clock () - timer;
1769   printf (encrypt_msg);
1770   printf (memcmp (buffer, test_decrypt, sizeof (test_decrypt)) ?
1771           "decryption failure!\n" : "decryption OK!\n");
1772   printf ("elapsed time: %.1f s.\n", (float) timer / CLOCKS_PER_SEC);
1773 
1774   return 0;
1775 }
1776 
1777 #endif /* TEST */
1778 
1779 
1780 
/* Cipher specification for GCRY_CIPHER_TWOFISH, named "TWOFISH".  It
   refers to the twofish_setkey, twofish_encrypt and twofish_decrypt
   routines and carries sizeof (TWOFISH_context).
   NOTE(review): the initializer is positional; 16 and 256 are presumably
   the block size in bytes and the key length in bits, and the two NULLs
   presumably optional tables -- confirm against the declaration of
   gcry_cipher_spec_t. */
gcry_cipher_spec_t _gcry_cipher_spec_twofish =
  {
    GCRY_CIPHER_TWOFISH, {0, 0},
    "TWOFISH", NULL, NULL, 16, 256, sizeof (TWOFISH_context),
    twofish_setkey, twofish_encrypt, twofish_decrypt
  };
1787 
/* Cipher specification for GCRY_CIPHER_TWOFISH128, named "TWOFISH128".
   It refers to the same twofish_setkey, twofish_encrypt and
   twofish_decrypt routines as the "TWOFISH" specification and carries
   sizeof (TWOFISH_context).
   NOTE(review): the initializer is positional; 16 and 128 are presumably
   the block size in bytes and the key length in bits, and the two NULLs
   presumably optional tables -- confirm against the declaration of
   gcry_cipher_spec_t. */
gcry_cipher_spec_t _gcry_cipher_spec_twofish128 =
  {
    GCRY_CIPHER_TWOFISH128, {0, 0},
    "TWOFISH128", NULL, NULL, 16, 128, sizeof (TWOFISH_context),
    twofish_setkey, twofish_encrypt, twofish_decrypt
  };
1794