1 /* Twofish for GPG
2 * Copyright (C) 1998, 2002, 2003 Free Software Foundation, Inc.
3 * Written by Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
4 * 256-bit key length added March 20, 1999
5 * Some modifications to reduce the text size by Werner Koch, April, 1998
6 *
7 * This file is part of Libgcrypt.
8 *
9 * Libgcrypt is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as
11 * published by the Free Software Foundation; either version 2.1 of
12 * the License, or (at your option) any later version.
13 *
14 * Libgcrypt is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
22 ********************************************************************
23 *
24 * This code is a "clean room" implementation, written from the paper
25 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
26 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
27 * through http://www.counterpane.com/twofish.html
28 *
29 * For background information on multiplication in finite fields, used for
30 * the matrix operations in the key schedule, see the book _Contemporary
31 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
32 * Third Edition.
33 *
34 * Only the 128- and 256-bit key sizes are supported. This code is intended
35 * for GNU C on a 32-bit system, but it should work almost anywhere. Loops
36 * are unrolled, precomputation tables are used, etc., for maximum speed at
37 * some cost in memory consumption. */
38
39 #include <config.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h> /* for memcmp() */
43
44 #include "types.h" /* for byte and u32 typedefs */
45 #include "g10lib.h"
46 #include "cipher.h"
47 #include "bufhelp.h"
48 #include "cipher-internal.h"
49 #include "cipher-selftest.h"
50
51
#define TWOFISH_BLOCKSIZE 16


/* USE_AMD64_ASM indicates whether to use AMD64 assembly code.  It is set
 * only for x86-64 builds whose configuration advertises a compatible
 * assembler (either the AMD64 or the Win64 flavour). */
#undef USE_AMD64_ASM
#if defined(__x86_64__)
# if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) \
     || defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
#  define USE_AMD64_ASM 1
# endif
#endif

/* USE_ARM_ASM indicates whether to use ARM assembly code.  It is set for
 * little-endian 32-bit ARM or little-endian AArch64 when the matching
 * compatible-assembler macro is defined. */
#undef USE_ARM_ASM
#if (defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)) \
    || (defined(__AARCH64EL__) \
        && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS))
# define USE_ARM_ASM 1
#endif

/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code.  The
 * platform requirement is exactly the one that defines USE_AMD64_ASM
 * above; on top of that ENABLE_AVX2_SUPPORT must be defined. */
#undef USE_AVX2
#if defined(USE_AMD64_ASM) && defined(ENABLE_AVX2_SUPPORT)
# define USE_AVX2 1
#endif
83
84
85 /* Prototype for the self-test function. */
86 static const char *selftest(void);
87
88
89 /* Prototypes for the bulk functions. */
90 static void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
91 void *outbuf_arg, const void *inbuf_arg,
92 size_t nblocks);
93 static void _gcry_twofish_cbc_dec (void *context, unsigned char *iv,
94 void *outbuf_arg, const void *inbuf_arg,
95 size_t nblocks);
96 static void _gcry_twofish_cfb_dec (void *context, unsigned char *iv,
97 void *outbuf_arg, const void *inbuf_arg,
98 size_t nblocks);
99 static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
100 const void *inbuf_arg, size_t nblocks,
101 int encrypt);
102 static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
103 size_t nblocks);
104
105
106 /* Structure for an expanded Twofish key. s contains the key-dependent
107 * S-boxes composed with the MDS matrix; w contains the eight "whitening"
108 * subkeys, K[0] through K[7]. k holds the remaining, "round" subkeys. Note
109 * that k[i] corresponds to what the Twofish paper calls K[i+8]. */
110 typedef struct {
111 u32 s[4][256], w[8], k[32];
112
113 #ifdef USE_AVX2
114 int use_avx2;
115 #endif
116 } TWOFISH_context;
117
118
/* Assembly implementations use SystemV ABI, ABI conversion and additional
 * stack to store XMM6-XMM15 needed on Win64.
 *
 * ASM_FUNC_ABI is defined only in USE_AVX2 builds: it expands to the
 * sysv_abi function attribute when the Win64 assembler macro is defined,
 * and to nothing otherwise.  Without USE_AVX2 it stays undefined. */
#undef ASM_FUNC_ABI
#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#elif defined(USE_AVX2)
# define ASM_FUNC_ABI
#endif
129
130
131 /* These two tables are the q0 and q1 permutations, exactly as described in
132 * the Twofish paper. */
133
134 static const byte q0[256] = {
135 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78,
136 0xE4, 0xDD, 0xD1, 0x38, 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C,
137 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, 0xF2, 0xD0, 0x8B, 0x30,
138 0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82,
139 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE,
140 0x16, 0x0C, 0xE3, 0x61, 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B,
141 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, 0xE1, 0xE6, 0xBD, 0x45,
142 0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7,
143 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF,
144 0x33, 0xC9, 0x62, 0x71, 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8,
145 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, 0xA1, 0x1D, 0xAA, 0xED,
146 0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90,
147 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B,
148 0x5F, 0x93, 0x0A, 0xEF, 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B,
149 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, 0x2A, 0xCE, 0xCB, 0x2F,
150 0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A,
151 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17,
152 0x55, 0x1F, 0x8A, 0x7D, 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72,
153 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, 0x6E, 0x50, 0xDE, 0x68,
154 0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4,
155 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42,
156 0x4A, 0x5E, 0xC1, 0xE0
157 };
158
159 static const byte q1[256] = {
160 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B,
161 0x45, 0x7D, 0xE8, 0x4B, 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1,
162 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, 0x5E, 0xBA, 0xAE, 0x5B,
163 0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5,
164 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54,
165 0x92, 0x74, 0x36, 0x51, 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96,
166 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, 0x13, 0x95, 0x9C, 0xC7,
167 0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8,
168 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF,
169 0x40, 0xE7, 0x2B, 0xE2, 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9,
170 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, 0x66, 0x94, 0xA1, 0x1D,
171 0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E,
172 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21,
173 0xC4, 0x1A, 0xEB, 0xD9, 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01,
174 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, 0x4F, 0xF2, 0x65, 0x8E,
175 0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64,
176 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44,
177 0xE0, 0x4D, 0x43, 0x69, 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E,
178 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, 0x22, 0xC9, 0xC0, 0x9B,
179 0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9,
180 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56,
181 0x55, 0x09, 0xBE, 0x91
182 };
183
184 /* These MDS tables are actually tables of MDS composed with q0 and q1,
185 * because it is only ever used that way and we can save some time by
186 * precomputing. Of course the main saving comes from precomputing the
187 * GF(2^8) multiplication involved in the MDS matrix multiply; by looking
188 * things up in these tables we reduce the matrix multiply to four lookups
189 * and three XORs. Semi-formally, the definition of these tables is:
190 * mds[0][i] = MDS (q1[i] 0 0 0)^T mds[1][i] = MDS (0 q0[i] 0 0)^T
191 * mds[2][i] = MDS (0 0 q1[i] 0)^T mds[3][i] = MDS (0 0 0 q0[i])^T
192 * where ^T means "transpose", the matrix multiply is performed in GF(2^8)
193 * represented as GF(2)[x]/v(x) where v(x)=x^8+x^6+x^5+x^3+1 as described
194 * by Schneier et al, and I'm casually glossing over the byte/word
195 * conversion issues. */
196
197 static const u32 mds[4][256] = {
198 {0xBCBC3275, 0xECEC21F3, 0x202043C6, 0xB3B3C9F4, 0xDADA03DB, 0x02028B7B,
199 0xE2E22BFB, 0x9E9EFAC8, 0xC9C9EC4A, 0xD4D409D3, 0x18186BE6, 0x1E1E9F6B,
200 0x98980E45, 0xB2B2387D, 0xA6A6D2E8, 0x2626B74B, 0x3C3C57D6, 0x93938A32,
201 0x8282EED8, 0x525298FD, 0x7B7BD437, 0xBBBB3771, 0x5B5B97F1, 0x474783E1,
202 0x24243C30, 0x5151E20F, 0xBABAC6F8, 0x4A4AF31B, 0xBFBF4887, 0x0D0D70FA,
203 0xB0B0B306, 0x7575DE3F, 0xD2D2FD5E, 0x7D7D20BA, 0x666631AE, 0x3A3AA35B,
204 0x59591C8A, 0x00000000, 0xCDCD93BC, 0x1A1AE09D, 0xAEAE2C6D, 0x7F7FABC1,
205 0x2B2BC7B1, 0xBEBEB90E, 0xE0E0A080, 0x8A8A105D, 0x3B3B52D2, 0x6464BAD5,
206 0xD8D888A0, 0xE7E7A584, 0x5F5FE807, 0x1B1B1114, 0x2C2CC2B5, 0xFCFCB490,
207 0x3131272C, 0x808065A3, 0x73732AB2, 0x0C0C8173, 0x79795F4C, 0x6B6B4154,
208 0x4B4B0292, 0x53536974, 0x94948F36, 0x83831F51, 0x2A2A3638, 0xC4C49CB0,
209 0x2222C8BD, 0xD5D5F85A, 0xBDBDC3FC, 0x48487860, 0xFFFFCE62, 0x4C4C0796,
210 0x4141776C, 0xC7C7E642, 0xEBEB24F7, 0x1C1C1410, 0x5D5D637C, 0x36362228,
211 0x6767C027, 0xE9E9AF8C, 0x4444F913, 0x1414EA95, 0xF5F5BB9C, 0xCFCF18C7,
212 0x3F3F2D24, 0xC0C0E346, 0x7272DB3B, 0x54546C70, 0x29294CCA, 0xF0F035E3,
213 0x0808FE85, 0xC6C617CB, 0xF3F34F11, 0x8C8CE4D0, 0xA4A45993, 0xCACA96B8,
214 0x68683BA6, 0xB8B84D83, 0x38382820, 0xE5E52EFF, 0xADAD569F, 0x0B0B8477,
215 0xC8C81DC3, 0x9999FFCC, 0x5858ED03, 0x19199A6F, 0x0E0E0A08, 0x95957EBF,
216 0x70705040, 0xF7F730E7, 0x6E6ECF2B, 0x1F1F6EE2, 0xB5B53D79, 0x09090F0C,
217 0x616134AA, 0x57571682, 0x9F9F0B41, 0x9D9D803A, 0x111164EA, 0x2525CDB9,
218 0xAFAFDDE4, 0x4545089A, 0xDFDF8DA4, 0xA3A35C97, 0xEAEAD57E, 0x353558DA,
219 0xEDEDD07A, 0x4343FC17, 0xF8F8CB66, 0xFBFBB194, 0x3737D3A1, 0xFAFA401D,
220 0xC2C2683D, 0xB4B4CCF0, 0x32325DDE, 0x9C9C71B3, 0x5656E70B, 0xE3E3DA72,
221 0x878760A7, 0x15151B1C, 0xF9F93AEF, 0x6363BFD1, 0x3434A953, 0x9A9A853E,
222 0xB1B1428F, 0x7C7CD133, 0x88889B26, 0x3D3DA65F, 0xA1A1D7EC, 0xE4E4DF76,
223 0x8181942A, 0x91910149, 0x0F0FFB81, 0xEEEEAA88, 0x161661EE, 0xD7D77321,
224 0x9797F5C4, 0xA5A5A81A, 0xFEFE3FEB, 0x6D6DB5D9, 0x7878AEC5, 0xC5C56D39,
225 0x1D1DE599, 0x7676A4CD, 0x3E3EDCAD, 0xCBCB6731, 0xB6B6478B, 0xEFEF5B01,
226 0x12121E18, 0x6060C523, 0x6A6AB0DD, 0x4D4DF61F, 0xCECEE94E, 0xDEDE7C2D,
227 0x55559DF9, 0x7E7E5A48, 0x2121B24F, 0x03037AF2, 0xA0A02665, 0x5E5E198E,
228 0x5A5A6678, 0x65654B5C, 0x62624E58, 0xFDFD4519, 0x0606F48D, 0x404086E5,
229 0xF2F2BE98, 0x3333AC57, 0x17179067, 0x05058E7F, 0xE8E85E05, 0x4F4F7D64,
230 0x89896AAF, 0x10109563, 0x74742FB6, 0x0A0A75FE, 0x5C5C92F5, 0x9B9B74B7,
231 0x2D2D333C, 0x3030D6A5, 0x2E2E49CE, 0x494989E9, 0x46467268, 0x77775544,
232 0xA8A8D8E0, 0x9696044D, 0x2828BD43, 0xA9A92969, 0xD9D97929, 0x8686912E,
233 0xD1D187AC, 0xF4F44A15, 0x8D8D1559, 0xD6D682A8, 0xB9B9BC0A, 0x42420D9E,
234 0xF6F6C16E, 0x2F2FB847, 0xDDDD06DF, 0x23233934, 0xCCCC6235, 0xF1F1C46A,
235 0xC1C112CF, 0x8585EBDC, 0x8F8F9E22, 0x7171A1C9, 0x9090F0C0, 0xAAAA539B,
236 0x0101F189, 0x8B8BE1D4, 0x4E4E8CED, 0x8E8E6FAB, 0xABABA212, 0x6F6F3EA2,
237 0xE6E6540D, 0xDBDBF252, 0x92927BBB, 0xB7B7B602, 0x6969CA2F, 0x3939D9A9,
238 0xD3D30CD7, 0xA7A72361, 0xA2A2AD1E, 0xC3C399B4, 0x6C6C4450, 0x07070504,
239 0x04047FF6, 0x272746C2, 0xACACA716, 0xD0D07625, 0x50501386, 0xDCDCF756,
240 0x84841A55, 0xE1E15109, 0x7A7A25BE, 0x1313EF91},
241
242 {0xA9D93939, 0x67901717, 0xB3719C9C, 0xE8D2A6A6, 0x04050707, 0xFD985252,
243 0xA3658080, 0x76DFE4E4, 0x9A084545, 0x92024B4B, 0x80A0E0E0, 0x78665A5A,
244 0xE4DDAFAF, 0xDDB06A6A, 0xD1BF6363, 0x38362A2A, 0x0D54E6E6, 0xC6432020,
245 0x3562CCCC, 0x98BEF2F2, 0x181E1212, 0xF724EBEB, 0xECD7A1A1, 0x6C774141,
246 0x43BD2828, 0x7532BCBC, 0x37D47B7B, 0x269B8888, 0xFA700D0D, 0x13F94444,
247 0x94B1FBFB, 0x485A7E7E, 0xF27A0303, 0xD0E48C8C, 0x8B47B6B6, 0x303C2424,
248 0x84A5E7E7, 0x54416B6B, 0xDF06DDDD, 0x23C56060, 0x1945FDFD, 0x5BA33A3A,
249 0x3D68C2C2, 0x59158D8D, 0xF321ECEC, 0xAE316666, 0xA23E6F6F, 0x82165757,
250 0x63951010, 0x015BEFEF, 0x834DB8B8, 0x2E918686, 0xD9B56D6D, 0x511F8383,
251 0x9B53AAAA, 0x7C635D5D, 0xA63B6868, 0xEB3FFEFE, 0xA5D63030, 0xBE257A7A,
252 0x16A7ACAC, 0x0C0F0909, 0xE335F0F0, 0x6123A7A7, 0xC0F09090, 0x8CAFE9E9,
253 0x3A809D9D, 0xF5925C5C, 0x73810C0C, 0x2C273131, 0x2576D0D0, 0x0BE75656,
254 0xBB7B9292, 0x4EE9CECE, 0x89F10101, 0x6B9F1E1E, 0x53A93434, 0x6AC4F1F1,
255 0xB499C3C3, 0xF1975B5B, 0xE1834747, 0xE66B1818, 0xBDC82222, 0x450E9898,
256 0xE26E1F1F, 0xF4C9B3B3, 0xB62F7474, 0x66CBF8F8, 0xCCFF9999, 0x95EA1414,
257 0x03ED5858, 0x56F7DCDC, 0xD4E18B8B, 0x1C1B1515, 0x1EADA2A2, 0xD70CD3D3,
258 0xFB2BE2E2, 0xC31DC8C8, 0x8E195E5E, 0xB5C22C2C, 0xE9894949, 0xCF12C1C1,
259 0xBF7E9595, 0xBA207D7D, 0xEA641111, 0x77840B0B, 0x396DC5C5, 0xAF6A8989,
260 0x33D17C7C, 0xC9A17171, 0x62CEFFFF, 0x7137BBBB, 0x81FB0F0F, 0x793DB5B5,
261 0x0951E1E1, 0xADDC3E3E, 0x242D3F3F, 0xCDA47676, 0xF99D5555, 0xD8EE8282,
262 0xE5864040, 0xC5AE7878, 0xB9CD2525, 0x4D049696, 0x44557777, 0x080A0E0E,
263 0x86135050, 0xE730F7F7, 0xA1D33737, 0x1D40FAFA, 0xAA346161, 0xED8C4E4E,
264 0x06B3B0B0, 0x706C5454, 0xB22A7373, 0xD2523B3B, 0x410B9F9F, 0x7B8B0202,
265 0xA088D8D8, 0x114FF3F3, 0x3167CBCB, 0xC2462727, 0x27C06767, 0x90B4FCFC,
266 0x20283838, 0xF67F0404, 0x60784848, 0xFF2EE5E5, 0x96074C4C, 0x5C4B6565,
267 0xB1C72B2B, 0xAB6F8E8E, 0x9E0D4242, 0x9CBBF5F5, 0x52F2DBDB, 0x1BF34A4A,
268 0x5FA63D3D, 0x9359A4A4, 0x0ABCB9B9, 0xEF3AF9F9, 0x91EF1313, 0x85FE0808,
269 0x49019191, 0xEE611616, 0x2D7CDEDE, 0x4FB22121, 0x8F42B1B1, 0x3BDB7272,
270 0x47B82F2F, 0x8748BFBF, 0x6D2CAEAE, 0x46E3C0C0, 0xD6573C3C, 0x3E859A9A,
271 0x6929A9A9, 0x647D4F4F, 0x2A948181, 0xCE492E2E, 0xCB17C6C6, 0x2FCA6969,
272 0xFCC3BDBD, 0x975CA3A3, 0x055EE8E8, 0x7AD0EDED, 0xAC87D1D1, 0x7F8E0505,
273 0xD5BA6464, 0x1AA8A5A5, 0x4BB72626, 0x0EB9BEBE, 0xA7608787, 0x5AF8D5D5,
274 0x28223636, 0x14111B1B, 0x3FDE7575, 0x2979D9D9, 0x88AAEEEE, 0x3C332D2D,
275 0x4C5F7979, 0x02B6B7B7, 0xB896CACA, 0xDA583535, 0xB09CC4C4, 0x17FC4343,
276 0x551A8484, 0x1FF64D4D, 0x8A1C5959, 0x7D38B2B2, 0x57AC3333, 0xC718CFCF,
277 0x8DF40606, 0x74695353, 0xB7749B9B, 0xC4F59797, 0x9F56ADAD, 0x72DAE3E3,
278 0x7ED5EAEA, 0x154AF4F4, 0x229E8F8F, 0x12A2ABAB, 0x584E6262, 0x07E85F5F,
279 0x99E51D1D, 0x34392323, 0x6EC1F6F6, 0x50446C6C, 0xDE5D3232, 0x68724646,
280 0x6526A0A0, 0xBC93CDCD, 0xDB03DADA, 0xF8C6BABA, 0xC8FA9E9E, 0xA882D6D6,
281 0x2BCF6E6E, 0x40507070, 0xDCEB8585, 0xFE750A0A, 0x328A9393, 0xA48DDFDF,
282 0xCA4C2929, 0x10141C1C, 0x2173D7D7, 0xF0CCB4B4, 0xD309D4D4, 0x5D108A8A,
283 0x0FE25151, 0x00000000, 0x6F9A1919, 0x9DE01A1A, 0x368F9494, 0x42E6C7C7,
284 0x4AECC9C9, 0x5EFDD2D2, 0xC1AB7F7F, 0xE0D8A8A8},
285
286 {0xBC75BC32, 0xECF3EC21, 0x20C62043, 0xB3F4B3C9, 0xDADBDA03, 0x027B028B,
287 0xE2FBE22B, 0x9EC89EFA, 0xC94AC9EC, 0xD4D3D409, 0x18E6186B, 0x1E6B1E9F,
288 0x9845980E, 0xB27DB238, 0xA6E8A6D2, 0x264B26B7, 0x3CD63C57, 0x9332938A,
289 0x82D882EE, 0x52FD5298, 0x7B377BD4, 0xBB71BB37, 0x5BF15B97, 0x47E14783,
290 0x2430243C, 0x510F51E2, 0xBAF8BAC6, 0x4A1B4AF3, 0xBF87BF48, 0x0DFA0D70,
291 0xB006B0B3, 0x753F75DE, 0xD25ED2FD, 0x7DBA7D20, 0x66AE6631, 0x3A5B3AA3,
292 0x598A591C, 0x00000000, 0xCDBCCD93, 0x1A9D1AE0, 0xAE6DAE2C, 0x7FC17FAB,
293 0x2BB12BC7, 0xBE0EBEB9, 0xE080E0A0, 0x8A5D8A10, 0x3BD23B52, 0x64D564BA,
294 0xD8A0D888, 0xE784E7A5, 0x5F075FE8, 0x1B141B11, 0x2CB52CC2, 0xFC90FCB4,
295 0x312C3127, 0x80A38065, 0x73B2732A, 0x0C730C81, 0x794C795F, 0x6B546B41,
296 0x4B924B02, 0x53745369, 0x9436948F, 0x8351831F, 0x2A382A36, 0xC4B0C49C,
297 0x22BD22C8, 0xD55AD5F8, 0xBDFCBDC3, 0x48604878, 0xFF62FFCE, 0x4C964C07,
298 0x416C4177, 0xC742C7E6, 0xEBF7EB24, 0x1C101C14, 0x5D7C5D63, 0x36283622,
299 0x672767C0, 0xE98CE9AF, 0x441344F9, 0x149514EA, 0xF59CF5BB, 0xCFC7CF18,
300 0x3F243F2D, 0xC046C0E3, 0x723B72DB, 0x5470546C, 0x29CA294C, 0xF0E3F035,
301 0x088508FE, 0xC6CBC617, 0xF311F34F, 0x8CD08CE4, 0xA493A459, 0xCAB8CA96,
302 0x68A6683B, 0xB883B84D, 0x38203828, 0xE5FFE52E, 0xAD9FAD56, 0x0B770B84,
303 0xC8C3C81D, 0x99CC99FF, 0x580358ED, 0x196F199A, 0x0E080E0A, 0x95BF957E,
304 0x70407050, 0xF7E7F730, 0x6E2B6ECF, 0x1FE21F6E, 0xB579B53D, 0x090C090F,
305 0x61AA6134, 0x57825716, 0x9F419F0B, 0x9D3A9D80, 0x11EA1164, 0x25B925CD,
306 0xAFE4AFDD, 0x459A4508, 0xDFA4DF8D, 0xA397A35C, 0xEA7EEAD5, 0x35DA3558,
307 0xED7AEDD0, 0x431743FC, 0xF866F8CB, 0xFB94FBB1, 0x37A137D3, 0xFA1DFA40,
308 0xC23DC268, 0xB4F0B4CC, 0x32DE325D, 0x9CB39C71, 0x560B56E7, 0xE372E3DA,
309 0x87A78760, 0x151C151B, 0xF9EFF93A, 0x63D163BF, 0x345334A9, 0x9A3E9A85,
310 0xB18FB142, 0x7C337CD1, 0x8826889B, 0x3D5F3DA6, 0xA1ECA1D7, 0xE476E4DF,
311 0x812A8194, 0x91499101, 0x0F810FFB, 0xEE88EEAA, 0x16EE1661, 0xD721D773,
312 0x97C497F5, 0xA51AA5A8, 0xFEEBFE3F, 0x6DD96DB5, 0x78C578AE, 0xC539C56D,
313 0x1D991DE5, 0x76CD76A4, 0x3EAD3EDC, 0xCB31CB67, 0xB68BB647, 0xEF01EF5B,
314 0x1218121E, 0x602360C5, 0x6ADD6AB0, 0x4D1F4DF6, 0xCE4ECEE9, 0xDE2DDE7C,
315 0x55F9559D, 0x7E487E5A, 0x214F21B2, 0x03F2037A, 0xA065A026, 0x5E8E5E19,
316 0x5A785A66, 0x655C654B, 0x6258624E, 0xFD19FD45, 0x068D06F4, 0x40E54086,
317 0xF298F2BE, 0x335733AC, 0x17671790, 0x057F058E, 0xE805E85E, 0x4F644F7D,
318 0x89AF896A, 0x10631095, 0x74B6742F, 0x0AFE0A75, 0x5CF55C92, 0x9BB79B74,
319 0x2D3C2D33, 0x30A530D6, 0x2ECE2E49, 0x49E94989, 0x46684672, 0x77447755,
320 0xA8E0A8D8, 0x964D9604, 0x284328BD, 0xA969A929, 0xD929D979, 0x862E8691,
321 0xD1ACD187, 0xF415F44A, 0x8D598D15, 0xD6A8D682, 0xB90AB9BC, 0x429E420D,
322 0xF66EF6C1, 0x2F472FB8, 0xDDDFDD06, 0x23342339, 0xCC35CC62, 0xF16AF1C4,
323 0xC1CFC112, 0x85DC85EB, 0x8F228F9E, 0x71C971A1, 0x90C090F0, 0xAA9BAA53,
324 0x018901F1, 0x8BD48BE1, 0x4EED4E8C, 0x8EAB8E6F, 0xAB12ABA2, 0x6FA26F3E,
325 0xE60DE654, 0xDB52DBF2, 0x92BB927B, 0xB702B7B6, 0x692F69CA, 0x39A939D9,
326 0xD3D7D30C, 0xA761A723, 0xA21EA2AD, 0xC3B4C399, 0x6C506C44, 0x07040705,
327 0x04F6047F, 0x27C22746, 0xAC16ACA7, 0xD025D076, 0x50865013, 0xDC56DCF7,
328 0x8455841A, 0xE109E151, 0x7ABE7A25, 0x139113EF},
329
330 {0xD939A9D9, 0x90176790, 0x719CB371, 0xD2A6E8D2, 0x05070405, 0x9852FD98,
331 0x6580A365, 0xDFE476DF, 0x08459A08, 0x024B9202, 0xA0E080A0, 0x665A7866,
332 0xDDAFE4DD, 0xB06ADDB0, 0xBF63D1BF, 0x362A3836, 0x54E60D54, 0x4320C643,
333 0x62CC3562, 0xBEF298BE, 0x1E12181E, 0x24EBF724, 0xD7A1ECD7, 0x77416C77,
334 0xBD2843BD, 0x32BC7532, 0xD47B37D4, 0x9B88269B, 0x700DFA70, 0xF94413F9,
335 0xB1FB94B1, 0x5A7E485A, 0x7A03F27A, 0xE48CD0E4, 0x47B68B47, 0x3C24303C,
336 0xA5E784A5, 0x416B5441, 0x06DDDF06, 0xC56023C5, 0x45FD1945, 0xA33A5BA3,
337 0x68C23D68, 0x158D5915, 0x21ECF321, 0x3166AE31, 0x3E6FA23E, 0x16578216,
338 0x95106395, 0x5BEF015B, 0x4DB8834D, 0x91862E91, 0xB56DD9B5, 0x1F83511F,
339 0x53AA9B53, 0x635D7C63, 0x3B68A63B, 0x3FFEEB3F, 0xD630A5D6, 0x257ABE25,
340 0xA7AC16A7, 0x0F090C0F, 0x35F0E335, 0x23A76123, 0xF090C0F0, 0xAFE98CAF,
341 0x809D3A80, 0x925CF592, 0x810C7381, 0x27312C27, 0x76D02576, 0xE7560BE7,
342 0x7B92BB7B, 0xE9CE4EE9, 0xF10189F1, 0x9F1E6B9F, 0xA93453A9, 0xC4F16AC4,
343 0x99C3B499, 0x975BF197, 0x8347E183, 0x6B18E66B, 0xC822BDC8, 0x0E98450E,
344 0x6E1FE26E, 0xC9B3F4C9, 0x2F74B62F, 0xCBF866CB, 0xFF99CCFF, 0xEA1495EA,
345 0xED5803ED, 0xF7DC56F7, 0xE18BD4E1, 0x1B151C1B, 0xADA21EAD, 0x0CD3D70C,
346 0x2BE2FB2B, 0x1DC8C31D, 0x195E8E19, 0xC22CB5C2, 0x8949E989, 0x12C1CF12,
347 0x7E95BF7E, 0x207DBA20, 0x6411EA64, 0x840B7784, 0x6DC5396D, 0x6A89AF6A,
348 0xD17C33D1, 0xA171C9A1, 0xCEFF62CE, 0x37BB7137, 0xFB0F81FB, 0x3DB5793D,
349 0x51E10951, 0xDC3EADDC, 0x2D3F242D, 0xA476CDA4, 0x9D55F99D, 0xEE82D8EE,
350 0x8640E586, 0xAE78C5AE, 0xCD25B9CD, 0x04964D04, 0x55774455, 0x0A0E080A,
351 0x13508613, 0x30F7E730, 0xD337A1D3, 0x40FA1D40, 0x3461AA34, 0x8C4EED8C,
352 0xB3B006B3, 0x6C54706C, 0x2A73B22A, 0x523BD252, 0x0B9F410B, 0x8B027B8B,
353 0x88D8A088, 0x4FF3114F, 0x67CB3167, 0x4627C246, 0xC06727C0, 0xB4FC90B4,
354 0x28382028, 0x7F04F67F, 0x78486078, 0x2EE5FF2E, 0x074C9607, 0x4B655C4B,
355 0xC72BB1C7, 0x6F8EAB6F, 0x0D429E0D, 0xBBF59CBB, 0xF2DB52F2, 0xF34A1BF3,
356 0xA63D5FA6, 0x59A49359, 0xBCB90ABC, 0x3AF9EF3A, 0xEF1391EF, 0xFE0885FE,
357 0x01914901, 0x6116EE61, 0x7CDE2D7C, 0xB2214FB2, 0x42B18F42, 0xDB723BDB,
358 0xB82F47B8, 0x48BF8748, 0x2CAE6D2C, 0xE3C046E3, 0x573CD657, 0x859A3E85,
359 0x29A96929, 0x7D4F647D, 0x94812A94, 0x492ECE49, 0x17C6CB17, 0xCA692FCA,
360 0xC3BDFCC3, 0x5CA3975C, 0x5EE8055E, 0xD0ED7AD0, 0x87D1AC87, 0x8E057F8E,
361 0xBA64D5BA, 0xA8A51AA8, 0xB7264BB7, 0xB9BE0EB9, 0x6087A760, 0xF8D55AF8,
362 0x22362822, 0x111B1411, 0xDE753FDE, 0x79D92979, 0xAAEE88AA, 0x332D3C33,
363 0x5F794C5F, 0xB6B702B6, 0x96CAB896, 0x5835DA58, 0x9CC4B09C, 0xFC4317FC,
364 0x1A84551A, 0xF64D1FF6, 0x1C598A1C, 0x38B27D38, 0xAC3357AC, 0x18CFC718,
365 0xF4068DF4, 0x69537469, 0x749BB774, 0xF597C4F5, 0x56AD9F56, 0xDAE372DA,
366 0xD5EA7ED5, 0x4AF4154A, 0x9E8F229E, 0xA2AB12A2, 0x4E62584E, 0xE85F07E8,
367 0xE51D99E5, 0x39233439, 0xC1F66EC1, 0x446C5044, 0x5D32DE5D, 0x72466872,
368 0x26A06526, 0x93CDBC93, 0x03DADB03, 0xC6BAF8C6, 0xFA9EC8FA, 0x82D6A882,
369 0xCF6E2BCF, 0x50704050, 0xEB85DCEB, 0x750AFE75, 0x8A93328A, 0x8DDFA48D,
370 0x4C29CA4C, 0x141C1014, 0x73D72173, 0xCCB4F0CC, 0x09D4D309, 0x108A5D10,
371 0xE2510FE2, 0x00000000, 0x9A196F9A, 0xE01A9DE0, 0x8F94368F, 0xE6C742E6,
372 0xECC94AEC, 0xFDD25EFD, 0xAB7FC1AB, 0xD8A8E0D8}
373 };
374
375 /* The exp_to_poly and poly_to_exp tables are used to perform efficient
376 * operations in GF(2^8) represented as GF(2)[x]/w(x) where
377 * w(x)=x^8+x^6+x^3+x^2+1. We care about doing that because it's part of the
378 * definition of the RS matrix in the key schedule. Elements of that field
379 * are polynomials of degree not greater than 7 and all coefficients 0 or 1,
380 * which can be represented naturally by bytes (just substitute x=2). In that
381 * form, GF(2^8) addition is the same as bitwise XOR, but GF(2^8)
382 * multiplication is inefficient without hardware support. To multiply
383 * faster, I make use of the fact x is a generator for the nonzero elements,
384 * so that every element p of GF(2)[x]/w(x) is either 0 or equal to (x)^n for
385 * some n in 0..254. Note that that caret is exponentiation in GF(2^8),
386 * *not* polynomial notation. So if I want to compute pq where p and q are
387 * in GF(2^8), I can just say:
388 * 1. if p=0 or q=0 then pq=0
389 * 2. otherwise, find m and n such that p=x^m and q=x^n
390 * 3. pq=(x^m)(x^n)=x^(m+n), so add m and n and find pq
391 * The translations in steps 2 and 3 are looked up in the tables
392 * poly_to_exp (for step 2) and exp_to_poly (for step 3). To see this
393 * in action, look at the CALC_S macro. As additional wrinkles, note that
394 * one of my operands is always a constant, so the poly_to_exp lookup on it
395 * is done in advance; I included the original values in the comments so
396 * readers can have some chance of recognizing that this *is* the RS matrix
397 * from the Twofish paper. I've only included the table entries I actually
398 * need; I never do a lookup on a variable input of zero and the biggest
399 * exponents I'll ever see are 254 (variable) and 237 (constant), so they'll
400 * never sum to more than 491. I'm repeating part of the exp_to_poly table
401 * so that I don't have to do mod-255 reduction in the exponent arithmetic.
402 * Since I know my constant operands are never zero, I only have to worry
403 * about zero values in the variable operand, and I do it with a simple
404 * conditional branch. I know conditionals are expensive, but I couldn't
405 * see a non-horrible way of avoiding them, and I did manage to group the
406 * statements so that each if covers four group multiplications. */
407
408 static const u16 poly_to_exp[256] = {
409 492,
410 0x00, 0x01, 0x17, 0x02, 0x2E, 0x18, 0x53, 0x03, 0x6A, 0x2F, 0x93, 0x19,
411 0x34, 0x54, 0x45, 0x04, 0x5C, 0x6B, 0xB6, 0x30, 0xA6, 0x94, 0x4B, 0x1A,
412 0x8C, 0x35, 0x81, 0x55, 0xAA, 0x46, 0x0D, 0x05, 0x24, 0x5D, 0x87, 0x6C,
413 0x9B, 0xB7, 0xC1, 0x31, 0x2B, 0xA7, 0xA3, 0x95, 0x98, 0x4C, 0xCA, 0x1B,
414 0xE6, 0x8D, 0x73, 0x36, 0xCD, 0x82, 0x12, 0x56, 0x62, 0xAB, 0xF0, 0x47,
415 0x4F, 0x0E, 0xBD, 0x06, 0xD4, 0x25, 0xD2, 0x5E, 0x27, 0x88, 0x66, 0x6D,
416 0xD6, 0x9C, 0x79, 0xB8, 0x08, 0xC2, 0xDF, 0x32, 0x68, 0x2C, 0xFD, 0xA8,
417 0x8A, 0xA4, 0x5A, 0x96, 0x29, 0x99, 0x22, 0x4D, 0x60, 0xCB, 0xE4, 0x1C,
418 0x7B, 0xE7, 0x3B, 0x8E, 0x9E, 0x74, 0xF4, 0x37, 0xD8, 0xCE, 0xF9, 0x83,
419 0x6F, 0x13, 0xB2, 0x57, 0xE1, 0x63, 0xDC, 0xAC, 0xC4, 0xF1, 0xAF, 0x48,
420 0x0A, 0x50, 0x42, 0x0F, 0xBA, 0xBE, 0xC7, 0x07, 0xDE, 0xD5, 0x78, 0x26,
421 0x65, 0xD3, 0xD1, 0x5F, 0xE3, 0x28, 0x21, 0x89, 0x59, 0x67, 0xFC, 0x6E,
422 0xB1, 0xD7, 0xF8, 0x9D, 0xF3, 0x7A, 0x3A, 0xB9, 0xC6, 0x09, 0x41, 0xC3,
423 0xAE, 0xE0, 0xDB, 0x33, 0x44, 0x69, 0x92, 0x2D, 0x52, 0xFE, 0x16, 0xA9,
424 0x0C, 0x8B, 0x80, 0xA5, 0x4A, 0x5B, 0xB5, 0x97, 0xC9, 0x2A, 0xA2, 0x9A,
425 0xC0, 0x23, 0x86, 0x4E, 0xBC, 0x61, 0xEF, 0xCC, 0x11, 0xE5, 0x72, 0x1D,
426 0x3D, 0x7C, 0xEB, 0xE8, 0xE9, 0x3C, 0xEA, 0x8F, 0x7D, 0x9F, 0xEC, 0x75,
427 0x1E, 0xF5, 0x3E, 0x38, 0xF6, 0xD9, 0x3F, 0xCF, 0x76, 0xFA, 0x1F, 0x84,
428 0xA0, 0x70, 0xED, 0x14, 0x90, 0xB3, 0x7E, 0x58, 0xFB, 0xE2, 0x20, 0x64,
429 0xD0, 0xDD, 0x77, 0xAD, 0xDA, 0xC5, 0x40, 0xF2, 0x39, 0xB0, 0xF7, 0x49,
430 0xB4, 0x0B, 0x7F, 0x51, 0x15, 0x43, 0x91, 0x10, 0x71, 0xBB, 0xEE, 0xBF,
431 0x85, 0xC8, 0xA1
432 };
433
434 static const byte exp_to_poly[492 + 256] = {
435 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2,
436 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03,
437 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6,
438 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52, 0xA4, 0x05, 0x0A,
439 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0, 0xED, 0x97, 0x63,
440 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1, 0x0F, 0x1E, 0x3C,
441 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A, 0xF4, 0xA5, 0x07,
442 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11, 0x22, 0x44, 0x88,
443 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51, 0xA2, 0x09, 0x12,
444 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66, 0xCC, 0xD5, 0xE7,
445 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB, 0x1B, 0x36, 0x6C,
446 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19, 0x32, 0x64, 0xC8,
447 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D, 0x5A, 0xB4, 0x25,
448 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56, 0xAC, 0x15, 0x2A,
449 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE, 0x91, 0x6F, 0xDE,
450 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9, 0x3F, 0x7E, 0xFC,
451 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE, 0xB1, 0x2F, 0x5E,
452 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41, 0x82, 0x49, 0x92,
453 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E, 0x71, 0xE2, 0x89,
454 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB, 0xDB, 0xFB, 0xBB,
455 0x3B, 0x76, 0xEC, 0x95, 0x67, 0xCE, 0xD1, 0xEF, 0x93, 0x6B, 0xD6, 0xE1,
456 0x8F, 0x53, 0xA6, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D,
457 0x9A, 0x79, 0xF2, 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC,
458 0xF5, 0xA7, 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3,
459 0x8B, 0x5B, 0xB6, 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52,
460 0xA4, 0x05, 0x0A, 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0,
461 0xED, 0x97, 0x63, 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1,
462 0x0F, 0x1E, 0x3C, 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A,
463 0xF4, 0xA5, 0x07, 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11,
464 0x22, 0x44, 0x88, 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51,
465 0xA2, 0x09, 0x12, 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66,
466 0xCC, 0xD5, 0xE7, 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB,
467 0x1B, 0x36, 0x6C, 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19,
468 0x32, 0x64, 0xC8, 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D,
469 0x5A, 0xB4, 0x25, 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56,
470 0xAC, 0x15, 0x2A, 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE,
471 0x91, 0x6F, 0xDE, 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9,
472 0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE,
473 0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41,
474 0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E,
475 0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB,
476 };
477
478
479 /* The table constants are indices of
480 * S-box entries, preprocessed through q0 and q1. */
481 static byte calc_sb_tbl[512] = {
482 0xA9, 0x75, 0x67, 0xF3, 0xB3, 0xC6, 0xE8, 0xF4,
483 0x04, 0xDB, 0xFD, 0x7B, 0xA3, 0xFB, 0x76, 0xC8,
484 0x9A, 0x4A, 0x92, 0xD3, 0x80, 0xE6, 0x78, 0x6B,
485 0xE4, 0x45, 0xDD, 0x7D, 0xD1, 0xE8, 0x38, 0x4B,
486 0x0D, 0xD6, 0xC6, 0x32, 0x35, 0xD8, 0x98, 0xFD,
487 0x18, 0x37, 0xF7, 0x71, 0xEC, 0xF1, 0x6C, 0xE1,
488 0x43, 0x30, 0x75, 0x0F, 0x37, 0xF8, 0x26, 0x1B,
489 0xFA, 0x87, 0x13, 0xFA, 0x94, 0x06, 0x48, 0x3F,
490 0xF2, 0x5E, 0xD0, 0xBA, 0x8B, 0xAE, 0x30, 0x5B,
491 0x84, 0x8A, 0x54, 0x00, 0xDF, 0xBC, 0x23, 0x9D,
492 0x19, 0x6D, 0x5B, 0xC1, 0x3D, 0xB1, 0x59, 0x0E,
493 0xF3, 0x80, 0xAE, 0x5D, 0xA2, 0xD2, 0x82, 0xD5,
494 0x63, 0xA0, 0x01, 0x84, 0x83, 0x07, 0x2E, 0x14,
495 0xD9, 0xB5, 0x51, 0x90, 0x9B, 0x2C, 0x7C, 0xA3,
496 0xA6, 0xB2, 0xEB, 0x73, 0xA5, 0x4C, 0xBE, 0x54,
497 0x16, 0x92, 0x0C, 0x74, 0xE3, 0x36, 0x61, 0x51,
498 0xC0, 0x38, 0x8C, 0xB0, 0x3A, 0xBD, 0xF5, 0x5A,
499 0x73, 0xFC, 0x2C, 0x60, 0x25, 0x62, 0x0B, 0x96,
500 0xBB, 0x6C, 0x4E, 0x42, 0x89, 0xF7, 0x6B, 0x10,
501 0x53, 0x7C, 0x6A, 0x28, 0xB4, 0x27, 0xF1, 0x8C,
502 0xE1, 0x13, 0xE6, 0x95, 0xBD, 0x9C, 0x45, 0xC7,
503 0xE2, 0x24, 0xF4, 0x46, 0xB6, 0x3B, 0x66, 0x70,
504 0xCC, 0xCA, 0x95, 0xE3, 0x03, 0x85, 0x56, 0xCB,
505 0xD4, 0x11, 0x1C, 0xD0, 0x1E, 0x93, 0xD7, 0xB8,
506 0xFB, 0xA6, 0xC3, 0x83, 0x8E, 0x20, 0xB5, 0xFF,
507 0xE9, 0x9F, 0xCF, 0x77, 0xBF, 0xC3, 0xBA, 0xCC,
508 0xEA, 0x03, 0x77, 0x6F, 0x39, 0x08, 0xAF, 0xBF,
509 0x33, 0x40, 0xC9, 0xE7, 0x62, 0x2B, 0x71, 0xE2,
510 0x81, 0x79, 0x79, 0x0C, 0x09, 0xAA, 0xAD, 0x82,
511 0x24, 0x41, 0xCD, 0x3A, 0xF9, 0xEA, 0xD8, 0xB9,
512 0xE5, 0xE4, 0xC5, 0x9A, 0xB9, 0xA4, 0x4D, 0x97,
513 0x44, 0x7E, 0x08, 0xDA, 0x86, 0x7A, 0xE7, 0x17,
514 0xA1, 0x66, 0x1D, 0x94, 0xAA, 0xA1, 0xED, 0x1D,
515 0x06, 0x3D, 0x70, 0xF0, 0xB2, 0xDE, 0xD2, 0xB3,
516 0x41, 0x0B, 0x7B, 0x72, 0xA0, 0xA7, 0x11, 0x1C,
517 0x31, 0xEF, 0xC2, 0xD1, 0x27, 0x53, 0x90, 0x3E,
518 0x20, 0x8F, 0xF6, 0x33, 0x60, 0x26, 0xFF, 0x5F,
519 0x96, 0xEC, 0x5C, 0x76, 0xB1, 0x2A, 0xAB, 0x49,
520 0x9E, 0x81, 0x9C, 0x88, 0x52, 0xEE, 0x1B, 0x21,
521 0x5F, 0xC4, 0x93, 0x1A, 0x0A, 0xEB, 0xEF, 0xD9,
522 0x91, 0xC5, 0x85, 0x39, 0x49, 0x99, 0xEE, 0xCD,
523 0x2D, 0xAD, 0x4F, 0x31, 0x8F, 0x8B, 0x3B, 0x01,
524 0x47, 0x18, 0x87, 0x23, 0x6D, 0xDD, 0x46, 0x1F,
525 0xD6, 0x4E, 0x3E, 0x2D, 0x69, 0xF9, 0x64, 0x48,
526 0x2A, 0x4F, 0xCE, 0xF2, 0xCB, 0x65, 0x2F, 0x8E,
527 0xFC, 0x78, 0x97, 0x5C, 0x05, 0x58, 0x7A, 0x19,
528 0xAC, 0x8D, 0x7F, 0xE5, 0xD5, 0x98, 0x1A, 0x57,
529 0x4B, 0x67, 0x0E, 0x7F, 0xA7, 0x05, 0x5A, 0x64,
530 0x28, 0xAF, 0x14, 0x63, 0x3F, 0xB6, 0x29, 0xFE,
531 0x88, 0xF5, 0x3C, 0xB7, 0x4C, 0x3C, 0x02, 0xA5,
532 0xB8, 0xCE, 0xDA, 0xE9, 0xB0, 0x68, 0x17, 0x44,
533 0x55, 0xE0, 0x1F, 0x4D, 0x8A, 0x43, 0x7D, 0x69,
534 0x57, 0x29, 0xC7, 0x2E, 0x8D, 0xAC, 0x74, 0x15,
535 0xB7, 0x59, 0xC4, 0xA8, 0x9F, 0x0A, 0x72, 0x9E,
536 0x7E, 0x6E, 0x15, 0x47, 0x22, 0xDF, 0x12, 0x34,
537 0x58, 0x35, 0x07, 0x6A, 0x99, 0xCF, 0x34, 0xDC,
538 0x6E, 0x22, 0x50, 0xC9, 0xDE, 0xC0, 0x68, 0x9B,
539 0x65, 0x89, 0xBC, 0xD4, 0xDB, 0xED, 0xF8, 0xAB,
540 0xC8, 0x12, 0xA8, 0xA2, 0x2B, 0x0D, 0x40, 0x52,
541 0xDC, 0xBB, 0xFE, 0x02, 0x32, 0x2F, 0xA4, 0xA9,
542 0xCA, 0xD7, 0x10, 0x61, 0x21, 0x1E, 0xF0, 0xB4,
543 0xD3, 0x50, 0x5D, 0x04, 0x0F, 0xF6, 0x00, 0xC2,
544 0x6F, 0x16, 0x9D, 0x25, 0x36, 0x86, 0x42, 0x56,
545 0x4A, 0x55, 0x5E, 0x09, 0xC1, 0xBE, 0xE0, 0x91
546 };
547
/* Macro to perform one column of the RS matrix multiplication.  The
 * parameters a, b, c, and d are the four bytes of output; i is the index
 * of the key byte, and w, x, y, and z are the column of constants from
 * the RS matrix, preprocessed through the poly_to_exp table.
 *
 * The expansion reads `key' and overwrites the scratch variable `tmp';
 * both must be in scope where the macro is used.  tmp plus a constant is
 * used directly as an index into exp_to_poly, without any reduction here.
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon is exactly one statement, e.g. as an unbraced if branch that
 * has an else. */

#define CALC_S(a, b, c, d, i, w, x, y, z) \
  do \
    { \
      tmp = poly_to_exp[key[(i)]]; \
      (a) ^= exp_to_poly[tmp + (w)]; \
      (b) ^= exp_to_poly[tmp + (x)]; \
      (c) ^= exp_to_poly[tmp + (y)]; \
      (d) ^= exp_to_poly[tmp + (z)]; \
    } \
  while (0)
561
/* Macros to calculate the key-dependent S-boxes for a 128-bit key using
 * the S vector from CALC_S.  CALC_SB_2 computes a single entry in all
 * four S-boxes, where i is the index of the entry to compute, and a and b
 * are the index numbers preprocessed through the q0 and q1 tables
 * respectively.  CALC_SB is simply a convenience to make the code shorter;
 * it calls CALC_SB_2 four times with consecutive indices from i to i+3,
 * using the remaining parameters two by two.
 *
 * Both macros read the S-vector bytes sa..sh from the enclosing scope and
 * write ctx->s.  Each expands to one do { } while (0) statement, so it
 * can be used as the unbraced body of an if, else or loop.  i, a and b
 * appear more than once in the expansion; do not pass expressions with
 * side effects. */

#define CALC_SB_2(i, a, b) \
  do \
    { \
      ctx->s[0][(i)] = mds[0][q0[(a) ^ sa] ^ se]; \
      ctx->s[1][(i)] = mds[1][q0[(b) ^ sb] ^ sf]; \
      ctx->s[2][(i)] = mds[2][q1[(a) ^ sc] ^ sg]; \
      ctx->s[3][(i)] = mds[3][q1[(b) ^ sd] ^ sh]; \
    } \
  while (0)

#define CALC_SB(i, a, b, c, d, e, f, g, h) \
  do \
    { \
      CALC_SB_2 (i, a, b); CALC_SB_2 ((i)+1, c, d); \
      CALC_SB_2 ((i)+2, e, f); CALC_SB_2 ((i)+3, g, h); \
    } \
  while (0)
579
/* Macros exactly like CALC_SB and CALC_SB_2, but for 256-bit keys: two
 * extra q-table lookup-and-XOR stages are applied, using the additional
 * S-vector bytes si..sp from the enclosing scope.  Note that, unlike in
 * CALC_SB_2, the first stage for S-boxes 0 and 3 starts from b and the
 * first stage for S-boxes 1 and 2 starts from a.
 *
 * Each macro expands to one do { } while (0) statement without a trailing
 * semicolon, matching CALC_SB_2.  i, a and b appear more than once in the
 * expansion; do not pass expressions with side effects. */

#define CALC_SB256_2(i, a, b) \
  do \
    { \
      ctx->s[0][(i)] = mds[0][q0[q0[q1[(b) ^ sa] ^ se] ^ si] ^ sm]; \
      ctx->s[1][(i)] = mds[1][q0[q1[q1[(a) ^ sb] ^ sf] ^ sj] ^ sn]; \
      ctx->s[2][(i)] = mds[2][q1[q0[q0[(a) ^ sc] ^ sg] ^ sk] ^ so]; \
      ctx->s[3][(i)] = mds[3][q1[q1[q0[(b) ^ sd] ^ sh] ^ sl] ^ sp]; \
    } \
  while (0)

#define CALC_SB256(i, a, b, c, d, e, f, g, h) \
  do \
    { \
      CALC_SB256_2 (i, a, b); CALC_SB256_2 ((i)+1, c, d); \
      CALC_SB256_2 ((i)+2, e, f); CALC_SB256_2 ((i)+3, g, h); \
    } \
  while (0)
591
/* Macros to calculate the whitening and round subkeys.  CALC_K_2 computes the
 * last two stages of the h() function for a given index (either 2i or 2i+1).
 * a, b, c, and d are the four bytes going into the last two stages.  For
 * 128-bit keys, this is the entire h() function and a and c are the index
 * preprocessed through q0 and q1 respectively; for longer keys they are the
 * output of previous stages.  j is the index of the first key byte to use.
 * CALC_K computes a pair of subkeys for 128-bit Twofish, by calling CALC_K_2
 * twice, doing the Pseudo-Hadamard Transform, and doing the necessary
 * rotations.  Its parameters are: a, the array to write the results into,
 * j, the index of the first output entry, k and l, the preprocessed indices
 * for index 2i, and m and n, the preprocessed indices for index 2i+1.
 * CALC_K256_2 expands CALC_K_2 to handle 256-bit keys, by doing two
 * additional lookup-and-XOR stages.  The parameters a and b are the index
 * preprocessed through q0 and q1 respectively; j is the index of the first
 * key byte to use.  CALC_K256 is identical to CALC_K but for using the
 * CALC_K256_2 macro instead of CALC_K_2.
 *
 * CALC_K_2 and CALC_K256_2 expand to a single parenthesized expression and
 * parenthesize their arguments, so operators in an argument or around the
 * macro call cannot re-associate with the XOR chain.  CALC_K and CALC_K256
 * expand to one do { } while (0) statement; they use the variables x and y
 * of the enclosing scope as scratch and write ctx-><a>[j] and
 * ctx-><a>[j+1], where <a> is the member name passed as first argument.
 * All of these macros read `key' from the enclosing scope. */

#define CALC_K_2(a, b, c, d, j) \
  (  mds[0][q0[(a) ^ key[(j) + 8]] ^ key[(j)]] \
   ^ mds[1][q0[(b) ^ key[(j) + 9]] ^ key[(j) + 1]] \
   ^ mds[2][q1[(c) ^ key[(j) + 10]] ^ key[(j) + 2]] \
   ^ mds[3][q1[(d) ^ key[(j) + 11]] ^ key[(j) + 3]])

#define CALC_K(a, j, k, l, m, n) \
  do \
    { \
      x = CALC_K_2 (k, l, k, l, 0); \
      y = CALC_K_2 (m, n, m, n, 4); \
      y = (y << 8) + (y >> 24); \
      x += y; y += x; ctx->a[(j)] = x; \
      ctx->a[(j) + 1] = (y << 9) + (y >> 23); \
    } \
  while (0)

#define CALC_K256_2(a, b, j) \
  CALC_K_2 (q0[q1[(b) ^ key[(j) + 24]] ^ key[(j) + 16]], \
            q1[q1[(a) ^ key[(j) + 25]] ^ key[(j) + 17]], \
            q0[q0[(a) ^ key[(j) + 26]] ^ key[(j) + 18]], \
            q1[q0[(b) ^ key[(j) + 27]] ^ key[(j) + 19]], j)

#define CALC_K256(a, j, k, l, m, n) \
  do \
    { \
      x = CALC_K256_2 (k, l, 0); \
      y = CALC_K256_2 (m, n, 4); \
      y = (y << 8) + (y >> 24); \
      x += y; y += x; ctx->a[(j)] = x; \
      ctx->a[(j) + 1] = (y << 9) + (y >> 23); \
    } \
  while (0)
634
635
636
637 /* Perform the key setup. Note that this works only with 128- and 256-bit
638 * keys, despite the API that looks like it might support other sizes. */
639
640 static gcry_err_code_t
do_twofish_setkey(TWOFISH_context * ctx,const byte * key,const unsigned keylen)641 do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
642 {
643 int i, j, k;
644
645 /* Temporaries for CALC_K. */
646 u32 x, y;
647
648 /* The S vector used to key the S-boxes, split up into individual bytes.
649 * 128-bit keys use only sa through sh; 256-bit use all of them. */
650 byte sa = 0, sb = 0, sc = 0, sd = 0, se = 0, sf = 0, sg = 0, sh = 0;
651 byte si = 0, sj = 0, sk = 0, sl = 0, sm = 0, sn = 0, so = 0, sp = 0;
652
653 /* Temporary for CALC_S. */
654 unsigned int tmp;
655
656 /* Flags for self-test. */
657 static int initialized = 0;
658 static const char *selftest_failed=0;
659
660 /* Check key length. */
661 if( ( ( keylen - 16 ) | 16 ) != 16 )
662 return GPG_ERR_INV_KEYLEN;
663
664 /* Do self-test if necessary. */
665 if (!initialized)
666 {
667 initialized = 1;
668 selftest_failed = selftest ();
669 if( selftest_failed )
670 log_error("%s\n", selftest_failed );
671 }
672 if( selftest_failed )
673 return GPG_ERR_SELFTEST_FAILED;
674
675 /* Compute the first two words of the S vector. The magic numbers are
676 * the entries of the RS matrix, preprocessed through poly_to_exp. The
677 * numbers in the comments are the original (polynomial form) matrix
678 * entries. */
679 CALC_S (sa, sb, sc, sd, 0, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
680 CALC_S (sa, sb, sc, sd, 1, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
681 CALC_S (sa, sb, sc, sd, 2, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
682 CALC_S (sa, sb, sc, sd, 3, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
683 CALC_S (sa, sb, sc, sd, 4, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
684 CALC_S (sa, sb, sc, sd, 5, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
685 CALC_S (sa, sb, sc, sd, 6, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
686 CALC_S (sa, sb, sc, sd, 7, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
687 CALC_S (se, sf, sg, sh, 8, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
688 CALC_S (se, sf, sg, sh, 9, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
689 CALC_S (se, sf, sg, sh, 10, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
690 CALC_S (se, sf, sg, sh, 11, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
691 CALC_S (se, sf, sg, sh, 12, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
692 CALC_S (se, sf, sg, sh, 13, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
693 CALC_S (se, sf, sg, sh, 14, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
694 CALC_S (se, sf, sg, sh, 15, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
695
696 if (keylen == 32) /* 256-bit key */
697 {
698 /* Calculate the remaining two words of the S vector */
699 CALC_S (si, sj, sk, sl, 16, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
700 CALC_S (si, sj, sk, sl, 17, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
701 CALC_S (si, sj, sk, sl, 18, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
702 CALC_S (si, sj, sk, sl, 19, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
703 CALC_S (si, sj, sk, sl, 20, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
704 CALC_S (si, sj, sk, sl, 21, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
705 CALC_S (si, sj, sk, sl, 22, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
706 CALC_S (si, sj, sk, sl, 23, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
707 CALC_S (sm, sn, so, sp, 24, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
708 CALC_S (sm, sn, so, sp, 25, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
709 CALC_S (sm, sn, so, sp, 26, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
710 CALC_S (sm, sn, so, sp, 27, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
711 CALC_S (sm, sn, so, sp, 28, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
712 CALC_S (sm, sn, so, sp, 29, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
713 CALC_S (sm, sn, so, sp, 30, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
714 CALC_S (sm, sn, so, sp, 31, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
715
716 /* Compute the S-boxes. */
717 for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
718 {
719 CALC_SB256_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
720 }
721
722 /* Calculate whitening and round subkeys. */
723 for (i = 0; i < 8; i += 2)
724 {
725 CALC_K256 ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
726 }
727 for (j = 0; j < 32; j += 2, i += 2)
728 {
729 CALC_K256 ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
730 }
731 }
732 else
733 {
734 /* Compute the S-boxes. */
735 for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
736 {
737 CALC_SB_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
738 }
739
740 /* Calculate whitening and round subkeys. */
741 for (i = 0; i < 8; i += 2)
742 {
743 CALC_K ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
744 }
745 for (j = 0; j < 32; j += 2, i += 2)
746 {
747 CALC_K ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
748 }
749 }
750
751 return 0;
752 }
753
754 static gcry_err_code_t
twofish_setkey(void * context,const byte * key,unsigned int keylen,cipher_bulk_ops_t * bulk_ops)755 twofish_setkey (void *context, const byte *key, unsigned int keylen,
756 cipher_bulk_ops_t *bulk_ops)
757 {
758 TWOFISH_context *ctx = context;
759 unsigned int hwfeatures = _gcry_get_hw_features ();
760 int rc;
761
762 rc = do_twofish_setkey (ctx, key, keylen);
763
764 #ifdef USE_AVX2
765 ctx->use_avx2 = 0;
766 if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
767 {
768 ctx->use_avx2 = 1;
769 }
770 #endif
771
772 /* Setup bulk encryption routines. */
773 memset (bulk_ops, 0, sizeof(*bulk_ops));
774 bulk_ops->cbc_dec = _gcry_twofish_cbc_dec;
775 bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
776 bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
777 bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
778 bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
779
780 (void)hwfeatures;
781
782 _gcry_burn_stack (23+6*sizeof(void*));
783 return rc;
784 }
785
786
787 #ifdef USE_AVX2
788 /* Assembler implementations of Twofish using AVX2. Process 16 block in
789 parallel.
790 */
791 extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
792 unsigned char *out,
793 const unsigned char *in,
794 unsigned char *ctr) ASM_FUNC_ABI;
795
796 extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
797 unsigned char *out,
798 const unsigned char *in,
799 unsigned char *iv) ASM_FUNC_ABI;
800
801 extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
802 unsigned char *out,
803 const unsigned char *in,
804 unsigned char *iv) ASM_FUNC_ABI;
805
806 extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
807 unsigned char *out,
808 const unsigned char *in,
809 unsigned char *offset,
810 unsigned char *checksum,
811 const u64 Ls[16]) ASM_FUNC_ABI;
812
813 extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
814 unsigned char *out,
815 const unsigned char *in,
816 unsigned char *offset,
817 unsigned char *checksum,
818 const u64 Ls[16]) ASM_FUNC_ABI;
819
820 extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
821 const unsigned char *abuf,
822 unsigned char *offset,
823 unsigned char *checksum,
824 const u64 Ls[16]) ASM_FUNC_ABI;
825 #endif
826
827
828 #ifdef USE_AMD64_ASM
829
830 /* Assembly implementations of Twofish. */
831 extern void _gcry_twofish_amd64_encrypt_block(const TWOFISH_context *c,
832 byte *out, const byte *in);
833
834 extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
835 byte *out, const byte *in);
836
837 /* These assembly implementations process three blocks in parallel. */
838 extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
839 const byte *in, byte *ctr);
840
841 extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
842 const byte *in, byte *iv);
843
844 extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
845 const byte *in, byte *iv);
846
847 extern void _gcry_twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out,
848 const byte *in, byte *offset,
849 byte *checksum, const u64 Ls[3]);
850
851 extern void _gcry_twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out,
852 const byte *in, byte *offset,
853 byte *checksum, const u64 Ls[3]);
854
855 extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx,
856 const byte *abuf, byte *offset,
857 byte *checksum, const u64 Ls[3]);
858
859 static inline void
twofish_amd64_encrypt_block(const TWOFISH_context * c,byte * out,const byte * in)860 twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
861 {
862 _gcry_twofish_amd64_encrypt_block(c, out, in);
863 }
864
865 static inline void
twofish_amd64_decrypt_block(const TWOFISH_context * c,byte * out,const byte * in)866 twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
867 {
868 _gcry_twofish_amd64_decrypt_block(c, out, in);
869 }
870
871 static inline void
twofish_amd64_ctr_enc(const TWOFISH_context * c,byte * out,const byte * in,byte * ctr)872 twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in,
873 byte *ctr)
874 {
875 _gcry_twofish_amd64_ctr_enc(c, out, in, ctr);
876 }
877
878 static inline void
twofish_amd64_cbc_dec(const TWOFISH_context * c,byte * out,const byte * in,byte * iv)879 twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in,
880 byte *iv)
881 {
882 _gcry_twofish_amd64_cbc_dec(c, out, in, iv);
883 }
884
885 static inline void
twofish_amd64_cfb_dec(const TWOFISH_context * c,byte * out,const byte * in,byte * iv)886 twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in,
887 byte *iv)
888 {
889 _gcry_twofish_amd64_cfb_dec(c, out, in, iv);
890 }
891
892 static inline void
twofish_amd64_ocb_enc(const TWOFISH_context * ctx,byte * out,const byte * in,byte * offset,byte * checksum,const u64 Ls[3])893 twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in,
894 byte *offset, byte *checksum, const u64 Ls[3])
895 {
896 _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls);
897 }
898
899 static inline void
twofish_amd64_ocb_dec(const TWOFISH_context * ctx,byte * out,const byte * in,byte * offset,byte * checksum,const u64 Ls[3])900 twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in,
901 byte *offset, byte *checksum, const u64 Ls[3])
902 {
903 _gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls);
904 }
905
906 static inline void
twofish_amd64_ocb_auth(const TWOFISH_context * ctx,const byte * abuf,byte * offset,byte * checksum,const u64 Ls[3])907 twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf,
908 byte *offset, byte *checksum, const u64 Ls[3])
909 {
910 _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls);
911 }
912
913 #elif defined(USE_ARM_ASM)
914
915 /* Assembly implementations of Twofish. */
916 extern void _gcry_twofish_arm_encrypt_block(const TWOFISH_context *c,
917 byte *out, const byte *in);
918
919 extern void _gcry_twofish_arm_decrypt_block(const TWOFISH_context *c,
920 byte *out, const byte *in);
921
922 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
923
/* Macros to compute the g() function in the encryption and decryption
 * rounds.  G1 is the straight g() function; G2 includes the 8-bit
 * rotation for the high 32-bit word.  Each expands to one fully
 * parenthesized expression that reads ctx->s; the argument is evaluated
 * four times, so it must not have side effects. */

#define G1(a) \
  (  (ctx->s[0][(a) & 0xFF]) ^ (ctx->s[1][((a) >> 8) & 0xFF]) \
   ^ (ctx->s[2][((a) >> 16) & 0xFF]) ^ (ctx->s[3][(a) >> 24]))

#define G2(b) \
  (  (ctx->s[1][(b) & 0xFF]) ^ (ctx->s[2][((b) >> 8) & 0xFF]) \
   ^ (ctx->s[3][((b) >> 16) & 0xFF]) ^ (ctx->s[0][(b) >> 24]))

/* Encryption and decryption Feistel rounds.  Each one calls the two g()
 * macros, does the PHT, and performs the XOR and the appropriate bit
 * rotations.  The parameters are the round number (used to select subkeys),
 * and the four 32-bit chunks of the text.  The variables x and y of the
 * enclosing scope are used as scratch.  Each macro expands to a single
 * do { } while (0) statement. */

#define ENCROUND(n, a, b, c, d) \
  do \
    { \
      x = G1 (a); y = G2 (b); \
      x += y; y += x + ctx->k[2 * (n) + 1]; \
      (c) ^= x + ctx->k[2 * (n)]; \
      (c) = ((c) >> 1) + ((c) << 31); \
      (d) = (((d) << 1)+((d) >> 31)) ^ y; \
    } \
  while (0)

#define DECROUND(n, a, b, c, d) \
  do \
    { \
      x = G1 (a); y = G2 (b); \
      x += y; y += x; \
      (d) ^= y + ctx->k[2 * (n) + 1]; \
      (d) = ((d) >> 1) + ((d) << 31); \
      (c) = (((c) << 1)+((c) >> 31)); \
      (c) ^= (x + ctx->k[2 * (n)]); \
    } \
  while (0)

/* Encryption and decryption cycles; each one is simply two Feistel rounds
 * with the 32-bit chunks re-ordered to simulate the "swap".  They operate
 * on the variables a, b, c and d of the enclosing scope. */

#define ENCCYCLE(n) \
  do \
    { \
      ENCROUND (2 * (n), a, b, c, d); \
      ENCROUND (2 * (n) + 1, c, d, a, b); \
    } \
  while (0)

#define DECCYCLE(n) \
  do \
    { \
      DECROUND (2 * (n) + 1, c, d, a, b); \
      DECROUND (2 * (n), a, b, c, d); \
    } \
  while (0)

/* Macros to convert the input and output bytes into 32-bit words,
 * and simultaneously perform the whitening step.  INPACK packs word
 * number n into the variable named by x, using whitening subkey number m.
 * OUTUNPACK unpacks word number n from the variable named by x, using
 * whitening subkey number m.  They read from `in' and write to `out' of
 * the enclosing scope; note that OUTUNPACK also leaves the whitened value
 * in x. */

#define INPACK(n, x, m) \
  do \
    { \
      (x) = buf_get_le32(in + (n) * 4); \
      (x) ^= ctx->w[(m)]; \
    } \
  while (0)

#define OUTUNPACK(n, x, m) \
  do \
    { \
      (x) ^= ctx->w[(m)]; \
      buf_put_le32(out + (n) * 4, (x)); \
    } \
  while (0)
980
981 #endif /*!USE_AMD64_ASM*/
982
983
984 /* Encrypt one block. in and out may be the same. */
985
986 #ifdef USE_AMD64_ASM
987
988 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)989 twofish_encrypt (void *context, byte *out, const byte *in)
990 {
991 TWOFISH_context *ctx = context;
992 twofish_amd64_encrypt_block(ctx, out, in);
993 return /*burn_stack*/ (4*sizeof (void*));
994 }
995
996 #elif defined(USE_ARM_ASM)
997
998 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)999 twofish_encrypt (void *context, byte *out, const byte *in)
1000 {
1001 TWOFISH_context *ctx = context;
1002 _gcry_twofish_arm_encrypt_block(ctx, out, in);
1003 return /*burn_stack*/ (4*sizeof (void*));
1004 }
1005
1006 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1007
1008 static void
do_twofish_encrypt(const TWOFISH_context * ctx,byte * out,const byte * in)1009 do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
1010 {
1011 /* The four 32-bit chunks of the text. */
1012 u32 a, b, c, d;
1013
1014 /* Temporaries used by the round function. */
1015 u32 x, y;
1016
1017 /* Input whitening and packing. */
1018 INPACK (0, a, 0);
1019 INPACK (1, b, 1);
1020 INPACK (2, c, 2);
1021 INPACK (3, d, 3);
1022
1023 /* Encryption Feistel cycles. */
1024 ENCCYCLE (0);
1025 ENCCYCLE (1);
1026 ENCCYCLE (2);
1027 ENCCYCLE (3);
1028 ENCCYCLE (4);
1029 ENCCYCLE (5);
1030 ENCCYCLE (6);
1031 ENCCYCLE (7);
1032
1033 /* Output whitening and unpacking. */
1034 OUTUNPACK (0, c, 4);
1035 OUTUNPACK (1, d, 5);
1036 OUTUNPACK (2, a, 6);
1037 OUTUNPACK (3, b, 7);
1038 }
1039
1040 static unsigned int
twofish_encrypt(void * context,byte * out,const byte * in)1041 twofish_encrypt (void *context, byte *out, const byte *in)
1042 {
1043 TWOFISH_context *ctx = context;
1044 do_twofish_encrypt (ctx, out, in);
1045 return /*burn_stack*/ (24+3*sizeof (void*));
1046 }
1047
1048 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1049
1050
1051 /* Decrypt one block. in and out may be the same. */
1052
1053 #ifdef USE_AMD64_ASM
1054
1055 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1056 twofish_decrypt (void *context, byte *out, const byte *in)
1057 {
1058 TWOFISH_context *ctx = context;
1059 twofish_amd64_decrypt_block(ctx, out, in);
1060 return /*burn_stack*/ (4*sizeof (void*));
1061 }
1062
1063 #elif defined(USE_ARM_ASM)
1064
1065 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1066 twofish_decrypt (void *context, byte *out, const byte *in)
1067 {
1068 TWOFISH_context *ctx = context;
1069 _gcry_twofish_arm_decrypt_block(ctx, out, in);
1070 return /*burn_stack*/ (4*sizeof (void*));
1071 }
1072
1073 #else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1074
1075 static void
do_twofish_decrypt(const TWOFISH_context * ctx,byte * out,const byte * in)1076 do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
1077 {
1078 /* The four 32-bit chunks of the text. */
1079 u32 a, b, c, d;
1080
1081 /* Temporaries used by the round function. */
1082 u32 x, y;
1083
1084 /* Input whitening and packing. */
1085 INPACK (0, c, 4);
1086 INPACK (1, d, 5);
1087 INPACK (2, a, 6);
1088 INPACK (3, b, 7);
1089
1090 /* Encryption Feistel cycles. */
1091 DECCYCLE (7);
1092 DECCYCLE (6);
1093 DECCYCLE (5);
1094 DECCYCLE (4);
1095 DECCYCLE (3);
1096 DECCYCLE (2);
1097 DECCYCLE (1);
1098 DECCYCLE (0);
1099
1100 /* Output whitening and unpacking. */
1101 OUTUNPACK (0, a, 0);
1102 OUTUNPACK (1, b, 1);
1103 OUTUNPACK (2, c, 2);
1104 OUTUNPACK (3, d, 3);
1105 }
1106
1107 static unsigned int
twofish_decrypt(void * context,byte * out,const byte * in)1108 twofish_decrypt (void *context, byte *out, const byte *in)
1109 {
1110 TWOFISH_context *ctx = context;
1111
1112 do_twofish_decrypt (ctx, out, in);
1113 return /*burn_stack*/ (24+3*sizeof (void*));
1114 }
1115
1116 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
1117
1118
1119
1120 /* Bulk encryption of complete blocks in CTR mode. This function is only
1121 intended for the bulk encryption feature of cipher.c. CTR is expected to be
1122 of size TWOFISH_BLOCKSIZE. */
1123 static void
_gcry_twofish_ctr_enc(void * context,unsigned char * ctr,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1124 _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
1125 const void *inbuf_arg, size_t nblocks)
1126 {
1127 TWOFISH_context *ctx = context;
1128 unsigned char *outbuf = outbuf_arg;
1129 const unsigned char *inbuf = inbuf_arg;
1130 unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
1131 unsigned int burn, burn_stack_depth = 0;
1132
1133 #ifdef USE_AVX2
1134 if (ctx->use_avx2)
1135 {
1136 int did_use_avx2 = 0;
1137
1138 /* Process data in 16 block chunks. */
1139 while (nblocks >= 16)
1140 {
1141 _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
1142
1143 nblocks -= 16;
1144 outbuf += 16 * TWOFISH_BLOCKSIZE;
1145 inbuf += 16 * TWOFISH_BLOCKSIZE;
1146 did_use_avx2 = 1;
1147 }
1148
1149 if (did_use_avx2)
1150 {
1151 /* twofish-avx2 assembly code does not use stack */
1152 if (nblocks == 0)
1153 burn_stack_depth = 0;
1154 }
1155 }
1156 #endif
1157
1158 #ifdef USE_AMD64_ASM
1159 {
1160 /* Process data in 3 block chunks. */
1161 while (nblocks >= 3)
1162 {
1163 twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
1164
1165 nblocks -= 3;
1166 outbuf += 3 * TWOFISH_BLOCKSIZE;
1167 inbuf += 3 * TWOFISH_BLOCKSIZE;
1168
1169 burn = 8 * sizeof(void*);
1170 if (burn > burn_stack_depth)
1171 burn_stack_depth = burn;
1172 }
1173
1174 /* Use generic code to handle smaller chunks... */
1175 /* TODO: use caching instead? */
1176 }
1177 #endif
1178
1179 for ( ;nblocks; nblocks-- )
1180 {
1181 /* Encrypt the counter. */
1182 burn = twofish_encrypt(ctx, tmpbuf, ctr);
1183 if (burn > burn_stack_depth)
1184 burn_stack_depth = burn;
1185
1186 /* XOR the input with the encrypted counter and store in output. */
1187 cipher_block_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
1188 outbuf += TWOFISH_BLOCKSIZE;
1189 inbuf += TWOFISH_BLOCKSIZE;
1190 /* Increment the counter. */
1191 cipher_block_add(ctr, 1, TWOFISH_BLOCKSIZE);
1192 }
1193
1194 wipememory(tmpbuf, sizeof(tmpbuf));
1195 _gcry_burn_stack(burn_stack_depth);
1196 }
1197
1198
1199 /* Bulk decryption of complete blocks in CBC mode. This function is only
1200 intended for the bulk encryption feature of cipher.c. */
1201 static void
_gcry_twofish_cbc_dec(void * context,unsigned char * iv,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1202 _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
1203 const void *inbuf_arg, size_t nblocks)
1204 {
1205 TWOFISH_context *ctx = context;
1206 unsigned char *outbuf = outbuf_arg;
1207 const unsigned char *inbuf = inbuf_arg;
1208 unsigned char savebuf[TWOFISH_BLOCKSIZE];
1209 unsigned int burn, burn_stack_depth = 0;
1210
1211 #ifdef USE_AVX2
1212 if (ctx->use_avx2)
1213 {
1214 int did_use_avx2 = 0;
1215
1216 /* Process data in 16 block chunks. */
1217 while (nblocks >= 16)
1218 {
1219 _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
1220
1221 nblocks -= 16;
1222 outbuf += 16 * TWOFISH_BLOCKSIZE;
1223 inbuf += 16 * TWOFISH_BLOCKSIZE;
1224 did_use_avx2 = 1;
1225 }
1226
1227 if (did_use_avx2)
1228 {
1229 /* twofish-avx2 assembly code does not use stack */
1230 if (nblocks == 0)
1231 burn_stack_depth = 0;
1232 }
1233 }
1234 #endif
1235
1236 #ifdef USE_AMD64_ASM
1237 {
1238 /* Process data in 3 block chunks. */
1239 while (nblocks >= 3)
1240 {
1241 twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
1242
1243 nblocks -= 3;
1244 outbuf += 3 * TWOFISH_BLOCKSIZE;
1245 inbuf += 3 * TWOFISH_BLOCKSIZE;
1246
1247 burn = 9 * sizeof(void*);
1248 if (burn > burn_stack_depth)
1249 burn_stack_depth = burn;
1250 }
1251
1252 /* Use generic code to handle smaller chunks... */
1253 }
1254 #endif
1255
1256 for ( ;nblocks; nblocks-- )
1257 {
1258 /* INBUF is needed later and it may be identical to OUTBUF, so store
1259 the intermediate result to SAVEBUF. */
1260 burn = twofish_decrypt (ctx, savebuf, inbuf);
1261 if (burn > burn_stack_depth)
1262 burn_stack_depth = burn;
1263
1264 cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, TWOFISH_BLOCKSIZE);
1265 inbuf += TWOFISH_BLOCKSIZE;
1266 outbuf += TWOFISH_BLOCKSIZE;
1267 }
1268
1269 wipememory(savebuf, sizeof(savebuf));
1270 _gcry_burn_stack(burn_stack_depth);
1271 }
1272
1273
1274 /* Bulk decryption of complete blocks in CFB mode. This function is only
1275 intended for the bulk encryption feature of cipher.c. */
1276 static void
_gcry_twofish_cfb_dec(void * context,unsigned char * iv,void * outbuf_arg,const void * inbuf_arg,size_t nblocks)1277 _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
1278 const void *inbuf_arg, size_t nblocks)
1279 {
1280 TWOFISH_context *ctx = context;
1281 unsigned char *outbuf = outbuf_arg;
1282 const unsigned char *inbuf = inbuf_arg;
1283 unsigned int burn, burn_stack_depth = 0;
1284
1285 #ifdef USE_AVX2
1286 if (ctx->use_avx2)
1287 {
1288 int did_use_avx2 = 0;
1289
1290 /* Process data in 16 block chunks. */
1291 while (nblocks >= 16)
1292 {
1293 _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
1294
1295 nblocks -= 16;
1296 outbuf += 16 * TWOFISH_BLOCKSIZE;
1297 inbuf += 16 * TWOFISH_BLOCKSIZE;
1298 did_use_avx2 = 1;
1299 }
1300
1301 if (did_use_avx2)
1302 {
1303 /* twofish-avx2 assembly code does not use stack */
1304 if (nblocks == 0)
1305 burn_stack_depth = 0;
1306 }
1307 }
1308 #endif
1309
1310 #ifdef USE_AMD64_ASM
1311 {
1312 /* Process data in 3 block chunks. */
1313 while (nblocks >= 3)
1314 {
1315 twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
1316
1317 nblocks -= 3;
1318 outbuf += 3 * TWOFISH_BLOCKSIZE;
1319 inbuf += 3 * TWOFISH_BLOCKSIZE;
1320
1321 burn = 8 * sizeof(void*);
1322 if (burn > burn_stack_depth)
1323 burn_stack_depth = burn;
1324 }
1325
1326 /* Use generic code to handle smaller chunks... */
1327 }
1328 #endif
1329
1330 for ( ;nblocks; nblocks-- )
1331 {
1332 burn = twofish_encrypt(ctx, iv, iv);
1333 if (burn > burn_stack_depth)
1334 burn_stack_depth = burn;
1335
1336 cipher_block_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
1337 outbuf += TWOFISH_BLOCKSIZE;
1338 inbuf += TWOFISH_BLOCKSIZE;
1339 }
1340
1341 _gcry_burn_stack(burn_stack_depth);
1342 }
1343
1344 /* Bulk encryption/decryption of complete blocks in OCB mode. */
1345 static size_t
_gcry_twofish_ocb_crypt(gcry_cipher_hd_t c,void * outbuf_arg,const void * inbuf_arg,size_t nblocks,int encrypt)1346 _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
1347 const void *inbuf_arg, size_t nblocks, int encrypt)
1348 {
1349 #ifdef USE_AMD64_ASM
1350 TWOFISH_context *ctx = (void *)&c->context.c;
1351 unsigned char *outbuf = outbuf_arg;
1352 const unsigned char *inbuf = inbuf_arg;
1353 unsigned int burn, burn_stack_depth = 0;
1354 u64 blkn = c->u_mode.ocb.data_nblocks;
1355
1356 #ifdef USE_AVX2
1357 if (ctx->use_avx2)
1358 {
1359 int did_use_avx2 = 0;
1360 u64 Ls[16];
1361 unsigned int n = 16 - (blkn % 16);
1362 u64 *l;
1363 int i;
1364
1365 if (nblocks >= 16)
1366 {
1367 for (i = 0; i < 16; i += 8)
1368 {
1369 /* Use u64 to store pointers for x32 support (assembly function
1370 * assumes 64-bit pointers). */
1371 Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1372 Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1373 Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1374 Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
1375 Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1376 Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1377 Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1378 }
1379
1380 Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
1381 l = &Ls[(15 + n) % 16];
1382
1383 /* Process data in 16 block chunks. */
1384 while (nblocks >= 16)
1385 {
1386 blkn += 16;
1387 *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
1388
1389 if (encrypt)
1390 _gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
1391 c->u_ctr.ctr, Ls);
1392 else
1393 _gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
1394 c->u_ctr.ctr, Ls);
1395
1396 nblocks -= 16;
1397 outbuf += 16 * TWOFISH_BLOCKSIZE;
1398 inbuf += 16 * TWOFISH_BLOCKSIZE;
1399 did_use_avx2 = 1;
1400 }
1401 }
1402
1403 if (did_use_avx2)
1404 {
1405 /* twofish-avx2 assembly code does not use stack */
1406 if (nblocks == 0)
1407 burn_stack_depth = 0;
1408 }
1409 }
1410 #endif
1411
1412 {
1413 /* Use u64 to store pointers for x32 support (assembly function
1414 * assumes 64-bit pointers). */
1415 u64 Ls[3];
1416
1417 /* Process data in 3 block chunks. */
1418 while (nblocks >= 3)
1419 {
1420 Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
1421 Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
1422 Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
1423 blkn += 3;
1424
1425 if (encrypt)
1426 twofish_amd64_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
1427 Ls);
1428 else
1429 twofish_amd64_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
1430 Ls);
1431
1432 nblocks -= 3;
1433 outbuf += 3 * TWOFISH_BLOCKSIZE;
1434 inbuf += 3 * TWOFISH_BLOCKSIZE;
1435
1436 burn = 8 * sizeof(void*);
1437 if (burn > burn_stack_depth)
1438 burn_stack_depth = burn;
1439 }
1440
1441 /* Use generic code to handle smaller chunks... */
1442 }
1443
1444 c->u_mode.ocb.data_nblocks = blkn;
1445
1446 if (burn_stack_depth)
1447 _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
1448 #else
1449 (void)c;
1450 (void)outbuf_arg;
1451 (void)inbuf_arg;
1452 (void)encrypt;
1453 #endif
1454
1455 return nblocks;
1456 }
1457
1458 /* Bulk authentication of complete blocks in OCB mode. */
1459 static size_t
_gcry_twofish_ocb_auth(gcry_cipher_hd_t c,const void * abuf_arg,size_t nblocks)1460 _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
1461 size_t nblocks)
1462 {
1463 #ifdef USE_AMD64_ASM
1464 TWOFISH_context *ctx = (void *)&c->context.c;
1465 const unsigned char *abuf = abuf_arg;
1466 unsigned int burn, burn_stack_depth = 0;
1467 u64 blkn = c->u_mode.ocb.aad_nblocks;
1468
1469 #ifdef USE_AVX2
1470 if (ctx->use_avx2)
1471 {
1472 int did_use_avx2 = 0;
1473 u64 Ls[16];
1474 unsigned int n = 16 - (blkn % 16);
1475 u64 *l;
1476 int i;
1477
1478 if (nblocks >= 16)
1479 {
1480 for (i = 0; i < 16; i += 8)
1481 {
1482 /* Use u64 to store pointers for x32 support (assembly function
1483 * assumes 64-bit pointers). */
1484 Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1485 Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1486 Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1487 Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
1488 Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1489 Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
1490 Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
1491 }
1492
1493 Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
1494 l = &Ls[(15 + n) % 16];
1495
1496 /* Process data in 16 block chunks. */
1497 while (nblocks >= 16)
1498 {
1499 blkn += 16;
1500 *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
1501
1502 _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
1503 c->u_mode.ocb.aad_sum, Ls);
1504
1505 nblocks -= 16;
1506 abuf += 16 * TWOFISH_BLOCKSIZE;
1507 did_use_avx2 = 1;
1508 }
1509 }
1510
1511 if (did_use_avx2)
1512 {
1513 /* twofish-avx2 assembly code does not use stack */
1514 if (nblocks == 0)
1515 burn_stack_depth = 0;
1516 }
1517
1518 /* Use generic code to handle smaller chunks... */
1519 }
1520 #endif
1521
1522 {
1523 /* Use u64 to store pointers for x32 support (assembly function
1524 * assumes 64-bit pointers). */
1525 u64 Ls[3];
1526
1527 /* Process data in 3 block chunks. */
1528 while (nblocks >= 3)
1529 {
1530 Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
1531 Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
1532 Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
1533 blkn += 3;
1534
1535 twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
1536 c->u_mode.ocb.aad_sum, Ls);
1537
1538 nblocks -= 3;
1539 abuf += 3 * TWOFISH_BLOCKSIZE;
1540
1541 burn = 8 * sizeof(void*);
1542 if (burn > burn_stack_depth)
1543 burn_stack_depth = burn;
1544 }
1545
1546 /* Use generic code to handle smaller chunks... */
1547 }
1548
1549 c->u_mode.ocb.aad_nblocks = blkn;
1550
1551 if (burn_stack_depth)
1552 _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
1553 #else
1554 (void)c;
1555 (void)abuf_arg;
1556 #endif
1557
1558 return nblocks;
1559 }
1560
1561
1562
1563 /* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
1564 encryption. Returns NULL on success. */
1565 static const char *
selftest_ctr(void)1566 selftest_ctr (void)
1567 {
1568 const int nblocks = 16+1;
1569 const int blocksize = TWOFISH_BLOCKSIZE;
1570 const int context_size = sizeof(TWOFISH_context);
1571
1572 return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey,
1573 &twofish_encrypt, nblocks, blocksize, context_size);
1574 }
1575
1576 /* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption.
1577 Returns NULL on success. */
1578 static const char *
selftest_cbc(void)1579 selftest_cbc (void)
1580 {
1581 const int nblocks = 16+2;
1582 const int blocksize = TWOFISH_BLOCKSIZE;
1583 const int context_size = sizeof(TWOFISH_context);
1584
1585 return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey,
1586 &twofish_encrypt, nblocks, blocksize, context_size);
1587 }
1588
1589 /* Run the self-tests for TWOFISH-CFB, tests bulk CBC decryption.
1590 Returns NULL on success. */
1591 static const char *
selftest_cfb(void)1592 selftest_cfb (void)
1593 {
1594 const int nblocks = 16+2;
1595 const int blocksize = TWOFISH_BLOCKSIZE;
1596 const int context_size = sizeof(TWOFISH_context);
1597
1598 return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey,
1599 &twofish_encrypt, nblocks, blocksize, context_size);
1600 }
1601
1602
1603 /* Test a single encryption and decryption with each key size. */
1604
1605 static const char*
selftest(void)1606 selftest (void)
1607 {
1608 TWOFISH_context ctx; /* Expanded key. */
1609 byte scratch[16]; /* Encryption/decryption result buffer. */
1610 cipher_bulk_ops_t bulk_ops;
1611 const char *r;
1612
1613 /* Test vectors for single encryption/decryption. Note that I am using
1614 * the vectors from the Twofish paper's "known answer test", I=3 for
1615 * 128-bit and I=4 for 256-bit, instead of the all-0 vectors from the
1616 * "intermediate value test", because an all-0 key would trigger all the
1617 * special cases in the RS matrix multiply, leaving the math untested. */
1618 static byte plaintext[16] = {
1619 0xD4, 0x91, 0xDB, 0x16, 0xE7, 0xB1, 0xC3, 0x9E,
1620 0x86, 0xCB, 0x08, 0x6B, 0x78, 0x9F, 0x54, 0x19
1621 };
1622 static byte key[16] = {
1623 0x9F, 0x58, 0x9F, 0x5C, 0xF6, 0x12, 0x2C, 0x32,
1624 0xB6, 0xBF, 0xEC, 0x2F, 0x2A, 0xE8, 0xC3, 0x5A
1625 };
1626 static const byte ciphertext[16] = {
1627 0x01, 0x9F, 0x98, 0x09, 0xDE, 0x17, 0x11, 0x85,
1628 0x8F, 0xAA, 0xC3, 0xA3, 0xBA, 0x20, 0xFB, 0xC3
1629 };
1630 static byte plaintext_256[16] = {
1631 0x90, 0xAF, 0xE9, 0x1B, 0xB2, 0x88, 0x54, 0x4F,
1632 0x2C, 0x32, 0xDC, 0x23, 0x9B, 0x26, 0x35, 0xE6
1633 };
1634 static byte key_256[32] = {
1635 0xD4, 0x3B, 0xB7, 0x55, 0x6E, 0xA3, 0x2E, 0x46,
1636 0xF2, 0xA2, 0x82, 0xB7, 0xD4, 0x5B, 0x4E, 0x0D,
1637 0x57, 0xFF, 0x73, 0x9D, 0x4D, 0xC9, 0x2C, 0x1B,
1638 0xD7, 0xFC, 0x01, 0x70, 0x0C, 0xC8, 0x21, 0x6F
1639 };
1640 static const byte ciphertext_256[16] = {
1641 0x6C, 0xB4, 0x56, 0x1C, 0x40, 0xBF, 0x0A, 0x97,
1642 0x05, 0x93, 0x1C, 0xB6, 0xD4, 0x08, 0xE7, 0xFA
1643 };
1644
1645 twofish_setkey (&ctx, key, sizeof(key), &bulk_ops);
1646 twofish_encrypt (&ctx, scratch, plaintext);
1647 if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
1648 return "Twofish-128 test encryption failed.";
1649 twofish_decrypt (&ctx, scratch, scratch);
1650 if (memcmp (scratch, plaintext, sizeof (plaintext)))
1651 return "Twofish-128 test decryption failed.";
1652
1653 twofish_setkey (&ctx, key_256, sizeof(key_256), &bulk_ops);
1654 twofish_encrypt (&ctx, scratch, plaintext_256);
1655 if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
1656 return "Twofish-256 test encryption failed.";
1657 twofish_decrypt (&ctx, scratch, scratch);
1658 if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
1659 return "Twofish-256 test decryption failed.";
1660
1661 if ((r = selftest_ctr()) != NULL)
1662 return r;
1663 if ((r = selftest_cbc()) != NULL)
1664 return r;
1665 if ((r = selftest_cfb()) != NULL)
1666 return r;
1667
1668 return NULL;
1669 }
1670
/* More complete test program.  This does 1000 encryptions and decryptions
 * with each of 250 128-bit keys and 2000 encryptions and decryptions with
 * each of 125 256-bit keys, using a feedback scheme similar to a Feistel
 * cipher, so as to be sure of testing all the table entries pretty
 * thoroughly.  We keep changing the keys so as to get a more meaningful
 * performance number, since the key setup is non-trivial for Twofish. */

#ifdef TEST

#include <stdio.h>
#include <string.h>
#include <time.h>

/* Stand-alone driver, built only when TEST is defined.  Runs the
 * encryption pass, then the decryption pass that undoes it, compares the
 * buffer against the expected values after each pass and prints the two
 * verdicts plus the elapsed CPU time.  Always returns 0.  */
int
main (void)
{
  TWOFISH_context ctx;     /* Expanded key. */
  int i, j;                /* Loop counters. */
  cipher_bulk_ops_t bulk_ops;

  const char *encrypt_msg; /* Message to print regarding encryption test;
                            * the output is done outside the loop to avoid
                            * stuffing up the timing. */
  const char *decrypt_msg; /* Likewise for the decryption test. */
  clock_t timer;           /* For computing elapsed time. */

  /* Test buffer.  Rows serve both as keys and as data; rows 2 and 3 are
   * adjacent in memory and together form the 32-byte key of the 256-bit
   * rounds. */
  byte buffer[4][16] = {
    {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
     0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
    {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
     0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
     0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
    {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
     0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
  };

  /* Expected outputs for the million-operation test */
  static const byte test_encrypt[4][16] = {
    {0xC8, 0x23, 0xB8, 0xB7, 0x6B, 0xFE, 0x91, 0x13,
     0x2F, 0xA7, 0x5E, 0xE6, 0x94, 0x77, 0x6F, 0x6B},
    {0x90, 0x36, 0xD8, 0x29, 0xD5, 0x96, 0xC2, 0x8E,
     0xE4, 0xFF, 0x76, 0xBC, 0xE5, 0x77, 0x88, 0x27},
    {0xB8, 0x78, 0x69, 0xAF, 0x42, 0x8B, 0x48, 0x64,
     0xF7, 0xE9, 0xF3, 0x9C, 0x42, 0x18, 0x7B, 0x73},
    {0x7A, 0x88, 0xFB, 0xEB, 0x90, 0xA4, 0xB4, 0xA8,
     0x43, 0xA3, 0x1D, 0xF1, 0x26, 0xC4, 0x53, 0x57}
  };
  static const byte test_decrypt[4][16] = {
    {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
     0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
    {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
     0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
    {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
     0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
    {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
     0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
  };

  /* Start the timer ticking. */
  timer = clock ();

  /* Encryption test. */
  for (i = 0; i < 125; i++)
    {
      twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
      for (j = 0; j < 1000; j++)
        twofish_encrypt (&ctx, buffer[2], buffer[2]);
      twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
      for (j = 0; j < 1000; j++)
        twofish_encrypt (&ctx, buffer[3], buffer[3]);
      /* 256-bit key: rows 2 and 3 taken as one 32-byte key. */
      twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
      for (j = 0; j < 1000; j++) {
        twofish_encrypt (&ctx, buffer[0], buffer[0]);
        twofish_encrypt (&ctx, buffer[1], buffer[1]);
      }
    }
  encrypt_msg = memcmp (buffer, test_encrypt, sizeof (test_encrypt)) ?
    "encryption failure!\n" : "encryption OK!\n";

  /* Decryption test.  Mirrors the encryption pass in reverse order so
   * that the buffer returns to its initial contents. */
  for (i = 0; i < 125; i++)
    {
      twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
      for (j = 0; j < 1000; j++) {
        twofish_decrypt (&ctx, buffer[0], buffer[0]);
        twofish_decrypt (&ctx, buffer[1], buffer[1]);
      }
      twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
      for (j = 0; j < 1000; j++)
        twofish_decrypt (&ctx, buffer[3], buffer[3]);
      twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
      for (j = 0; j < 1000; j++)
        twofish_decrypt (&ctx, buffer[2], buffer[2]);
    }

  /* Stop the timer, and print results. */
  timer = clock () - timer;
  decrypt_msg = memcmp (buffer, test_decrypt, sizeof (test_decrypt)) ?
    "decryption failure!\n" : "decryption OK!\n";

  /* The messages are data, not format strings, so emit them with fputs
   * rather than handing a non-literal to printf. */
  fputs (encrypt_msg, stdout);
  fputs (decrypt_msg, stdout);
  printf ("elapsed time: %.1f s.\n", (float) timer / CLOCKS_PER_SEC);

  return 0;
}

#endif /* TEST */
1778
1779
1780
1781 gcry_cipher_spec_t _gcry_cipher_spec_twofish =
1782 {
1783 GCRY_CIPHER_TWOFISH, {0, 0},
1784 "TWOFISH", NULL, NULL, 16, 256, sizeof (TWOFISH_context),
1785 twofish_setkey, twofish_encrypt, twofish_decrypt
1786 };
1787
1788 gcry_cipher_spec_t _gcry_cipher_spec_twofish128 =
1789 {
1790 GCRY_CIPHER_TWOFISH128, {0, 0},
1791 "TWOFISH128", NULL, NULL, 16, 128, sizeof (TWOFISH_context),
1792 twofish_setkey, twofish_encrypt, twofish_decrypt
1793 };
1794