/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <GFp/aes.h>

#include "../../internal.h"

#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf


// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.
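//
// For example, with 64-bit words a logical byte is four bits wide, so
// |aes_nohw_shift_left(a, 1)| shifts |a| left by four bits, while in the SSE2
// implementation the same call shifts by a full eight-bit byte.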

#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return _mm_and_si128(a, b);
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return _mm_or_si128(a, b);
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return _mm_xor_si128(a, b);
}

static inline aes_word_t aes_nohw_not(aes_word_t a) {
  return _mm_xor_si128(
      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}

// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
  _mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
  _mm_srli_si128((a), (i))
#else  // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else  // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif  // OPENSSL_64_BIT

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return a & b;
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return a | b;
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return a ^ b;
}

static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }

static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
  return a << (i * AES_NOHW_BATCH_SIZE);
}

static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
  return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif  // OPENSSL_SSE2

OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
                      "batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
                      "AES_NOHW_WORD_SIZE is incorrect");


// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//   ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is a no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing
// bits in corresponding logical bytes. Continuing the 64-bit example:
//
//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.

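// AES_NOHW_BLOCK_WORDS is the number of words in one compact block: one with
// SSE2 (128-bit words), two with 64-bit words, and four with 32-bit words.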
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))

// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
  aes_word_t w[8];
} AES_NOHW_BATCH;

// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH|
// |AES_KEY|s so it should not be used as a long-term key representation.
typedef struct {
  // keys is an array of batches, one for each round key. Each batch stores
  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
  AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;

// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
  // will be correctly placed.)
  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
  batch->w[i] = in[0];
  batch->w[i + 4] = in[1];
#else
  batch->w[i] = in[0];
  batch->w[i + 2] = in[1];
  batch->w[i + 4] = in[2];
  batch->w[i + 6] = in[3];
#endif
}

// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
  out[0] = batch->w[i];
  out[1] = batch->w[i + 4];
#else
  out[0] = batch->w[i];
  out[1] = batch->w[i + 2];
  out[2] = batch->w[i + 4];
  out[3] = batch->w[i + 6];
#endif
}

#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
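// For example, aes_nohw_delta_swap(0xb4, 0x0f, 4) exchanges the two nibbles of
// 0xb4: b = (0xb4 ^ 0x0b) & 0x0f = 0x0f, and 0xb4 ^ 0x0f ^ 0xf0 = 0x4b.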
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
                                             aes_word_t shift) {
  // See
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
  aes_word_t b = (a ^ (a >> shift)) & mask;
  return a ^ b ^ (b << shift);
}

// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
  // Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
  // quartets of those chunks:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  return a;
}

static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  return a;
}
#else   // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
  // Note:  0x00cc = 0b0000_0000_1100_1100
  //   0x00cc << 6 = 0b0011_0011_0000_0000
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  // Now we swap groups of four bits (still numbering by pairs):
  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  return a;
}

static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  return a;
}

static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
                                                uint8_t a2, uint8_t a3) {
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
         ((uint32_t)a3 << 24);
}

static inline uint8_t lo(uint32_t a) {
  return (uint8_t)a;
}

#endif  // OPENSSL_64_BIT
#endif  // !OPENSSL_SSE2

static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                          const uint8_t in[16]) {
  GFp_memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = aes_nohw_compact_word(out[0]);
  uint64_t a1 = aes_nohw_compact_word(out[1]);
  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
  uint32_t a0 = aes_nohw_compact_word(out[0]);
  uint32_t a1 = aes_nohw_compact_word(out[1]);
  uint32_t a2 = aes_nohw_compact_word(out[2]);
  uint32_t a3 = aes_nohw_compact_word(out[3]);
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
  out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8),
                                    lo(a3 >> 8));
  out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16),
                                    lo(a3 >> 16));
  out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24),
                                    lo(a3 >> 24));
#endif
}

static inline void aes_nohw_uncompact_block(
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
  GFp_memcpy(out, in, 16);  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = in[0];
  uint64_t a1 = in[1];
  uint64_t b0 =
      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
  uint64_t b1 =
      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
  GFp_memcpy(out, &b0, 8);
  GFp_memcpy(out + 8, &b1, 8);
#else
  uint32_t a0 = in[0];
  uint32_t a1 = in[1];
  uint32_t a2 = in[2];
  uint32_t a3 = in[3];
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
  uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8),
                                         lo(a3 >> 8));
  uint32_t b2 = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16),
                                         lo(a2 >> 16), lo(a3 >> 16));
  uint32_t b3 = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24),
                                         lo(a2 >> 24), lo(a3 >> 24));
  b0 = aes_nohw_uncompact_word(b0);
  b1 = aes_nohw_uncompact_word(b1);
  b2 = aes_nohw_uncompact_word(b2);
  b3 = aes_nohw_uncompact_word(b3);
  GFp_memcpy(out, &b0, 4);
  GFp_memcpy(out + 4, &b1, 4);
  GFp_memcpy(out + 8, &b2, 4);
  GFp_memcpy(out + 12, &b3, 4);
#endif
}

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
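// For example, with mask 0x0f0f0f0f and shift 4, the high nibble of each byte
// of |*a| is exchanged with the low nibble of the corresponding byte of |*b|.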
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
                           /* uint32_t */ mask, /* const */ shift)        \
  do {                                                                    \
    __m128i swap =                                                        \
        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
                      _mm_set_epi32((mask), (mask), (mask), (mask)));     \
    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));            \
    *(b) = _mm_xor_si128(*(b), swap);                                     \
  } while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
                                      uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
  aes_word_t mask_w = mask;
#endif
  // This is a variation on a delta swap.
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
  *a ^= swap << shift;
  *b ^= swap;
}
#endif  // OPENSSL_SSE2

// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
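// For example, with 32-bit words (2 × 2 squares), the first pass below
// exchanges bit 1 of w[0] with bit 0 of w[1], bit 3 of w[0] with bit 2 of
// w[1], and so on.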
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);

#if AES_NOHW_BATCH_SIZE >= 4
  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif

#if AES_NOHW_BATCH_SIZE >= 8
  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}

// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
                              size_t num_blocks) {
  // Don't leave unused blocks uninitialized.
  GFp_memset(out, 0, sizeof(AES_NOHW_BATCH));
  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_compact_block(block, in + 16 * i);
    aes_nohw_batch_set(out, block, i);
  }

  aes_nohw_transpose(out);
}

// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
                                const AES_NOHW_BATCH *batch) {
  AES_NOHW_BATCH copy = *batch;
  aes_nohw_transpose(&copy);

  debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_batch_get(&copy, block, i);
    aes_nohw_uncompact_block(out + 16 * i, block);
  }
}


// AES round steps.

static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
                                   const AES_NOHW_BATCH *key) {
  for (size_t i = 0; i < 8; i++) {
    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
  }
}

static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
  aes_word_t x0 = batch->w[7];
  aes_word_t x1 = batch->w[6];
  aes_word_t x2 = batch->w[5];
  aes_word_t x3 = batch->w[4];
  aes_word_t x4 = batch->w[3];
  aes_word_t x5 = batch->w[2];
  aes_word_t x6 = batch->w[1];
  aes_word_t x7 = batch->w[0];

  // Figure 2, the top linear transformation.
  aes_word_t y14 = aes_nohw_xor(x3, x5);
  aes_word_t y13 = aes_nohw_xor(x0, x6);
  aes_word_t y9 = aes_nohw_xor(x0, x3);
  aes_word_t y8 = aes_nohw_xor(x0, x5);
  aes_word_t t0 = aes_nohw_xor(x1, x2);
  aes_word_t y1 = aes_nohw_xor(t0, x7);
  aes_word_t y4 = aes_nohw_xor(y1, x3);
  aes_word_t y12 = aes_nohw_xor(y13, y14);
  aes_word_t y2 = aes_nohw_xor(y1, x0);
  aes_word_t y5 = aes_nohw_xor(y1, x6);
  aes_word_t y3 = aes_nohw_xor(y5, y8);
  aes_word_t t1 = aes_nohw_xor(x4, y12);
  aes_word_t y15 = aes_nohw_xor(t1, x5);
  aes_word_t y20 = aes_nohw_xor(t1, x1);
  aes_word_t y6 = aes_nohw_xor(y15, x7);
  aes_word_t y10 = aes_nohw_xor(y15, t0);
  aes_word_t y11 = aes_nohw_xor(y20, y9);
  aes_word_t y7 = aes_nohw_xor(x7, y11);
  aes_word_t y17 = aes_nohw_xor(y10, y11);
  aes_word_t y19 = aes_nohw_xor(y10, y8);
  aes_word_t y16 = aes_nohw_xor(t0, y11);
  aes_word_t y21 = aes_nohw_xor(y13, y16);
  aes_word_t y18 = aes_nohw_xor(x0, y16);

  // Figure 3, the middle non-linear section.
  aes_word_t t2 = aes_nohw_and(y12, y15);
  aes_word_t t3 = aes_nohw_and(y3, y6);
  aes_word_t t4 = aes_nohw_xor(t3, t2);
  aes_word_t t5 = aes_nohw_and(y4, x7);
  aes_word_t t6 = aes_nohw_xor(t5, t2);
  aes_word_t t7 = aes_nohw_and(y13, y16);
  aes_word_t t8 = aes_nohw_and(y5, y1);
  aes_word_t t9 = aes_nohw_xor(t8, t7);
  aes_word_t t10 = aes_nohw_and(y2, y7);
  aes_word_t t11 = aes_nohw_xor(t10, t7);
  aes_word_t t12 = aes_nohw_and(y9, y11);
  aes_word_t t13 = aes_nohw_and(y14, y17);
  aes_word_t t14 = aes_nohw_xor(t13, t12);
  aes_word_t t15 = aes_nohw_and(y8, y10);
  aes_word_t t16 = aes_nohw_xor(t15, t12);
  aes_word_t t17 = aes_nohw_xor(t4, t14);
  aes_word_t t18 = aes_nohw_xor(t6, t16);
  aes_word_t t19 = aes_nohw_xor(t9, t14);
  aes_word_t t20 = aes_nohw_xor(t11, t16);
  aes_word_t t21 = aes_nohw_xor(t17, y20);
  aes_word_t t22 = aes_nohw_xor(t18, y19);
  aes_word_t t23 = aes_nohw_xor(t19, y21);
  aes_word_t t24 = aes_nohw_xor(t20, y18);
  aes_word_t t25 = aes_nohw_xor(t21, t22);
  aes_word_t t26 = aes_nohw_and(t21, t23);
  aes_word_t t27 = aes_nohw_xor(t24, t26);
  aes_word_t t28 = aes_nohw_and(t25, t27);
  aes_word_t t29 = aes_nohw_xor(t28, t22);
  aes_word_t t30 = aes_nohw_xor(t23, t24);
  aes_word_t t31 = aes_nohw_xor(t22, t26);
  aes_word_t t32 = aes_nohw_and(t31, t30);
  aes_word_t t33 = aes_nohw_xor(t32, t24);
  aes_word_t t34 = aes_nohw_xor(t23, t33);
  aes_word_t t35 = aes_nohw_xor(t27, t33);
  aes_word_t t36 = aes_nohw_and(t24, t35);
  aes_word_t t37 = aes_nohw_xor(t36, t34);
  aes_word_t t38 = aes_nohw_xor(t27, t36);
  aes_word_t t39 = aes_nohw_and(t29, t38);
  aes_word_t t40 = aes_nohw_xor(t25, t39);
  aes_word_t t41 = aes_nohw_xor(t40, t37);
  aes_word_t t42 = aes_nohw_xor(t29, t33);
  aes_word_t t43 = aes_nohw_xor(t29, t40);
  aes_word_t t44 = aes_nohw_xor(t33, t37);
  aes_word_t t45 = aes_nohw_xor(t42, t41);
  aes_word_t z0 = aes_nohw_and(t44, y15);
  aes_word_t z1 = aes_nohw_and(t37, y6);
  aes_word_t z2 = aes_nohw_and(t33, x7);
  aes_word_t z3 = aes_nohw_and(t43, y16);
  aes_word_t z4 = aes_nohw_and(t40, y1);
  aes_word_t z5 = aes_nohw_and(t29, y7);
  aes_word_t z6 = aes_nohw_and(t42, y11);
  aes_word_t z7 = aes_nohw_and(t45, y17);
  aes_word_t z8 = aes_nohw_and(t41, y10);
  aes_word_t z9 = aes_nohw_and(t44, y12);
  aes_word_t z10 = aes_nohw_and(t37, y3);
  aes_word_t z11 = aes_nohw_and(t33, y4);
  aes_word_t z12 = aes_nohw_and(t43, y13);
  aes_word_t z13 = aes_nohw_and(t40, y5);
  aes_word_t z14 = aes_nohw_and(t29, y2);
  aes_word_t z15 = aes_nohw_and(t42, y9);
  aes_word_t z16 = aes_nohw_and(t45, y14);
  aes_word_t z17 = aes_nohw_and(t41, y8);

  // Figure 4, bottom linear transformation.
  aes_word_t t46 = aes_nohw_xor(z15, z16);
  aes_word_t t47 = aes_nohw_xor(z10, z11);
  aes_word_t t48 = aes_nohw_xor(z5, z13);
  aes_word_t t49 = aes_nohw_xor(z9, z10);
  aes_word_t t50 = aes_nohw_xor(z2, z12);
  aes_word_t t51 = aes_nohw_xor(z2, z5);
  aes_word_t t52 = aes_nohw_xor(z7, z8);
  aes_word_t t53 = aes_nohw_xor(z0, z3);
  aes_word_t t54 = aes_nohw_xor(z6, z7);
  aes_word_t t55 = aes_nohw_xor(z16, z17);
  aes_word_t t56 = aes_nohw_xor(z12, t48);
  aes_word_t t57 = aes_nohw_xor(t50, t53);
  aes_word_t t58 = aes_nohw_xor(z4, t46);
  aes_word_t t59 = aes_nohw_xor(z3, t54);
  aes_word_t t60 = aes_nohw_xor(t46, t57);
  aes_word_t t61 = aes_nohw_xor(z14, t57);
  aes_word_t t62 = aes_nohw_xor(t52, t58);
  aes_word_t t63 = aes_nohw_xor(t49, t58);
  aes_word_t t64 = aes_nohw_xor(z4, t59);
  aes_word_t t65 = aes_nohw_xor(t61, t62);
  aes_word_t t66 = aes_nohw_xor(z1, t63);
  aes_word_t s0 = aes_nohw_xor(t59, t63);
  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
  aes_word_t t67 = aes_nohw_xor(t64, t65);
  aes_word_t s3 = aes_nohw_xor(t53, t66);
  aes_word_t s4 = aes_nohw_xor(t51, t66);
  aes_word_t s5 = aes_nohw_xor(t47, t65);
  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));

  batch->w[0] = s7;
  batch->w[1] = s6;
  batch->w[2] = s5;
  batch->w[3] = s4;
  batch->w[4] = s3;
  batch->w[5] = s2;
  batch->w[6] = s1;
  batch->w[7] = s0;
}

// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
  (aes_nohw_or(aes_nohw_shift_right((v), (n)*4),                      \
               aes_nohw_shift_left((v), 16 - (n)*4)))

static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 1);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 3);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
         ((v << 12) & UINT64_C(0xf000f000f000f000));
#else
  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}

// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}

static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);

  batch->w[0] =
      aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
  batch->w[1] =
      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
  batch->w[2] =
      aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
  batch->w[3] =
      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
  batch->w[5] =
      aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
  batch->w[6] =
      aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
  batch->w[7] =
      aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}

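// aes_nohw_encrypt_batch encrypts |batch| in place using the expanded key
// schedule |key|, running |num_rounds| rounds. As AES specifies, the final
// round omits MixColumns.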
static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[0]);
  for (size_t i = 1; i < num_rounds; i++) {
    aes_nohw_sub_bytes(batch);
    aes_nohw_shift_rows(batch);
    aes_nohw_mix_columns(batch);
    aes_nohw_add_round_key(batch, &key->keys[i]);
  }
  aes_nohw_sub_bytes(batch);
  aes_nohw_shift_rows(batch);
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}

// Key schedule.

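// aes_nohw_expand_round_keys converts the compact round keys in |key| into an
// |AES_NOHW_SCHEDULE|, filling each batch with |AES_NOHW_BATCH_SIZE| copies of
// the corresponding round key in bitsliced form.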
static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                       const AES_KEY *key) {
  for (unsigned i = 0; i <= key->rounds; i++) {
    // Copy the round key into each block in the batch.
    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
      GFp_memcpy(tmp, key->rd_key + 4 * i, 16);
      aes_nohw_batch_set(&out->keys[i], tmp, j);
    }
    aes_nohw_transpose(&out->keys[i]);
  }
}

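// aes_nohw_rcon[i - 1] is the AES round constant for round |i|, i.e. x^(i-1)
// in GF(2^8) with AES's reduction polynomial.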
static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                          0x20, 0x40, 0x80, 0x1b, 0x36};

// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in a |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
  return _mm_set_epi32(0, 0, 0, rcon);
#else
  return ((aes_word_t)rcon);
#endif
}

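// aes_nohw_sub_block applies the AES S-box to each byte of the single compact
// block |in| and writes the result to |out|, by placing the block in an
// otherwise-empty batch.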
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
  AES_NOHW_BATCH batch;
  GFp_memset(&batch, 0, sizeof(batch));
  aes_nohw_batch_set(&batch, in, 0);
  aes_nohw_transpose(&batch);
  aes_nohw_sub_bytes(&batch);
  aes_nohw_transpose(&batch);
  aes_nohw_batch_get(&batch, out, 0);
}

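// aes_nohw_setup_key_128 expands the 16-byte AES-128 key |in| into the key
// schedule of |key|, storing each round key in compact form.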
static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
  key->rounds = 10;

  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  GFp_memcpy(key->rd_key, block, 16);

  for (size_t i = 1; i <= 10; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block);
    uint8_t rcon = aes_nohw_rcon[i - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
      block[j] = aes_nohw_xor(
          block[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words. Note this is reordered from the usual
      // formulation to avoid needing masks.
      aes_word_t v = block[j];
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * i, block, 16);
  }
}

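// aes_nohw_setup_key_256 expands the 32-byte AES-256 key |in| into the key
// schedule of |key|, storing each round key in compact form.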
static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
  key->rounds = 14;

  // Each key schedule iteration produces two round keys.
  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block1, in);
  GFp_memcpy(key->rd_key, block1, 16);

  aes_nohw_compact_block(block2, in + 16);
  GFp_memcpy(key->rd_key + 4, block2, 16);

  for (size_t i = 2; i <= 14; i += 2) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
      block1[j] = aes_nohw_xor(
          block1[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * i, block1, 16);

    if (i == 14) {
      break;
    }

    aes_nohw_sub_block(sub, block1);
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate the transformed word into the first word.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
    }
    GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16);
  }
}


// External API.

int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                                 AES_KEY *aeskey) {
  switch (bits) {
    case 128:
      aes_nohw_setup_key_128(aeskey, key);
      return 0;
    case 256:
      aes_nohw_setup_key_256(aeskey, key);
      return 0;
  }
  return 1;
}

void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}

static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
                                      const uint8_t b[16]) {
  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
    aes_word_t x, y;
    GFp_memcpy(&x, a + i, sizeof(aes_word_t));
    GFp_memcpy(&y, b + i, sizeof(aes_word_t));
    x = aes_nohw_xor(x, y);
    GFp_memcpy(out + i, &x, sizeof(aes_word_t));
  }
}

void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                       size_t blocks, const AES_KEY *key,
                                       const uint8_t ivec[16]) {
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);

  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
  alignas(AES_NOHW_WORD_SIZE) union {
    uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
    uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
  } ivs, enc_ivs;
  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
    GFp_memcpy(ivs.u8 + 16 * i, ivec, 16);
  }

  uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
  for (;;) {
    // Update counters.
    for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
      ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
    }

    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, ivs.u8, todo);
    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(enc_ivs.u8, todo, &batch);

    for (size_t i = 0; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
    }

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
    ctr += AES_NOHW_BATCH_SIZE;
  }
}