1 //! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit)
2 //! adapted from the C implementation.
3 //!
4 //! All implementations are fully bitsliced and do not rely on any
5 //! Look-Up Table (LUT).
6 //!
7 //! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
8 //!
9 //! # Author (original C code)
10 //!
11 //! Alexandre Adomnicai, Nanyang Technological University, Singapore
12 //! <alexandre.adomnicai@ntu.edu.sg>
13 //!
14 //! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.
15 
16 #![allow(clippy::unreadable_literal)]
17 
18 use crate::Block;
19 use cipher::{
20     consts::{U16, U24, U32},
21     generic_array::GenericArray,
22 };
23 
24 /// AES block batch size for this implementation
25 pub(crate) const FIXSLICE_BLOCKS: usize = 4;
26 
/// AES-128 round keys (11 round keys of 8 bitsliced words each)
pub(crate) type FixsliceKeys128 = [u64; 88];

/// AES-192 round keys (13 round keys of 8 bitsliced words each)
pub(crate) type FixsliceKeys192 = [u64; 104];

/// AES-256 round keys (15 round keys of 8 bitsliced words each)
pub(crate) type FixsliceKeys256 = [u64; 120];

/// 512-bit internal state (four 128-bit blocks, bitsliced)
pub(crate) type State = [u64; 8];
38 
39 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
41     let mut rkeys = [0u64; 88];
42 
43     bitslice(&mut rkeys[..8], key, key, key, key);
44 
45     let mut rk_off = 0;
46     for rcon in 0..10 {
47         memshift32(&mut rkeys, rk_off);
48         rk_off += 8;
49 
50         sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
51         sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
52 
53         if rcon < 8 {
54             add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
55         } else {
56             add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
57             add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
58             add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
59             add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
60         }
61 
62         xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
63     }
64 
65     // Adjust to match fixslicing format
66     #[cfg(feature = "compact")]
67     {
68         for i in (8..88).step_by(16) {
69             inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
70         }
71     }
72     #[cfg(not(feature = "compact"))]
73     {
74         for i in (8..72).step_by(32) {
75             inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
76             inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
77             inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
78         }
79         inv_shift_rows_1(&mut rkeys[72..80]);
80     }
81 
82     // Account for NOTs removed from sub_bytes
83     for i in 1..11 {
84         sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
85     }
86 
87     rkeys
88 }
89 
90 /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
92     let mut rkeys = [0u64; 104];
93     let mut tmp = [0u64; 8];
94 
95     bitslice(
96         &mut rkeys[..8],
97         &key[..16],
98         &key[..16],
99         &key[..16],
100         &key[..16],
101     );
102     bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);
103 
104     let mut rcon = 0;
105     let mut rk_off = 8;
106 
107     loop {
108         for i in 0..8 {
109             rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
110                 | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
111         }
112 
113         sub_bytes(&mut tmp);
114         sub_bytes_nots(&mut tmp);
115 
116         add_round_constant_bit(&mut tmp, rcon);
117         rcon += 1;
118 
119         for i in 0..8 {
120             let mut ti = rkeys[rk_off + i];
121             ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
122             ti ^= 0xf000f000f000f000 & (ti << 4);
123             tmp[i] = ti;
124         }
125         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
126         rk_off += 8;
127 
128         for i in 0..8 {
129             let ui = tmp[i];
130             let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
131                 | (0xff00ff00ff00ff00 & (ui << 8));
132             ti ^= 0x000f000f000f000f & (ui >> 12);
133             tmp[i] = ti
134                 ^ (0xfff0fff0fff0fff0 & (ti << 4))
135                 ^ (0xff00ff00ff00ff00 & (ti << 8))
136                 ^ (0xf000f000f000f000 & (ti << 12));
137         }
138         rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
139         rk_off += 8;
140 
141         sub_bytes(&mut tmp);
142         sub_bytes_nots(&mut tmp);
143 
144         add_round_constant_bit(&mut tmp, rcon);
145         rcon += 1;
146 
147         for i in 0..8 {
148             let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
149                 | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
150             ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
151             rkeys[rk_off + i] = ti
152                 ^ (0xfff0fff0fff0fff0 & (ti << 4))
153                 ^ (0xff00ff00ff00ff00 & (ti << 8))
154                 ^ (0xf000f000f000f000 & (ti << 12));
155         }
156         rk_off += 8;
157 
158         if rcon >= 8 {
159             break;
160         }
161 
162         for i in 0..8 {
163             let ui = rkeys[(rk_off - 8) + i];
164             let mut ti = rkeys[(rk_off - 16) + i];
165             ti ^= 0x0f000f000f000f00 & (ui >> 4);
166             ti ^= 0xf000f000f000f000 & (ti << 4);
167             tmp[i] = ti;
168         }
169     }
170 
171     // Adjust to match fixslicing format
172     #[cfg(feature = "compact")]
173     {
174         for i in (8..104).step_by(16) {
175             inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
176         }
177     }
178     #[cfg(not(feature = "compact"))]
179     {
180         for i in (0..96).step_by(32) {
181             inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
182             inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
183             inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
184         }
185     }
186 
187     // Account for NOTs removed from sub_bytes
188     for i in 1..13 {
189         sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
190     }
191 
192     rkeys
193 }
194 
195 /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
197     let mut rkeys = [0u64; 120];
198 
199     bitslice(
200         &mut rkeys[..8],
201         &key[..16],
202         &key[..16],
203         &key[..16],
204         &key[..16],
205     );
206     bitslice(
207         &mut rkeys[8..16],
208         &key[16..],
209         &key[16..],
210         &key[16..],
211         &key[16..],
212     );
213 
214     let mut rk_off = 8;
215 
216     let mut rcon = 0;
217     loop {
218         memshift32(&mut rkeys, rk_off);
219         rk_off += 8;
220 
221         sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
222         sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
223 
224         add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
225         xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
226         rcon += 1;
227 
228         if rcon == 7 {
229             break;
230         }
231 
232         memshift32(&mut rkeys, rk_off);
233         rk_off += 8;
234 
235         sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
236         sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
237 
238         xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
239     }
240 
241     // Adjust to match fixslicing format
242     #[cfg(feature = "compact")]
243     {
244         for i in (8..120).step_by(16) {
245             inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
246         }
247     }
248     #[cfg(not(feature = "compact"))]
249     {
250         for i in (8..104).step_by(32) {
251             inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
252             inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
253             inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
254         }
255         inv_shift_rows_1(&mut rkeys[104..112]);
256     }
257 
258     // Account for NOTs removed from sub_bytes
259     for i in 1..15 {
260         sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
261     }
262 
263     rkeys
264 }
265 
266 /// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
267 ///
268 /// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
270     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
271     let mut state = State::default();
272 
273     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
274 
275     add_round_key(&mut state, &rkeys[80..]);
276     inv_sub_bytes(&mut state);
277 
278     #[cfg(not(feature = "compact"))]
279     {
280         inv_shift_rows_2(&mut state);
281     }
282 
283     let mut rk_off = 72;
284     loop {
285         #[cfg(feature = "compact")]
286         {
287             inv_shift_rows_2(&mut state);
288         }
289 
290         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
291         inv_mix_columns_1(&mut state);
292         inv_sub_bytes(&mut state);
293         rk_off -= 8;
294 
295         if rk_off == 0 {
296             break;
297         }
298 
299         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
300         inv_mix_columns_0(&mut state);
301         inv_sub_bytes(&mut state);
302         rk_off -= 8;
303 
304         #[cfg(not(feature = "compact"))]
305         {
306             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
307             inv_mix_columns_3(&mut state);
308             inv_sub_bytes(&mut state);
309             rk_off -= 8;
310 
311             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
312             inv_mix_columns_2(&mut state);
313             inv_sub_bytes(&mut state);
314             rk_off -= 8;
315         }
316     }
317 
318     add_round_key(&mut state, &rkeys[..8]);
319 
320     inv_bitslice(&state, blocks);
321 }
322 
323 /// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
324 ///
325 /// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
327     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
328     let mut state = State::default();
329 
330     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
331 
332     add_round_key(&mut state, &rkeys[..8]);
333 
334     let mut rk_off = 8;
335     loop {
336         sub_bytes(&mut state);
337         mix_columns_1(&mut state);
338         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
339         rk_off += 8;
340 
341         #[cfg(feature = "compact")]
342         {
343             shift_rows_2(&mut state);
344         }
345 
346         if rk_off == 80 {
347             break;
348         }
349 
350         #[cfg(not(feature = "compact"))]
351         {
352             sub_bytes(&mut state);
353             mix_columns_2(&mut state);
354             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
355             rk_off += 8;
356 
357             sub_bytes(&mut state);
358             mix_columns_3(&mut state);
359             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
360             rk_off += 8;
361         }
362 
363         sub_bytes(&mut state);
364         mix_columns_0(&mut state);
365         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
366         rk_off += 8;
367     }
368 
369     #[cfg(not(feature = "compact"))]
370     {
371         shift_rows_2(&mut state);
372     }
373 
374     sub_bytes(&mut state);
375     add_round_key(&mut state, &rkeys[80..]);
376 
377     inv_bitslice(&state, blocks);
378 }
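
// A minimal sanity-check sketch added by the editor (not part of the original
// implementation): it exercises the batch key schedule / encrypt / decrypt
// functions above against the FIPS-197 Appendix C.1 AES-128 vector. The module
// and test names are illustrative only.
#[cfg(test)]
mod aes128_fixslice_tests {
    use super::*;

    #[test]
    fn aes128_known_answer_and_roundtrip() {
        // FIPS-197 Appendix C.1: key, plaintext and expected ciphertext.
        let key = GenericArray::<u8, U16>::clone_from_slice(&[
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        ]);
        let plaintext = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        ];
        let ciphertext = [
            0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
            0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
        ];

        let rkeys = aes128_key_schedule(&key);

        // This implementation always processes FIXSLICE_BLOCKS (4) blocks at once.
        let mut blocks = [Block::clone_from_slice(&plaintext); FIXSLICE_BLOCKS];
        aes128_encrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block.as_slice(), &ciphertext[..]);
        }

        aes128_decrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block.as_slice(), &plaintext[..]);
        }
    }
}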
379 
380 /// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
381 ///
382 /// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
384     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
385     let mut state = State::default();
386 
387     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
388 
389     add_round_key(&mut state, &rkeys[96..]);
390     inv_sub_bytes(&mut state);
391 
392     let mut rk_off = 88;
393     loop {
394         #[cfg(feature = "compact")]
395         {
396             inv_shift_rows_2(&mut state);
397         }
398         #[cfg(not(feature = "compact"))]
399         {
400             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
401             inv_mix_columns_3(&mut state);
402             inv_sub_bytes(&mut state);
403             rk_off -= 8;
404 
405             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
406             inv_mix_columns_2(&mut state);
407             inv_sub_bytes(&mut state);
408             rk_off -= 8;
409         }
410 
411         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
412         inv_mix_columns_1(&mut state);
413         inv_sub_bytes(&mut state);
414         rk_off -= 8;
415 
416         if rk_off == 0 {
417             break;
418         }
419 
420         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
421         inv_mix_columns_0(&mut state);
422         inv_sub_bytes(&mut state);
423         rk_off -= 8;
424     }
425 
426     add_round_key(&mut state, &rkeys[..8]);
427 
428     inv_bitslice(&state, blocks);
429 }
430 
431 /// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
432 ///
433 /// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
435     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
436     let mut state = State::default();
437 
438     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
439 
440     add_round_key(&mut state, &rkeys[..8]);
441 
442     let mut rk_off = 8;
443     loop {
444         sub_bytes(&mut state);
445         mix_columns_1(&mut state);
446         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
447         rk_off += 8;
448 
449         #[cfg(feature = "compact")]
450         {
451             shift_rows_2(&mut state);
452         }
453         #[cfg(not(feature = "compact"))]
454         {
455             sub_bytes(&mut state);
456             mix_columns_2(&mut state);
457             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
458             rk_off += 8;
459 
460             sub_bytes(&mut state);
461             mix_columns_3(&mut state);
462             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
463             rk_off += 8;
464         }
465 
466         if rk_off == 96 {
467             break;
468         }
469 
470         sub_bytes(&mut state);
471         mix_columns_0(&mut state);
472         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
473         rk_off += 8;
474     }
475 
476     sub_bytes(&mut state);
477     add_round_key(&mut state, &rkeys[96..]);
478 
479     inv_bitslice(&state, blocks);
480 }
481 
482 /// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
483 ///
484 /// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
486     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
487     let mut state = State::default();
488 
489     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
490 
491     add_round_key(&mut state, &rkeys[112..]);
492     inv_sub_bytes(&mut state);
493 
494     #[cfg(not(feature = "compact"))]
495     {
496         inv_shift_rows_2(&mut state);
497     }
498 
499     let mut rk_off = 104;
500     loop {
501         #[cfg(feature = "compact")]
502         {
503             inv_shift_rows_2(&mut state);
504         }
505 
506         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
507         inv_mix_columns_1(&mut state);
508         inv_sub_bytes(&mut state);
509         rk_off -= 8;
510 
511         if rk_off == 0 {
512             break;
513         }
514 
515         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
516         inv_mix_columns_0(&mut state);
517         inv_sub_bytes(&mut state);
518         rk_off -= 8;
519 
520         #[cfg(not(feature = "compact"))]
521         {
522             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
523             inv_mix_columns_3(&mut state);
524             inv_sub_bytes(&mut state);
525             rk_off -= 8;
526 
527             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
528             inv_mix_columns_2(&mut state);
529             inv_sub_bytes(&mut state);
530             rk_off -= 8;
531         }
532     }
533 
534     add_round_key(&mut state, &rkeys[..8]);
535 
536     inv_bitslice(&state, blocks);
537 }
538 
539 /// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
540 ///
541 /// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
543     debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
544     let mut state = State::default();
545 
546     bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
547 
548     add_round_key(&mut state, &rkeys[..8]);
549 
550     let mut rk_off = 8;
551     loop {
552         sub_bytes(&mut state);
553         mix_columns_1(&mut state);
554         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
555         rk_off += 8;
556 
557         #[cfg(feature = "compact")]
558         {
559             shift_rows_2(&mut state);
560         }
561 
562         if rk_off == 112 {
563             break;
564         }
565 
566         #[cfg(not(feature = "compact"))]
567         {
568             sub_bytes(&mut state);
569             mix_columns_2(&mut state);
570             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
571             rk_off += 8;
572 
573             sub_bytes(&mut state);
574             mix_columns_3(&mut state);
575             add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
576             rk_off += 8;
577         }
578 
579         sub_bytes(&mut state);
580         mix_columns_0(&mut state);
581         add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
582         rk_off += 8;
583     }
584 
585     #[cfg(not(feature = "compact"))]
586     {
587         shift_rows_2(&mut state);
588     }
589 
590     sub_bytes(&mut state);
591     add_round_key(&mut state, &rkeys[112..]);
592 
593     inv_bitslice(&state, blocks);
594 }
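
// A companion sketch (editor's addition, not from the original source): simple
// encrypt/decrypt round-trips for the AES-192 and AES-256 batch functions above,
// using arbitrary key and plaintext bytes.
#[cfg(test)]
mod aes192_aes256_fixslice_tests {
    use super::*;

    #[test]
    fn aes192_roundtrip() {
        let key = GenericArray::<u8, U24>::clone_from_slice(&[0x42; 24]);
        let rkeys = aes192_key_schedule(&key);

        let plaintext = Block::clone_from_slice(&[0x24; 16]);
        let mut blocks = [plaintext; FIXSLICE_BLOCKS];
        aes192_encrypt(&rkeys, &mut blocks);
        assert_ne!(blocks[0], plaintext);

        aes192_decrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block, &plaintext);
        }
    }

    #[test]
    fn aes256_roundtrip() {
        let key = GenericArray::<u8, U32>::clone_from_slice(&[0x13; 32]);
        let rkeys = aes256_key_schedule(&key);

        let plaintext = Block::clone_from_slice(&[0x37; 16]);
        let mut blocks = [plaintext; FIXSLICE_BLOCKS];
        aes256_encrypt(&rkeys, &mut blocks);
        assert_ne!(blocks[0], plaintext);

        aes256_decrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block, &plaintext);
        }
    }
}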
595 
/// Bitsliced implementation of the inverse AES S-box (InvSubBytes).
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
/// inverse of `sub_bytes`.
fn inv_sub_bytes(state: &mut [u64]) {
599     debug_assert_eq!(state.len(), 8);
600 
601     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
602     // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
603 
604     let u7 = state[0];
605     let u6 = state[1];
606     let u5 = state[2];
607     let u4 = state[3];
608     let u3 = state[4];
609     let u2 = state[5];
610     let u1 = state[6];
611     let u0 = state[7];
612 
613     let t23 = u0 ^ u3;
614     let t8 = u1 ^ t23;
615     let m2 = t23 & t8;
616     let t4 = u4 ^ t8;
617     let t22 = u1 ^ u3;
618     let t2 = u0 ^ u1;
619     let t1 = u3 ^ u4;
620     // t23 -> stack
621     let t9 = u7 ^ t1;
622     // t8 -> stack
623     let m7 = t22 & t9;
624     // t9 -> stack
625     let t24 = u4 ^ u7;
626     // m7 -> stack
627     let t10 = t2 ^ t24;
628     // u4 -> stack
629     let m14 = t2 & t10;
630     let r5 = u6 ^ u7;
631     // m2 -> stack
632     let t3 = t1 ^ r5;
633     // t2 -> stack
634     let t13 = t2 ^ r5;
635     let t19 = t22 ^ r5;
636     // t3 -> stack
637     let t17 = u2 ^ t19;
638     // t4 -> stack
639     let t25 = u2 ^ t1;
640     let r13 = u1 ^ u6;
641     // t25 -> stack
642     let t20 = t24 ^ r13;
643     // t17 -> stack
644     let m9 = t20 & t17;
645     // t20 -> stack
646     let r17 = u2 ^ u5;
647     // t22 -> stack
648     let t6 = t22 ^ r17;
649     // t13 -> stack
650     let m1 = t13 & t6;
651     let y5 = u0 ^ r17;
652     let m4 = t19 & y5;
653     let m5 = m4 ^ m1;
654     let m17 = m5 ^ t24;
655     let r18 = u5 ^ u6;
656     let t27 = t1 ^ r18;
657     let t15 = t10 ^ t27;
658     // t6 -> stack
659     let m11 = t1 & t15;
660     let m15 = m14 ^ m11;
661     let m21 = m17 ^ m15;
662     // t1 -> stack
663     // t4 <- stack
664     let m12 = t4 & t27;
665     let m13 = m12 ^ m11;
666     let t14 = t10 ^ r18;
667     let m3 = t14 ^ m1;
668     // m2 <- stack
669     let m16 = m3 ^ m2;
670     let m20 = m16 ^ m13;
671     // u4 <- stack
672     let r19 = u2 ^ u4;
673     let t16 = r13 ^ r19;
674     // t3 <- stack
675     let t26 = t3 ^ t16;
676     let m6 = t3 & t16;
677     let m8 = t26 ^ m6;
678     // t10 -> stack
679     // m7 <- stack
680     let m18 = m8 ^ m7;
681     let m22 = m18 ^ m13;
682     let m25 = m22 & m20;
683     let m26 = m21 ^ m25;
684     let m10 = m9 ^ m6;
685     let m19 = m10 ^ m15;
686     // t25 <- stack
687     let m23 = m19 ^ t25;
688     let m28 = m23 ^ m25;
689     let m24 = m22 ^ m23;
690     let m30 = m26 & m24;
691     let m39 = m23 ^ m30;
692     let m48 = m39 & y5;
693     let m57 = m39 & t19;
694     // m48 -> stack
695     let m36 = m24 ^ m25;
696     let m31 = m20 & m23;
697     let m27 = m20 ^ m21;
698     let m32 = m27 & m31;
699     let m29 = m28 & m27;
700     let m37 = m21 ^ m29;
701     // m39 -> stack
702     let m42 = m37 ^ m39;
703     let m52 = m42 & t15;
704     // t27 -> stack
705     // t1 <- stack
706     let m61 = m42 & t1;
707     let p0 = m52 ^ m61;
708     let p16 = m57 ^ m61;
709     // m57 -> stack
710     // t20 <- stack
711     let m60 = m37 & t20;
712     // p16 -> stack
713     // t17 <- stack
714     let m51 = m37 & t17;
715     let m33 = m27 ^ m25;
716     let m38 = m32 ^ m33;
717     let m43 = m37 ^ m38;
718     let m49 = m43 & t16;
719     let p6 = m49 ^ m60;
720     let p13 = m49 ^ m51;
721     let m58 = m43 & t3;
722     // t9 <- stack
723     let m50 = m38 & t9;
724     // t22 <- stack
725     let m59 = m38 & t22;
726     // p6 -> stack
727     let p1 = m58 ^ m59;
728     let p7 = p0 ^ p1;
729     let m34 = m21 & m22;
730     let m35 = m24 & m34;
731     let m40 = m35 ^ m36;
732     let m41 = m38 ^ m40;
733     let m45 = m42 ^ m41;
734     // t27 <- stack
735     let m53 = m45 & t27;
736     let p8 = m50 ^ m53;
737     let p23 = p7 ^ p8;
738     // t4 <- stack
739     let m62 = m45 & t4;
740     let p14 = m49 ^ m62;
741     let s6 = p14 ^ p23;
742     // t10 <- stack
743     let m54 = m41 & t10;
744     let p2 = m54 ^ m62;
745     let p22 = p2 ^ p7;
746     let s0 = p13 ^ p22;
747     let p17 = m58 ^ p2;
748     let p15 = m54 ^ m59;
749     // t2 <- stack
750     let m63 = m41 & t2;
751     // m39 <- stack
752     let m44 = m39 ^ m40;
753     // p17 -> stack
754     // t6 <- stack
755     let m46 = m44 & t6;
756     let p5 = m46 ^ m51;
757     // p23 -> stack
758     let p18 = m63 ^ p5;
759     let p24 = p5 ^ p7;
760     // m48 <- stack
761     let p12 = m46 ^ m48;
762     let s3 = p12 ^ p22;
763     // t13 <- stack
764     let m55 = m44 & t13;
765     let p9 = m55 ^ m63;
766     // p16 <- stack
767     let s7 = p9 ^ p16;
768     // t8 <- stack
769     let m47 = m40 & t8;
770     let p3 = m47 ^ m50;
771     let p19 = p2 ^ p3;
772     let s5 = p19 ^ p24;
773     let p11 = p0 ^ p3;
774     let p26 = p9 ^ p11;
775     // t23 <- stack
776     let m56 = m40 & t23;
777     let p4 = m48 ^ m56;
778     // p6 <- stack
779     let p20 = p4 ^ p6;
780     let p29 = p15 ^ p20;
781     let s1 = p26 ^ p29;
782     // m57 <- stack
783     let p10 = m57 ^ p4;
784     let p27 = p10 ^ p18;
785     // p23 <- stack
786     let s4 = p23 ^ p27;
787     let p25 = p6 ^ p10;
788     let p28 = p11 ^ p25;
789     // p17 <- stack
790     let s2 = p17 ^ p28;
791 
792     state[0] = s7;
793     state[1] = s6;
794     state[2] = s5;
795     state[3] = s4;
796     state[4] = s3;
797     state[5] = s2;
798     state[6] = s1;
799     state[7] = s0;
800 }
801 
802 /// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
803 ///
804 /// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
805 ///
806 /// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
fn sub_bytes(state: &mut [u64]) {
808     debug_assert_eq!(state.len(), 8);
809 
810     // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
811     // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
812 
813     let u7 = state[0];
814     let u6 = state[1];
815     let u5 = state[2];
816     let u4 = state[3];
817     let u3 = state[4];
818     let u2 = state[5];
819     let u1 = state[6];
820     let u0 = state[7];
821 
822     let y14 = u3 ^ u5;
823     let y13 = u0 ^ u6;
824     let y12 = y13 ^ y14;
825     let t1 = u4 ^ y12;
826     let y15 = t1 ^ u5;
827     let t2 = y12 & y15;
828     let y6 = y15 ^ u7;
829     let y20 = t1 ^ u1;
830     // y12 -> stack
831     let y9 = u0 ^ u3;
832     // y20 -> stack
833     let y11 = y20 ^ y9;
834     // y9 -> stack
835     let t12 = y9 & y11;
836     // y6 -> stack
837     let y7 = u7 ^ y11;
838     let y8 = u0 ^ u5;
839     let t0 = u1 ^ u2;
840     let y10 = y15 ^ t0;
841     // y15 -> stack
842     let y17 = y10 ^ y11;
843     // y14 -> stack
844     let t13 = y14 & y17;
845     let t14 = t13 ^ t12;
846     // y17 -> stack
847     let y19 = y10 ^ y8;
848     // y10 -> stack
849     let t15 = y8 & y10;
850     let t16 = t15 ^ t12;
851     let y16 = t0 ^ y11;
852     // y11 -> stack
853     let y21 = y13 ^ y16;
854     // y13 -> stack
855     let t7 = y13 & y16;
856     // y16 -> stack
857     let y18 = u0 ^ y16;
858     let y1 = t0 ^ u7;
859     let y4 = y1 ^ u3;
860     // u7 -> stack
861     let t5 = y4 & u7;
862     let t6 = t5 ^ t2;
863     let t18 = t6 ^ t16;
864     let t22 = t18 ^ y19;
865     let y2 = y1 ^ u0;
866     let t10 = y2 & y7;
867     let t11 = t10 ^ t7;
868     let t20 = t11 ^ t16;
869     let t24 = t20 ^ y18;
870     let y5 = y1 ^ u6;
871     let t8 = y5 & y1;
872     let t9 = t8 ^ t7;
873     let t19 = t9 ^ t14;
874     let t23 = t19 ^ y21;
875     let y3 = y5 ^ y8;
876     // y6 <- stack
877     let t3 = y3 & y6;
878     let t4 = t3 ^ t2;
879     // y20 <- stack
880     let t17 = t4 ^ y20;
881     let t21 = t17 ^ t14;
882     let t26 = t21 & t23;
883     let t27 = t24 ^ t26;
884     let t31 = t22 ^ t26;
885     let t25 = t21 ^ t22;
886     // y4 -> stack
887     let t28 = t25 & t27;
888     let t29 = t28 ^ t22;
889     let z14 = t29 & y2;
890     let z5 = t29 & y7;
891     let t30 = t23 ^ t24;
892     let t32 = t31 & t30;
893     let t33 = t32 ^ t24;
894     let t35 = t27 ^ t33;
895     let t36 = t24 & t35;
896     let t38 = t27 ^ t36;
897     let t39 = t29 & t38;
898     let t40 = t25 ^ t39;
899     let t43 = t29 ^ t40;
900     // y16 <- stack
901     let z3 = t43 & y16;
902     let tc12 = z3 ^ z5;
903     // tc12 -> stack
904     // y13 <- stack
905     let z12 = t43 & y13;
906     let z13 = t40 & y5;
907     let z4 = t40 & y1;
908     let tc6 = z3 ^ z4;
909     let t34 = t23 ^ t33;
910     let t37 = t36 ^ t34;
911     let t41 = t40 ^ t37;
912     // y10 <- stack
913     let z8 = t41 & y10;
914     let z17 = t41 & y8;
915     let t44 = t33 ^ t37;
916     // y15 <- stack
917     let z0 = t44 & y15;
918     // z17 -> stack
919     // y12 <- stack
920     let z9 = t44 & y12;
921     let z10 = t37 & y3;
922     let z1 = t37 & y6;
923     let tc5 = z1 ^ z0;
924     let tc11 = tc6 ^ tc5;
925     // y4 <- stack
926     let z11 = t33 & y4;
927     let t42 = t29 ^ t33;
928     let t45 = t42 ^ t41;
929     // y17 <- stack
930     let z7 = t45 & y17;
931     let tc8 = z7 ^ tc6;
932     // y14 <- stack
933     let z16 = t45 & y14;
934     // y11 <- stack
935     let z6 = t42 & y11;
936     let tc16 = z6 ^ tc8;
937     // z14 -> stack
938     // y9 <- stack
939     let z15 = t42 & y9;
940     let tc20 = z15 ^ tc16;
941     let tc1 = z15 ^ z16;
942     let tc2 = z10 ^ tc1;
943     let tc21 = tc2 ^ z11;
944     let tc3 = z9 ^ tc2;
945     let s0 = tc3 ^ tc16;
946     let s3 = tc3 ^ tc11;
947     let s1 = s3 ^ tc16;
948     let tc13 = z13 ^ tc1;
949     // u7 <- stack
950     let z2 = t33 & u7;
951     let tc4 = z0 ^ z2;
952     let tc7 = z12 ^ tc4;
953     let tc9 = z8 ^ tc7;
954     let tc10 = tc8 ^ tc9;
955     // z14 <- stack
956     let tc17 = z14 ^ tc10;
957     let s5 = tc21 ^ tc17;
958     let tc26 = tc17 ^ tc20;
959     // z17 <- stack
960     let s2 = tc26 ^ z17;
961     // tc12 <- stack
962     let tc14 = tc4 ^ tc12;
963     let tc18 = tc13 ^ tc14;
964     let s6 = tc10 ^ tc18;
965     let s7 = z12 ^ tc18;
966     let s4 = tc14 ^ s3;
967 
968     state[0] = s7;
969     state[1] = s6;
970     state[2] = s5;
971     state[3] = s4;
972     state[4] = s3;
973     state[5] = s2;
974     state[6] = s1;
975     state[7] = s0;
976 }
977 
978 /// NOT operations that are omitted in S-box
979 #[inline]
fn sub_bytes_nots(state: &mut [u64]) {
981     debug_assert_eq!(state.len(), 8);
982     state[0] ^= 0xffffffffffffffff;
983     state[1] ^= 0xffffffffffffffff;
984     state[5] ^= 0xffffffffffffffff;
985     state[6] ^= 0xffffffffffffffff;
986 }
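
// A small property check (editor's addition, not from the original source): per the
// comments above, `inv_sub_bytes` is a true inverse of `sub_bytes`, and the NOT mask
// factored out into `sub_bytes_nots` is its own inverse.
#[cfg(test)]
mod sbox_inverse_tests {
    use super::*;

    #[test]
    fn inv_sub_bytes_inverts_sub_bytes() {
        let original: [u64; 8] = [
            0x0123456789abcdef, 0xfedcba9876543210, 0x00ff00ff00ff00ff, 0xff00ff00ff00ff00,
            0x0f0f0f0f0f0f0f0f, 0xf0f0f0f0f0f0f0f0, 0x5555555555555555, 0xaaaaaaaaaaaaaaaa,
        ];

        let mut state = original;
        sub_bytes(&mut state);
        inv_sub_bytes(&mut state);
        assert_eq!(state, original);
    }

    #[test]
    fn sub_bytes_nots_is_an_involution() {
        let original: [u64; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
        let mut state = original;
        sub_bytes_nots(&mut state);
        sub_bytes_nots(&mut state);
        assert_eq!(state, original);
    }
}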
987 
988 /// Computation of the MixColumns transformation in the fixsliced representation, with different
989 /// rotations used according to the round number mod 4.
990 ///
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
992 macro_rules! define_mix_columns {
993     (
994         $name:ident,
995         $name_inv:ident,
996         $first_rotate:path,
997         $second_rotate:path
998     ) => {
999         #[rustfmt::skip]
1000         fn $name(state: &mut State) {
1001             let (a0, a1, a2, a3, a4, a5, a6, a7) = (
1002                 state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
1003             );
1004             let (b0, b1, b2, b3, b4, b5, b6, b7) = (
1005                 $first_rotate(a0),
1006                 $first_rotate(a1),
1007                 $first_rotate(a2),
1008                 $first_rotate(a3),
1009                 $first_rotate(a4),
1010                 $first_rotate(a5),
1011                 $first_rotate(a6),
1012                 $first_rotate(a7),
1013             );
1014             let (c0, c1, c2, c3, c4, c5, c6, c7) = (
1015                 a0 ^ b0,
1016                 a1 ^ b1,
1017                 a2 ^ b2,
1018                 a3 ^ b3,
1019                 a4 ^ b4,
1020                 a5 ^ b5,
1021                 a6 ^ b6,
1022                 a7 ^ b7,
1023             );
1024             state[0] = b0      ^ c7 ^ $second_rotate(c0);
1025             state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
1026             state[2] = b2 ^ c1      ^ $second_rotate(c2);
1027             state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
1028             state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
1029             state[5] = b5 ^ c4      ^ $second_rotate(c5);
1030             state[6] = b6 ^ c5      ^ $second_rotate(c6);
1031             state[7] = b7 ^ c6      ^ $second_rotate(c7);
1032         }
1033 
1034         #[rustfmt::skip]
1035         fn $name_inv(state: &mut State) {
1036             let (a0, a1, a2, a3, a4, a5, a6, a7) = (
1037                 state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
1038             );
1039             let (b0, b1, b2, b3, b4, b5, b6, b7) = (
1040                 $first_rotate(a0),
1041                 $first_rotate(a1),
1042                 $first_rotate(a2),
1043                 $first_rotate(a3),
1044                 $first_rotate(a4),
1045                 $first_rotate(a5),
1046                 $first_rotate(a6),
1047                 $first_rotate(a7),
1048             );
1049             let (c0, c1, c2, c3, c4, c5, c6, c7) = (
1050                 a0 ^ b0,
1051                 a1 ^ b1,
1052                 a2 ^ b2,
1053                 a3 ^ b3,
1054                 a4 ^ b4,
1055                 a5 ^ b5,
1056                 a6 ^ b6,
1057                 a7 ^ b7,
1058             );
1059             let (d0, d1, d2, d3, d4, d5, d6, d7) = (
1060                 a0      ^ c7,
1061                 a1 ^ c0 ^ c7,
1062                 a2 ^ c1,
1063                 a3 ^ c2 ^ c7,
1064                 a4 ^ c3 ^ c7,
1065                 a5 ^ c4,
1066                 a6 ^ c5,
1067                 a7 ^ c6,
1068             );
1069             let (e0, e1, e2, e3, e4, e5, e6, e7) = (
1070                 c0      ^ d6,
1071                 c1      ^ d6 ^ d7,
1072                 c2 ^ d0      ^ d7,
1073                 c3 ^ d1 ^ d6,
1074                 c4 ^ d2 ^ d6 ^ d7,
1075                 c5 ^ d3      ^ d7,
1076                 c6 ^ d4,
1077                 c7 ^ d5,
1078             );
1079             state[0] = d0 ^ e0 ^ $second_rotate(e0);
1080             state[1] = d1 ^ e1 ^ $second_rotate(e1);
1081             state[2] = d2 ^ e2 ^ $second_rotate(e2);
1082             state[3] = d3 ^ e3 ^ $second_rotate(e3);
1083             state[4] = d4 ^ e4 ^ $second_rotate(e4);
1084             state[5] = d5 ^ e5 ^ $second_rotate(e5);
1085             state[6] = d6 ^ e6 ^ $second_rotate(e6);
1086             state[7] = d7 ^ e7 ^ $second_rotate(e7);
1087         }
1088     }
1089 }
1090 
1091 define_mix_columns!(
1092     mix_columns_0,
1093     inv_mix_columns_0,
1094     rotate_rows_1,
1095     rotate_rows_2
1096 );
1097 
1098 define_mix_columns!(
1099     mix_columns_1,
1100     inv_mix_columns_1,
1101     rotate_rows_and_columns_1_1,
1102     rotate_rows_and_columns_2_2
1103 );
1104 
1105 #[cfg(not(feature = "compact"))]
1106 define_mix_columns!(
1107     mix_columns_2,
1108     inv_mix_columns_2,
1109     rotate_rows_and_columns_1_2,
1110     rotate_rows_2
1111 );
1112 
1113 #[cfg(not(feature = "compact"))]
1114 define_mix_columns!(
1115     mix_columns_3,
1116     inv_mix_columns_3,
1117     rotate_rows_and_columns_1_3,
1118     rotate_rows_and_columns_2_2
1119 );
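
// A small property check (editor's addition, not from the original source): each
// macro invocation above generates a forward/inverse MixColumns pair for one round
// position, so applying the inverse right after the forward transform must restore
// the state. Only the pairs that are compiled unconditionally are exercised here.
#[cfg(test)]
mod mix_columns_inverse_tests {
    use super::*;

    #[test]
    fn mix_columns_pairs_are_inverses() {
        let original: State = [
            0x0123456789abcdef, 0x13579bdf02468ace, 0x0f1e2d3c4b5a6978, 0x1111222233334444,
            0x5555666677778888, 0x9999aaaabbbbcccc, 0xddddeeeeffff0000, 0xdeadbeefcafef00d,
        ];

        let mut state = original;
        mix_columns_0(&mut state);
        inv_mix_columns_0(&mut state);
        assert_eq!(state, original);

        let mut state = original;
        mix_columns_1(&mut state);
        inv_mix_columns_1(&mut state);
        assert_eq!(state, original);
    }
}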
1120 
1121 #[inline]
fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
1123     let t = (*a ^ ((*a) >> shift)) & mask;
1124     *a ^= t ^ (t << shift);
1125 }
1126 
1127 #[inline]
fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
1129     let t = (*a ^ ((*b) >> shift)) & mask;
1130     *a ^= t;
1131     *b ^= t << shift;
1132 }
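
// A tiny demonstration (editor's addition, not from the original source) of the
// delta-swap primitive that the bit permutations below are built from: for every
// set bit `i` in `mask`, bit `i` is exchanged with bit `i + shift`, and repeating
// the same swap undoes it.
#[cfg(test)]
mod delta_swap_tests {
    use super::*;

    #[test]
    fn delta_swap_1_swaps_masked_bit_groups() {
        // Swap the two nibbles of the low byte (shift = 4, mask selects bits 0..4).
        let mut x = 0x00000000000000f0u64;
        delta_swap_1(&mut x, 4, 0x0f);
        assert_eq!(x, 0x000000000000000f);

        // Delta swaps are involutions: applying the same swap again restores `x`.
        delta_swap_1(&mut x, 4, 0x0f);
        assert_eq!(x, 0x00000000000000f0);
    }

    #[test]
    fn delta_swap_2_exchanges_bits_between_two_words() {
        // Exchange the low byte of `a` with the second byte of `b`.
        let mut a = 0x0000000000000000u64;
        let mut b = 0x000000000000ab00u64;
        delta_swap_2(&mut a, &mut b, 8, 0x00ff);
        assert_eq!(a, 0x00000000000000ab);
        assert_eq!(b, 0x0000000000000000);
    }
}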
1133 
1134 /// Applies ShiftRows once on an AES state (or key).
1135 #[cfg(any(not(feature = "compact"), feature = "hazmat"))]
1136 #[inline]
fn shift_rows_1(state: &mut [u64]) {
1138     debug_assert_eq!(state.len(), 8);
1139     for x in state.iter_mut() {
1140         delta_swap_1(x, 8, 0x00f000ff000f0000);
1141         delta_swap_1(x, 4, 0x0f0f00000f0f0000);
1142     }
1143 }
1144 
1145 /// Applies ShiftRows twice on an AES state (or key).
1146 #[inline]
fn shift_rows_2(state: &mut [u64]) {
1148     debug_assert_eq!(state.len(), 8);
1149     for x in state.iter_mut() {
1150         delta_swap_1(x, 8, 0x00ff000000ff0000);
1151     }
1152 }
1153 
1154 /// Applies ShiftRows three times on an AES state (or key).
1155 #[inline]
fn shift_rows_3(state: &mut [u64]) {
1157     debug_assert_eq!(state.len(), 8);
1158     for x in state.iter_mut() {
1159         delta_swap_1(x, 8, 0x000f00ff00f00000);
1160         delta_swap_1(x, 4, 0x0f0f00000f0f0000);
1161     }
1162 }
1163 
1164 #[inline(always)]
fn inv_shift_rows_1(state: &mut [u64]) {
1166     shift_rows_3(state);
1167 }
1168 
1169 #[inline(always)]
fn inv_shift_rows_2(state: &mut [u64]) {
1171     shift_rows_2(state);
1172 }
1173 
1174 #[cfg(not(feature = "compact"))]
1175 #[inline(always)]
fn inv_shift_rows_3(state: &mut [u64]) {
1177     shift_rows_1(state);
1178 }
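
// A small property check (editor's addition, not from the original source): the
// double ShiftRows used above is its own inverse (ShiftRows has order four), which
// is why `inv_shift_rows_2` simply calls `shift_rows_2` again.
#[cfg(test)]
mod shift_rows_tests {
    use super::*;

    #[test]
    fn shift_rows_2_is_an_involution() {
        let original: [u64; 8] = [
            0x0123456789abcdef, 0x02468ace13579bdf, 0x00ff00ff00ff00ff, 0x0f0f0f0ff0f0f0f0,
            0x1122334455667788, 0x99aabbccddeeff00, 0xa5a5a5a5a5a5a5a5, 0x5a5a5a5a5a5a5a5a,
        ];

        let mut state = original;
        shift_rows_2(&mut state);
        inv_shift_rows_2(&mut state);
        assert_eq!(state, original);
    }
}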
1179 
1180 /// XOR the columns after the S-box during the key schedule round function.
1181 ///
1182 /// The `idx_xor` parameter refers to the index of the previous round key that is
1183 /// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
1184 /// respectively).
1185 ///
1186 /// The `idx_ror` parameter refers to the rotation value, which varies between the
1187 /// different key schedules.
fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
1189     for i in 0..8 {
1190         let off_i = offset + i;
1191         let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
1192         rkeys[off_i] = rk
1193             ^ (0xfff0fff0fff0fff0 & (rk << 4))
1194             ^ (0xff00ff00ff00ff00 & (rk << 8))
1195             ^ (0xf000f000f000f000 & (rk << 12));
1196     }
1197 }
1198 
1199 /// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
1201     debug_assert_eq!(output.len(), 8);
1202     debug_assert_eq!(input0.len(), 16);
1203     debug_assert_eq!(input1.len(), 16);
1204     debug_assert_eq!(input2.len(), 16);
1205     debug_assert_eq!(input3.len(), 16);
1206 
1207     // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
1208     // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
1209     // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
1210     //     b1 b0 c1 c0 r1 r0 p2 p1 p0
1211     //
1212     // The desired bitsliced data groups first by bit position, then row, column, block:
1213     //     p2 p1 p0 r1 r0 c1 c0 b1 b0
1214 
1215     #[rustfmt::skip]
1216     fn read_reordered(input: &[u8]) -> u64 {
1217         (u64::from(input[0x0])        ) |
1218         (u64::from(input[0x1]) << 0x10) |
1219         (u64::from(input[0x2]) << 0x20) |
1220         (u64::from(input[0x3]) << 0x30) |
1221         (u64::from(input[0x8]) << 0x08) |
1222         (u64::from(input[0x9]) << 0x18) |
1223         (u64::from(input[0xa]) << 0x28) |
1224         (u64::from(input[0xb]) << 0x38)
1225     }
1226 
1227     // Reorder each block's bytes on input
1228     //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
1229     // Reorder by relabeling (note the order of input)
1230     //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
1231     let mut t0 = read_reordered(&input0[0x00..0x0c]);
1232     let mut t4 = read_reordered(&input0[0x04..0x10]);
1233     let mut t1 = read_reordered(&input1[0x00..0x0c]);
1234     let mut t5 = read_reordered(&input1[0x04..0x10]);
1235     let mut t2 = read_reordered(&input2[0x00..0x0c]);
1236     let mut t6 = read_reordered(&input2[0x04..0x10]);
1237     let mut t3 = read_reordered(&input3[0x00..0x0c]);
1238     let mut t7 = read_reordered(&input3[0x04..0x10]);
1239 
1240     // Bit Index Swap 6 <-> 0:
1241     //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
1242     let m0 = 0x5555555555555555;
1243     delta_swap_2(&mut t1, &mut t0, 1, m0);
1244     delta_swap_2(&mut t3, &mut t2, 1, m0);
1245     delta_swap_2(&mut t5, &mut t4, 1, m0);
1246     delta_swap_2(&mut t7, &mut t6, 1, m0);
1247 
1248     // Bit Index Swap 7 <-> 1:
1249     //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
1250     let m1 = 0x3333333333333333;
1251     delta_swap_2(&mut t2, &mut t0, 2, m1);
1252     delta_swap_2(&mut t3, &mut t1, 2, m1);
1253     delta_swap_2(&mut t6, &mut t4, 2, m1);
1254     delta_swap_2(&mut t7, &mut t5, 2, m1);
1255 
1256     // Bit Index Swap 8 <-> 2:
1257     //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
1258     let m2 = 0x0f0f0f0f0f0f0f0f;
1259     delta_swap_2(&mut t4, &mut t0, 4, m2);
1260     delta_swap_2(&mut t5, &mut t1, 4, m2);
1261     delta_swap_2(&mut t6, &mut t2, 4, m2);
1262     delta_swap_2(&mut t7, &mut t3, 4, m2);
1263 
1264     // Final bitsliced bit index, as desired:
1265     //     p2 p1 p0 r1 r0 c1 c0 b1 b0
1266     output[0] = t0;
1267     output[1] = t1;
1268     output[2] = t2;
1269     output[3] = t3;
1270     output[4] = t4;
1271     output[5] = t5;
1272     output[6] = t6;
1273     output[7] = t7;
1274 }
1275 
1276 /// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
fn inv_bitslice(input: &[u64], output: &mut [Block]) {
1278     debug_assert_eq!(input.len(), 8);
1279     debug_assert_eq!(output.len(), 4);
1280 
1281     // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
1282     // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
1283     // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
1284     //     b1 b0 c1 c0 r1 r0 p2 p1 p0
1285     //
1286     // The initially bitsliced data groups first by bit position, then row, column, block:
1287     //     p2 p1 p0 r1 r0 c1 c0 b1 b0
1288 
1289     let mut t0 = input[0];
1290     let mut t1 = input[1];
1291     let mut t2 = input[2];
1292     let mut t3 = input[3];
1293     let mut t4 = input[4];
1294     let mut t5 = input[5];
1295     let mut t6 = input[6];
1296     let mut t7 = input[7];
1297 
1298     // TODO: these bit index swaps are identical to those in 'packing'
1299 
1300     // Bit Index Swap 6 <-> 0:
1301     //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
1302     let m0 = 0x5555555555555555;
1303     delta_swap_2(&mut t1, &mut t0, 1, m0);
1304     delta_swap_2(&mut t3, &mut t2, 1, m0);
1305     delta_swap_2(&mut t5, &mut t4, 1, m0);
1306     delta_swap_2(&mut t7, &mut t6, 1, m0);
1307 
1308     // Bit Index Swap 7 <-> 1:
1309     //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
1310     let m1 = 0x3333333333333333;
1311     delta_swap_2(&mut t2, &mut t0, 2, m1);
1312     delta_swap_2(&mut t3, &mut t1, 2, m1);
1313     delta_swap_2(&mut t6, &mut t4, 2, m1);
1314     delta_swap_2(&mut t7, &mut t5, 2, m1);
1315 
1316     // Bit Index Swap 8 <-> 2:
1317     //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
1318     let m2 = 0x0f0f0f0f0f0f0f0f;
1319     delta_swap_2(&mut t4, &mut t0, 4, m2);
1320     delta_swap_2(&mut t5, &mut t1, 4, m2);
1321     delta_swap_2(&mut t6, &mut t2, 4, m2);
1322     delta_swap_2(&mut t7, &mut t3, 4, m2);
1323 
1324     #[rustfmt::skip]
1325     fn write_reordered(columns: u64, output: &mut [u8]) {
1326         output[0x0] = (columns        ) as u8;
1327         output[0x1] = (columns >> 0x10) as u8;
1328         output[0x2] = (columns >> 0x20) as u8;
1329         output[0x3] = (columns >> 0x30) as u8;
1330         output[0x8] = (columns >> 0x08) as u8;
1331         output[0x9] = (columns >> 0x18) as u8;
1332         output[0xa] = (columns >> 0x28) as u8;
1333         output[0xb] = (columns >> 0x38) as u8;
1334     }
1335 
1336     // Reorder by relabeling (note the order of output)
1337     //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
1338     // Reorder each block's bytes on output
1339     //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
1340     write_reordered(t0, &mut output[0][0x00..0x0c]);
1341     write_reordered(t4, &mut output[0][0x04..0x10]);
1342     write_reordered(t1, &mut output[1][0x00..0x0c]);
1343     write_reordered(t5, &mut output[1][0x04..0x10]);
1344     write_reordered(t2, &mut output[2][0x00..0x0c]);
1345     write_reordered(t6, &mut output[2][0x04..0x10]);
1346     write_reordered(t3, &mut output[3][0x00..0x0c]);
1347     write_reordered(t7, &mut output[3][0x04..0x10]);
1348 
1349     // Final AES bit index, as desired:
1350     //     b1 b0 c1 c0 r1 r0 p2 p1 p0
1351 }
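
// A round-trip check (editor's addition, not from the original source) for the bit
// packing described above: bitslicing four distinct blocks and immediately
// un-bitslicing them must reproduce the original bytes.
#[cfg(test)]
mod bitslice_roundtrip_tests {
    use super::*;

    #[test]
    fn bitslice_then_inv_bitslice_is_identity() {
        let mut blocks = [Block::default(); 4];
        for (i, block) in blocks.iter_mut().enumerate() {
            for (j, byte) in block.iter_mut().enumerate() {
                // Give every byte of every block a distinct, recognizable value.
                *byte = (16 * i + j) as u8;
            }
        }
        let expected = blocks;

        let mut state = State::default();
        bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
        inv_bitslice(&state, &mut blocks);

        assert_eq!(blocks, expected);
    }
}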
1352 
/// Copy one 8-word (64-byte) round key slot within the provided slice forward to the next slot
fn memshift32(buffer: &mut [u64], src_offset: usize) {
1355     debug_assert_eq!(src_offset % 8, 0);
1356 
1357     let dst_offset = src_offset + 8;
1358     debug_assert!(dst_offset + 8 <= buffer.len());
1359 
1360     for i in (0..8).rev() {
1361         buffer[dst_offset + i] = buffer[src_offset + i];
1362     }
1363 }
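
// A minimal check (editor's addition, not from the original source) of the helper
// above: it copies one 8-word round-key slot forward into the next slot.
#[cfg(test)]
mod memshift_tests {
    use super::*;

    #[test]
    fn memshift32_copies_one_slot_forward() {
        let mut buf = [0u64; 16];
        for (i, word) in buf.iter_mut().take(8).enumerate() {
            *word = (i + 1) as u64;
        }

        memshift32(&mut buf, 0);

        assert_eq!(buf[..8], buf[8..16]);
    }
}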
1364 
1365 /// XOR the round key to the internal state. The round keys are expected to be
1366 /// pre-computed and to be packed in the fixsliced representation.
1367 #[inline]
fn add_round_key(state: &mut State, rkey: &[u64]) {
1369     debug_assert_eq!(rkey.len(), 8);
1370     for (a, b) in state.iter_mut().zip(rkey) {
1371         *a ^= b;
1372     }
1373 }
1374 
1375 #[inline(always)]
fn add_round_constant_bit(state: &mut [u64], bit: usize) {
1377     state[bit] ^= 0x00000000f0000000;
1378 }
1379 
1380 #[inline(always)]
fn ror(x: u64, y: u32) -> u64 {
1382     x.rotate_right(y)
1383 }
1384 
1385 #[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
1387     (rows << 4) + (cols << 2)
1388 }
1389 
1390 #[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
1392     ror(x, ror_distance(1, 0))
1393 }
1394 
1395 #[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
1397     ror(x, ror_distance(2, 0))
1398 }
1399 
1400 #[inline(always)]
1401 #[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
1403     (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
1404     (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
1405 }
1406 
1407 #[cfg(not(feature = "compact"))]
1408 #[inline(always)]
1409 #[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
1411     (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
1412     (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
1413 }
1414 
1415 #[cfg(not(feature = "compact"))]
1416 #[inline(always)]
1417 #[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
1419     (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
1420     (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
1421 }
1422 
1423 #[inline(always)]
1424 #[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
1426     (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
1427     (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
1428 }
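
// A couple of arithmetic sanity checks (editor's addition, not from the original
// source) for the rotation helpers above: a "row" unit is 16 bits and a "column"
// unit is 4 bits, and rotating by two rows twice is a full 64-bit rotation.
#[cfg(test)]
mod rotate_helper_tests {
    use super::*;

    #[test]
    fn ror_distance_units() {
        assert_eq!(ror_distance(1, 0), 16);
        assert_eq!(ror_distance(0, 1), 4);
        assert_eq!(ror_distance(1, 3), 28);
    }

    #[test]
    fn rotate_rows_2_twice_is_identity() {
        let x = 0x0123456789abcdefu64;
        assert_eq!(rotate_rows_2(rotate_rows_2(x)), x);
    }
}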
1429 
1430 /// Low-level "hazmat" AES functions.
1431 ///
1432 /// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
1433 /// implementations in this crate, but instead provides raw access to
1434 /// the AES round function gated under the `hazmat` crate feature.
1435 #[cfg(feature = "hazmat")]
1436 pub(crate) mod hazmat {
1437     use super::{
1438         bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
1439         shift_rows_1, sub_bytes, sub_bytes_nots, State,
1440     };
1441     use crate::{Block, ParBlocks};
1442 
1443     /// XOR the `src` block into the `dst` block in-place.
    fn xor_in_place(dst: &mut Block, src: &Block) {
1445         for (a, b) in dst.iter_mut().zip(src.as_slice()) {
1446             *a ^= *b;
1447         }
1448     }
1449 
1450     /// Perform a bitslice operation, loading a single block.
    fn bitslice_block(block: &Block) -> State {
1452         let mut state = State::default();
1453         bitslice(&mut state, block, block, block, block);
1454         state
1455     }
1456 
1457     /// Perform an inverse bitslice operation, extracting a single block.
    fn inv_bitslice_block(block: &mut Block, state: &State) {
1459         let mut out = [Block::default(); 4];
1460         inv_bitslice(state, &mut out);
1461         block.copy_from_slice(&out[0]);
1462     }
1463 
1464     /// AES cipher (encrypt) round function.
1465     #[inline]
    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
1467         let mut state = bitslice_block(block);
1468         sub_bytes(&mut state);
1469         sub_bytes_nots(&mut state);
1470         shift_rows_1(&mut state);
1471         mix_columns_0(&mut state);
1472         inv_bitslice_block(block, &state);
1473         xor_in_place(block, round_key);
1474     }
1475 
1476     /// AES cipher (encrypt) round function: parallel version.
1477     #[inline]
    pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) {
1479         for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
1480             let mut state = State::default();
1481             bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
1482             sub_bytes(&mut state);
1483             sub_bytes_nots(&mut state);
1484             shift_rows_1(&mut state);
1485             mix_columns_0(&mut state);
1486             inv_bitslice(&state, chunk);
1487 
1488             for i in 0..4 {
1489                 xor_in_place(&mut chunk[i], &keys[i]);
1490             }
1491         }
1492     }
1493 
    /// AES equivalent inverse cipher (decrypt) round function.
1495     #[inline]
    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
1497         let mut state = State::default();
1498         bitslice(&mut state, &block, &block, &block, &block);
1499         sub_bytes_nots(&mut state);
1500         inv_sub_bytes(&mut state);
1501         inv_shift_rows_1(&mut state);
1502         inv_mix_columns_0(&mut state);
1503         inv_bitslice_block(block, &state);
1504         xor_in_place(block, round_key);
1505     }
1506 
    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
1508     #[inline]
    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) {
1510         for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
1511             let mut state = State::default();
1512             bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
1513             sub_bytes_nots(&mut state);
1514             inv_sub_bytes(&mut state);
1515             inv_shift_rows_1(&mut state);
1516             inv_mix_columns_0(&mut state);
1517             inv_bitslice(&state, chunk);
1518 
1519             for i in 0..4 {
1520                 xor_in_place(&mut chunk[i], &keys[i]);
1521             }
1522         }
1523     }
1524 
1525     /// AES mix columns function.
1526     #[inline]
    pub(crate) fn mix_columns(block: &mut Block) {
1528         let mut state = bitslice_block(block);
1529         mix_columns_0(&mut state);
1530         inv_bitslice_block(block, &state);
1531     }
1532 
1533     /// AES inverse mix columns function.
1534     #[inline]
    pub(crate) fn inv_mix_columns(block: &mut Block) {
1536         let mut state = bitslice_block(block);
1537         inv_mix_columns_0(&mut state);
1538         inv_bitslice_block(block, &state);
1539     }
1540 }