//! Fixsliced implementations of AES-128, AES-192 and AES-256
//! adapted from the C implementation.
//!
//! All implementations are fully bitsliced and do not rely on any
//! Look-Up Table (LUT).
//!
//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
//!
//! # Author (original C code)
//!
//! Alexandre Adomnicai, Nanyang Technological University, Singapore
//! <alexandre.adomnicai@ntu.edu.sg>
//!
//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.
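//!
//! As a rough sketch of how these crate-internal APIs fit together (the
//! all-zero key and blocks are just placeholders; these functions are
//! `pub(crate)`, so this is not a public doctest):
//!
//! ```ignore
//! let key = cipher::generic_array::GenericArray::default(); // 16-byte AES-128 key
//! let rkeys = aes128_key_schedule(&key);
//!
//! // This implementation always processes a batch of four blocks in parallel.
//! let mut blocks = [crate::Block::default(); 4];
//! aes128_encrypt(&rkeys, &mut blocks);
//! aes128_decrypt(&rkeys, &mut blocks);
//! ```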

use crate::Block;
use cipher::{
    consts::{U16, U24, U32},
    generic_array::GenericArray,
};

/// AES block batch size for this implementation
pub(crate) const FIXSLICE_BLOCKS: usize = 4;

/// AES-128 round keys
pub(crate) type FixsliceKeys128 = [u64; 88];

/// AES-192 round keys
pub(crate) type FixsliceKeys192 = [u64; 104];

/// AES-256 round keys
pub(crate) type FixsliceKeys256 = [u64; 120];

/// 512-bit internal state
type State = [u64; 8];

/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
    // TODO(tarcieri): use `::default()` after MSRV 1.47+
    let mut rkeys = [0u64; 88];

    bitslice(&mut rkeys[..8], key, key, key, key);

    let mut rk_off = 0;
    for rcon in 0..10 {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

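        // The round constants 0x1B (round 9) and 0x36 (round 10) have more
        // than one bit set, so each active bit plane is XORed in individually.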
        if rcon < 8 {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        } else {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
        }

        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "semi_fixslice")]
    {
        for i in (8..88).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "semi_fixslice"))]
    {
        for i in (8..72).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[72..80]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..11 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
    // TODO(tarcieri): use `::default()` after MSRV 1.47+
    let mut rkeys = [0u64; 104];
    let mut tmp = [0u64; 8];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        for i in 0..8 {
            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (ui << 8));
            ti ^= 0x000f000f000f000f & (ui >> 12);
            tmp[i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rk_off += 8;

        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x0f000f000f000f00 & (ui >> 4);
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "semi_fixslice")]
    {
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "semi_fixslice"))]
    {
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
    // TODO(tarcieri): use `::default()` after MSRV 1.47+
    let mut rkeys = [0u64; 120];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(
        &mut rkeys[8..16],
        &key[16..],
        &key[16..],
        &key[16..],
        &key[16..],
    );

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "semi_fixslice")]
    {
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "semi_fixslice"))]
    {
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "semi_fixslice"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
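    // Each loop iteration below undoes up to four rounds, one per
    // `inv_mix_columns_*` variant; with `semi_fixslice` only variants 1 and 0
    // are used, with an explicit double InvShiftRows at the top of each pass.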
    loop {
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}

/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
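    // Each loop iteration below applies up to four rounds, one per
    // `mix_columns_*` variant; with `semi_fixslice` only variants 1 and 0 are
    // used, with an explicit double ShiftRows after the first of the pair.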
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "semi_fixslice"))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&mut state, blocks);
}

/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}

/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&mut state, blocks);
}

/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "semi_fixslice"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}

/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "semi_fixslice"))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&mut state, blocks);
}

/// Bitsliced implementation of the inverse AES S-box.
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so
/// that it is a true inverse of 'sub_bytes'.
fn inv_sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// Bitsliced implementation of the AES S-box, based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
fn sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// NOT operations that are omitted in S-box
#[inline]
fn sub_bytes_nots(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    state[0] ^= 0xffffffffffffffff;
    state[1] ^= 0xffffffffffffffff;
    state[5] ^= 0xffffffffffffffff;
    state[6] ^= 0xffffffffffffffff;
}
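
// A minimal sanity-check sketch (not part of the original C port): composing
// `sub_bytes` with `sub_bytes_nots` yields the complete AES S-box, so an
// all-zero batch must map to 0x63 in every byte, since S-box(0x00) = 0x63.
#[cfg(test)]
mod sbox_tests {
    use super::*;

    #[test]
    fn sbox_of_zero_is_0x63() {
        let blocks: [Block; 4] = Default::default();
        let mut state = State::default();
        bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

        // With the NOTs re-applied, this is the full SubBytes transformation.
        sub_bytes(&mut state);
        sub_bytes_nots(&mut state);

        let mut out: [Block; 4] = Default::default();
        inv_bitslice(&mut state, &mut out);
        assert!(out.iter().all(|block| block.iter().all(|&byte| byte == 0x63)));
    }
}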

/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            state[0] = b0      ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1      ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4      ^ $second_rotate(c5);
            state[6] = b6 ^ c5      ^ $second_rotate(c6);
            state[7] = b7 ^ c6      ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0      ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0      ^ d6,
                c1      ^ d6 ^ d7,
                c2 ^ d0      ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3      ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}

define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

#[cfg(not(feature = "semi_fixslice"))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

#[cfg(not(feature = "semi_fixslice"))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);
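
// For round number r (counting from 1), `mix_columns_{r % 4}` and its inverse
// are the variants applied above; the `semi_fixslice` build only needs
// variants 0 and 1 because it re-applies a double (Inv)ShiftRows every other
// round.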

/// Swap the bits of `a` selected by `mask` with the bits `shift` positions
/// above them (a "delta swap"), in place.
#[inline]
fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

/// Swap the bits of `a` selected by `mask` with the bits `shift` positions
/// above them in `b` (a delta swap across two words).
#[inline]
fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*b) >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}

/// Applies ShiftRows once on an AES state (or key).
#[cfg(not(feature = "semi_fixslice"))]
#[inline]
fn shift_rows_1(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00f000ff000f0000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

/// Applies ShiftRows twice on an AES state (or key).
#[inline]
fn shift_rows_2(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00ff000000ff0000);
    }
}

/// Applies ShiftRows three times on an AES state (or key).
#[inline]
fn shift_rows_3(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x000f00ff00f00000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

/// Applies the inverse of ShiftRows once; since ShiftRows has order 4, this is
/// the same as applying it three times.
#[inline(always)]
fn inv_shift_rows_1(state: &mut [u64]) {
    shift_rows_3(state);
}

/// Applies the inverse of ShiftRows twice; a double ShiftRows is its own inverse.
#[inline(always)]
fn inv_shift_rows_2(state: &mut [u64]) {
    shift_rows_2(state);
}

/// Applies the inverse of ShiftRows three times, i.e. a single ShiftRows.
#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u64]) {
    shift_rows_1(state);
}

/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
        rkeys[off_i] = rk
            ^ (0xfff0fff0fff0fff0 & (rk << 4))
            ^ (0xff00ff00ff00ff00 & (rk << 8))
            ^ (0xf000f000f000f000 & (rk << 12));
    }
}

/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);
    debug_assert_eq!(input2.len(), 16);
    debug_assert_eq!(input3.len(), 16);

    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
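    //
    // As a concrete example of this relabeling: the bit at block b=1, column
    // c=0, row r=0, position p=0 (the least-significant bit of byte 0 of the
    // second block) starts at index 0b010000000 and ends at index 0b000000001,
    // i.e. bit 1 of the first output word.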

    #[rustfmt::skip]
    fn read_reordered(input: &[u8]) -> u64 {
        (u64::from(input[0x0])        ) |
        (u64::from(input[0x1]) << 0x10) |
        (u64::from(input[0x2]) << 0x20) |
        (u64::from(input[0x3]) << 0x30) |
        (u64::from(input[0x8]) << 0x08) |
        (u64::from(input[0x9]) << 0x18) |
        (u64::from(input[0xa]) << 0x28) |
        (u64::from(input[0xb]) << 0x38)
    }

    // Reorder each block's bytes on input
    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
    // Reorder by relabeling (note the order of input)
    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
    let mut t0 = read_reordered(&input0[0x00..0x0c]);
    let mut t4 = read_reordered(&input0[0x04..0x10]);
    let mut t1 = read_reordered(&input1[0x00..0x0c]);
    let mut t5 = read_reordered(&input1[0x04..0x10]);
    let mut t2 = read_reordered(&input2[0x00..0x0c]);
    let mut t6 = read_reordered(&input2[0x04..0x10]);
    let mut t3 = read_reordered(&input3[0x00..0x0c]);
    let mut t7 = read_reordered(&input3[0x04..0x10]);

    // Bit Index Swap 6 <-> 0:
    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}

/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
    debug_assert_eq!(input.len(), 8);
    debug_assert_eq!(output.len(), 4);

    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in 'packing'

    // Bit Index Swap 6 <-> 0:
    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    #[rustfmt::skip]
    fn write_reordered(columns: u64, output: &mut [u8]) {
        output[0x0] = (columns        ) as u8;
        output[0x1] = (columns >> 0x10) as u8;
        output[0x2] = (columns >> 0x20) as u8;
        output[0x3] = (columns >> 0x30) as u8;
        output[0x8] = (columns >> 0x08) as u8;
        output[0x9] = (columns >> 0x18) as u8;
        output[0xa] = (columns >> 0x28) as u8;
        output[0xb] = (columns >> 0x38) as u8;
    }

    // Reorder by relabeling (note the order of output)
    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
    // Reorder each block's bytes on output
    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
    write_reordered(t0, &mut output[0][0x00..0x0c]);
    write_reordered(t4, &mut output[0][0x04..0x10]);
    write_reordered(t1, &mut output[1][0x00..0x0c]);
    write_reordered(t5, &mut output[1][0x04..0x10]);
    write_reordered(t2, &mut output[2][0x00..0x0c]);
    write_reordered(t6, &mut output[2][0x04..0x10]);
    write_reordered(t3, &mut output[3][0x00..0x0c]);
    write_reordered(t7, &mut output[3][0x04..0x10]);

    // Final AES bit index, as desired:
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
}

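// A round-trip sketch (not in the original port): `inv_bitslice` must undo
// `bitslice` exactly for any four blocks; a simple byte ramp makes a handy probe.
#[cfg(test)]
mod bitslice_tests {
    use super::*;

    #[test]
    fn bitslice_round_trip() {
        let mut blocks: [Block; 4] = Default::default();
        for (i, block) in blocks.iter_mut().enumerate() {
            for (j, byte) in block.iter_mut().enumerate() {
                *byte = (16 * i + j) as u8;
            }
        }
        let expected = blocks;

        let mut state = State::default();
        bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
        inv_bitslice(&mut state, &mut blocks);
        assert_eq!(blocks, expected);
    }
}
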
/// Copy 64 bytes (eight 64-bit words) within the provided slice forward by
/// eight words, i.e. shift a full round key into the next round-key slot.
fn memshift32(buffer: &mut [u64], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    for i in (0..8).rev() {
        buffer[dst_offset + i] = buffer[src_offset + i];
    }
}

/// XOR the round key to the internal state. The round keys are expected to be
/// pre-computed and to be packed in the fixsliced representation.
#[inline]
fn add_round_key(state: &mut State, rkey: &[u64]) {
    debug_assert_eq!(rkey.len(), 8);
    for (a, b) in state.iter_mut().zip(rkey) {
        *a ^= b;
    }
}

/// XOR a round-constant bit into bit plane `bit` of the state. The 4-bit mask
/// hits the same byte position in all four interleaved block copies of the
/// round key.
#[inline(always)]
fn add_round_constant_bit(state: &mut [u64], bit: usize) {
    state[bit] ^= 0x00000000f0000000;
}

#[inline(always)]
fn ror(x: u64, y: u32) -> u64 {
    x.rotate_right(y)
}

/// Rotation distance within a 64-bit slice: each row occupies 16 bits and each
/// column 4 bits, so rotating by `(rows << 4) + (cols << 2)` moves the state by
/// whole rows and columns.
#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 4) + (cols << 2)
}

#[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
}

#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
}

#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
}