1 //! Fixsliced implementations of AES-128, AES-192 and AES-256
2 //! adapted from the C implementation.
3 //!
4 //! All implementations are fully bitsliced and do not rely on any
5 //! Look-Up Table (LUT).
6 //!
7 //! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
8 //!
9 //! # Author (original C code)
10 //!
11 //! Alexandre Adomnicai, Nanyang Technological University, Singapore
12 //! <alexandre.adomnicai@ntu.edu.sg>
13 //!
14 //! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.
15
16 use crate::Block;
17 use cipher::{
18 consts::{U16, U24, U32},
19 generic_array::GenericArray,
20 };
21
/// AES block batch size for this implementation
pub(crate) const FIXSLICE_BLOCKS: usize = 4;

/// AES-128 round keys
///
/// 11 round keys × 8 bitsliced 64-bit words each = 88 words.
pub(crate) type FixsliceKeys128 = [u64; 88];

/// AES-192 round keys
///
/// 13 round keys × 8 bitsliced 64-bit words each = 104 words.
pub(crate) type FixsliceKeys192 = [u64; 104];

/// AES-256 round keys
///
/// 15 round keys × 8 bitsliced 64-bit words each = 120 words.
pub(crate) type FixsliceKeys256 = [u64; 120];

/// 512-bit internal state: four 128-bit blocks bitsliced across 8 words,
/// one word per bit position (see `bitslice`).
type State = [u64; 8];
36
37 /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys12838 pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
39 // TODO(tarcieri): use `::default()` after MSRV 1.47+
40 let mut rkeys = [0u64; 88];
41
42 bitslice(&mut rkeys[..8], key, key, key, key);
43
44 let mut rk_off = 0;
45 for rcon in 0..10 {
46 memshift32(&mut rkeys, rk_off);
47 rk_off += 8;
48
49 sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
50 sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
51
52 if rcon < 8 {
53 add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
54 } else {
55 add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
56 add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
57 add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
58 add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
59 }
60
61 xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
62 }
63
64 // Adjust to match fixslicing format
65 #[cfg(feature = "semi_fixslice")]
66 {
67 for i in (8..88).step_by(16) {
68 inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
69 }
70 }
71 #[cfg(not(feature = "semi_fixslice"))]
72 {
73 for i in (8..72).step_by(32) {
74 inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
75 inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
76 inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
77 }
78 inv_shift_rows_1(&mut rkeys[72..80]);
79 }
80
81 // Account for NOTs removed from sub_bytes
82 for i in 1..11 {
83 sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
84 }
85
86 rkeys
87 }
88
/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
///
/// The 192-bit key does not align with the 128-bit round-key width, so the
/// expansion interleaves halves via masked shifts; `tmp` carries the
/// overlapping 64-bit portion between iterations. Produces 13 round keys.
pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
    // TODO(tarcieri): use `::default()` after MSRV 1.47+
    let mut rkeys = [0u64; 104];
    let mut tmp = [0u64; 8];

    // First 16 key bytes form round key 0; bytes 8..24 are staged in `tmp`.
    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        // Merge the high half of `tmp` with the low half of the previous
        // round key (byte-lane masks select the two halves).
        for i in 0..8 {
            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        // Fold the substituted word in and propagate across columns.
        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (ui << 8));
            ti ^= 0x000f000f000f000f & (ui >> 12);
            tmp[i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rk_off += 8;

        // 8 round constants consumed <=> all 13 round keys generated.
        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x0f000f000f000f00 & (ui >> 4);
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "semi_fixslice")]
    {
        // Semi-fixsliced: only every other round key needs the adjustment.
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "semi_fixslice"))]
    {
        // Fully fixsliced: round i needs InvShiftRows applied (i mod 4) times.
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}
194
/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
///
/// The 256-bit key supplies the first two round keys directly; each loop
/// iteration derives two more (one with RotWord+SubWord+Rcon, one with
/// SubWord only, per FIPS-197), for 15 round keys total.
pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
    // TODO(tarcieri): use `::default()` after MSRV 1.47+
    let mut rkeys = [0u64; 120];

    // Round keys 0 and 1 are the two 128-bit halves of the key.
    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(
        &mut rkeys[8..16],
        &key[16..],
        &key[16..],
        &key[16..],
        &key[16..],
    );

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        // Even round keys mix in the round constant and a rotated S-box word.
        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        // Odd round keys use SubWord without rotation or round constant.
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "semi_fixslice")]
    {
        // Semi-fixsliced: only every other round key needs the adjustment.
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "semi_fixslice"))]
    {
        // Fully fixsliced: round i needs InvShiftRows applied (i mod 4) times.
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}
266
/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
///
/// Rounds are unrolled four at a time, each using a different
/// `inv_mix_columns_*` variant so that InvShiftRows never has to run: it was
/// folded into the round keys by `aes128_key_schedule`.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Undo the final round (which has no MixColumns).
    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "semi_fixslice"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
    loop {
        // Semi-fixsliced keeps an explicit InvShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    // Final whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}
323
/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
///
/// Rounds are unrolled four at a time, each using a different
/// `mix_columns_*` variant so that ShiftRows never has to run: it was folded
/// into the round keys by `aes128_key_schedule`.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Initial whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        // Semi-fixsliced keeps an explicit ShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "semi_fixslice"))]
    {
        shift_rows_2(&mut state);
    }

    // Final round: SubBytes + AddRoundKey, no MixColumns.
    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&mut state, blocks);
}
380
/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
///
/// Same structure as [`aes128_decrypt`] but with 12 rounds (round keys end
/// at offset 96).
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Undo the final round (which has no MixColumns).
    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        // Semi-fixsliced keeps an explicit InvShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    // Final whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}
431
/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
///
/// Same structure as [`aes128_encrypt`] but with 12 rounds (round keys end
/// at offset 96).
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Initial whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        // Semi-fixsliced keeps an explicit ShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    // Final round: SubBytes + AddRoundKey, no MixColumns.
    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&mut state, blocks);
}
482
/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in-place and in parallel.
///
/// Same structure as [`aes128_decrypt`] but with 14 rounds (round keys end
/// at offset 112).
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Undo the final round (which has no MixColumns).
    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "semi_fixslice"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        // Semi-fixsliced keeps an explicit InvShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "semi_fixslice"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    // Final whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&mut state, blocks);
}
539
/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in-place and in parallel.
///
/// Same structure as [`aes128_encrypt`] but with 14 rounds (round keys end
/// at offset 112).
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    // Initial whitening with round key 0.
    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        // Semi-fixsliced keeps an explicit ShiftRows every other round.
        #[cfg(feature = "semi_fixslice")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(feature = "semi_fixslice"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "semi_fixslice"))]
    {
        shift_rows_2(&mut state);
    }

    // Final round: SubBytes + AddRoundKey, no MixColumns.
    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&mut state, blocks);
}
596
/// Bitsliced implementation of the inverse AES S-box, operating on all 64
/// byte positions of the four blocks at once (one boolean circuit evaluated
/// in parallel across the bits of each `u64`).
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true
/// inverse of 'sub_bytes'.
///
/// The straight-line form below is machine-scheduled; do not reorder
/// statements by hand — intermediate names (`t*`, `m*`, `p*`, `r*`) follow
/// the generated circuit, not human-readable semantics.
fn inv_sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    // state[0] holds the most significant bit slice (bit 7), state[7] the least.
    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    // Write outputs back in the same MSB-first slice ordering as the inputs.
    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}
802
/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule
/// (see `sub_bytes_nots`), saving four operations per S-box evaluation.
///
/// The straight-line form below is machine-scheduled; do not reorder
/// statements by hand — intermediate names (`y*`, `t*`, `z*`, `tc*`) follow
/// the published circuit, not human-readable semantics.
fn sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    // state[0] holds the most significant bit slice (bit 7), state[7] the least.
    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    // Write outputs back in the same MSB-first slice ordering as the inputs.
    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}
978
/// NOT operations that are omitted in S-box
///
/// Complements the four bit slices (0, 1, 5 and 6) whose NOT gates were
/// hoisted out of `sub_bytes` for efficiency.
#[inline]
fn sub_bytes_nots(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for &i in &[0usize, 1, 5, 6] {
        state[i] = !state[i];
    }
}
988
/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm.
///
/// Expands into a forward (`$name`) and inverse (`$name_inv`) MixColumns
/// whose row rotations ($first_rotate / $second_rotate) are pre-composed with
/// the ShiftRows offset of the round they serve — that is what lets the
/// fixsliced round functions skip (Inv)ShiftRows entirely.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            // a = input column bits; b = xtime-rotated copy; c = a ^ b.
            // The output rows implement multiplication by the MDS matrix
            // [2 3 1 1] over GF(2^8) in bitsliced form.
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            state[0] = b0      ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1      ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4      ^ $second_rotate(c5);
            state[6] = b6 ^ c5      ^ $second_rotate(c6);
            state[7] = b7 ^ c6      ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            // Inverse MixColumns: multiplication by [14 11 13 9], expressed
            // as the forward transform plus two extra XOR layers (d, e).
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0      ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0      ^ d6,
                c1      ^ d6 ^ d7,
                c2 ^ d0      ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3      ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}
1091
// MixColumns for rounds where the round number mod 4 == 0
// (state is in the canonical representation).
define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

// MixColumns for rounds where the round number mod 4 == 1
// (rotations composed with one ShiftRows offset).
define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

// MixColumns for rounds where the round number mod 4 == 2;
// only needed for the fully-fixsliced implementation.
#[cfg(not(feature = "semi_fixslice"))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

// MixColumns for rounds where the round number mod 4 == 3;
// only needed for the fully-fixsliced implementation.
#[cfg(not(feature = "semi_fixslice"))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);
1121
/// Swaps bit `i` with bit `i + shift` of `a`, for every set bit `i` in `mask`
/// (the classic masked delta-swap bit permutation).
#[inline]
fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
    let x = *a;
    let t = mask & (x ^ (x >> shift));
    *a = x ^ t ^ (t << shift);
}
1127
1128 #[inline]
delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64)1129 fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
1130 let t = (*a ^ ((*b) >> shift)) & mask;
1131 *a ^= t;
1132 *b ^= t << shift;
1133 }
1134
1135 /// Applies ShiftRows once on an AES state (or key).
1136 #[cfg(not(feature = "semi_fixslice"))]
1137 #[inline]
shift_rows_1(state: &mut [u64])1138 fn shift_rows_1(state: &mut [u64]) {
1139 debug_assert_eq!(state.len(), 8);
1140 for x in state.iter_mut() {
1141 delta_swap_1(x, 8, 0x00f000ff000f0000);
1142 delta_swap_1(x, 4, 0x0f0f00000f0f0000);
1143 }
1144 }
1145
1146 /// Applies ShiftRows twice on an AES state (or key).
1147 #[inline]
shift_rows_2(state: &mut [u64])1148 fn shift_rows_2(state: &mut [u64]) {
1149 debug_assert_eq!(state.len(), 8);
1150 for x in state.iter_mut() {
1151 delta_swap_1(x, 8, 0x00ff000000ff0000);
1152 }
1153 }
1154
1155 /// Applies ShiftRows three times on an AES state (or key).
1156 #[inline]
shift_rows_3(state: &mut [u64])1157 fn shift_rows_3(state: &mut [u64]) {
1158 debug_assert_eq!(state.len(), 8);
1159 for x in state.iter_mut() {
1160 delta_swap_1(x, 8, 0x000f00ff00f00000);
1161 delta_swap_1(x, 4, 0x0f0f00000f0f0000);
1162 }
1163 }
1164
/// Inverse of applying ShiftRows once: since ShiftRows has order 4,
/// this is the same as applying it three times.
#[inline(always)]
fn inv_shift_rows_1(state: &mut [u64]) {
    shift_rows_3(state);
}
1169
/// Inverse of applying ShiftRows twice: a double rotation is self-inverse,
/// so this is `shift_rows_2` itself.
#[inline(always)]
fn inv_shift_rows_2(state: &mut [u64]) {
    shift_rows_2(state);
}
1174
/// Inverse of applying ShiftRows three times: equivalent to applying it once.
#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u64]) {
    shift_rows_1(state);
}
1180
/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        // Combine the previous round key with the (rotated) S-box output of
        // the last column; the mask keeps only that column's nibble lanes.
        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
        // Fan the new first column out across the remaining three columns:
        // each masked shift XORs the running value into the next column,
        // reproducing w[j] = w[j-1] ^ w[j-4] for the whole round key at once.
        rkeys[off_i] = rk
            ^ (0xfff0fff0fff0fff0 & (rk << 4))
            ^ (0xff00ff00ff00ff00 & (rk << 8))
            ^ (0xf000f000f000f000 & (rk << 12));
    }
}
1199
/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
///
/// After packing, `output[p]` holds bit `p` (0..8) of every byte of the four
/// blocks — one bit-plane per word (see the final index diagram below).
/// `inv_bitslice` performs the exact inverse transform.
fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);
    debug_assert_eq!(input2.len(), 16);
    debug_assert_eq!(input3.len(), 16);

    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    // Reads a 12-byte window, gathering bytes 0-3 and 8-11 (two four-byte
    // column groups) into one u64 with the two groups interleaved byte-wise.
    #[rustfmt::skip]
    fn read_reordered(input: &[u8]) -> u64 {
        (u64::from(input[0x0])        ) |
        (u64::from(input[0x1]) << 0x10) |
        (u64::from(input[0x2]) << 0x20) |
        (u64::from(input[0x3]) << 0x30) |
        (u64::from(input[0x8]) << 0x08) |
        (u64::from(input[0x9]) << 0x18) |
        (u64::from(input[0xa]) << 0x28) |
        (u64::from(input[0xb]) << 0x38)
    }

    // Reorder each block's bytes on input
    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
    // Reorder by relabeling (note the order of input)
    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
    let mut t0 = read_reordered(&input0[0x00..0x0c]);
    let mut t4 = read_reordered(&input0[0x04..0x10]);
    let mut t1 = read_reordered(&input1[0x00..0x0c]);
    let mut t5 = read_reordered(&input1[0x04..0x10]);
    let mut t2 = read_reordered(&input2[0x00..0x0c]);
    let mut t6 = read_reordered(&input2[0x04..0x10]);
    let mut t3 = read_reordered(&input3[0x00..0x0c]);
    let mut t7 = read_reordered(&input3[0x04..0x10]);

    // Each delta swap below exchanges one pair of bit-index positions across
    // all 512 bits at once.

    // Bit Index Swap 6 <-> 0:
    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}
1276
/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
///
/// Exact inverse of `bitslice`: the swap network is identical (each bit-index
/// swap undoes itself), followed by the mirror-image byte reordering.
///
/// NOTE(review): `input` is only read here — `&mut` looks stronger than
/// necessary, but is kept to leave the signature untouched for callers.
fn inv_bitslice(input: &mut [u64], output: &mut [Block]) {
    debug_assert_eq!(input.len(), 8);
    debug_assert_eq!(output.len(), 4);

    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in 'packing'

    // Bit Index Swap 6 <-> 0:
    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Scatters one u64 back into 8 of a block's 16 bytes — mirror image of
    // `read_reordered` in `bitslice`.
    #[rustfmt::skip]
    fn write_reordered(columns: u64, output: &mut [u8]) {
        output[0x0] = (columns        ) as u8;
        output[0x1] = (columns >> 0x10) as u8;
        output[0x2] = (columns >> 0x20) as u8;
        output[0x3] = (columns >> 0x30) as u8;
        output[0x8] = (columns >> 0x08) as u8;
        output[0x9] = (columns >> 0x18) as u8;
        output[0xa] = (columns >> 0x28) as u8;
        output[0xb] = (columns >> 0x38) as u8;
    }

    // Reorder by relabeling (note the order of output)
    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
    // Reorder each block's bytes on output
    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
    write_reordered(t0, &mut output[0][0x00..0x0c]);
    write_reordered(t4, &mut output[0][0x04..0x10]);
    write_reordered(t1, &mut output[1][0x00..0x0c]);
    write_reordered(t5, &mut output[1][0x04..0x10]);
    write_reordered(t2, &mut output[2][0x00..0x0c]);
    write_reordered(t6, &mut output[2][0x04..0x10]);
    write_reordered(t3, &mut output[3][0x00..0x0c]);
    write_reordered(t7, &mut output[3][0x04..0x10]);

    // Final AES bit index, as desired:
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
}
1353
/// Copy one eight-word (512-bit) round-key block within the provided slice to
/// the next eight-word slot: `buffer[src..src + 8]` -> `buffer[src + 8..src + 16]`.
///
/// NOTE(review): despite the name, this moves eight 64-bit words; the name
/// presumably carries over from the original 32-bit C implementation.
fn memshift32(buffer: &mut [u64], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    // `copy_within` has memmove semantics, so any overlap would be handled;
    // here source and destination ranges are adjacent and disjoint.
    buffer.copy_within(src_offset..dst_offset, dst_offset);
}
1365
1366 /// XOR the round key to the internal state. The round keys are expected to be
1367 /// pre-computed and to be packed in the fixsliced representation.
1368 #[inline]
add_round_key(state: &mut State, rkey: &[u64])1369 fn add_round_key(state: &mut State, rkey: &[u64]) {
1370 debug_assert_eq!(rkey.len(), 8);
1371 for (a, b) in state.iter_mut().zip(rkey) {
1372 *a ^= b;
1373 }
1374 }
1375
/// XOR the round-constant nibble into bit-plane `bit` of a bitsliced round key.
///
/// The mask flips bits 28..32 of the selected word; in the bitsliced index
/// layout (`r1 r0 c1 c0 b1 b0`) those four bits address row 1, column 3 of
/// all four block slots — which, during key expansion, all hold copies of
/// the same key material.
#[inline(always)]
fn add_round_constant_bit(state: &mut [u64], bit: usize) {
    state[bit] ^= 0x0000_0000_f000_0000;
}
1380
/// Rotate a 64-bit word right by `y` bits (thin alias for `u64::rotate_right`).
#[inline(always)]
fn ror(x: u64, y: u32) -> u64 {
    x.rotate_right(y)
}
1385
/// Bit distance covered by `rows` rows and `cols` columns in the bitsliced
/// word layout (`r1 r0 c1 c0 b1 b0`): a row spans 16 bits and a column 4
/// bits, so the result is `rows * 16 + cols * 4`.
#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 4) + (cols << 2)
}
1390
/// Rotate a bitsliced word by one row position.
#[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
    // One row spans 16 bits: ror_distance(1, 0) == 16.
    x.rotate_right(16)
}
1395
/// Rotate a bitsliced word by two row positions.
#[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
    // Two rows span 32 bits: ror_distance(2, 0) == 32.
    x.rotate_right(32)
}
1400
/// Rotate a bitsliced word by one row and one column: the low three nibbles
/// of each 16-bit row lane take the combined row+column rotation, while the
/// top nibble gets the column rotation only.
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
    // ror_distance(1, 1) == 20; ror_distance(0, 1) == 4.
    (x.rotate_right(20) & 0x0fff0fff0fff0fff) |
    (x.rotate_right(4)  & 0xf000f000f000f000)
}
1407
/// Rotate a bitsliced word by one row and two columns: the low half of each
/// 16-bit row lane takes the combined row+column rotation, the high half the
/// column rotation only. Fully-fixsliced path only.
#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
    // ror_distance(1, 2) == 24; ror_distance(0, 2) == 8.
    (x.rotate_right(24) & 0x00ff00ff00ff00ff) |
    (x.rotate_right(8)  & 0xff00ff00ff00ff00)
}
1415
/// Rotate a bitsliced word by one row and three columns: the low nibble of
/// each 16-bit row lane takes the combined row+column rotation, the upper
/// three nibbles the column rotation only. Fully-fixsliced path only.
#[cfg(not(feature = "semi_fixslice"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
    // ror_distance(1, 3) == 28; ror_distance(0, 3) == 12.
    (x.rotate_right(28) & 0x000f000f000f000f) |
    (x.rotate_right(12) & 0xfff0fff0fff0fff0)
}
1423
/// Rotate a bitsliced word by two rows and two columns: the low half of each
/// 16-bit row lane takes the two-row rotation, the high half only one row —
/// both combined with the two-column rotation.
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
    // ror_distance(2, 2) == 40; ror_distance(1, 2) == 24.
    (x.rotate_right(40) & 0x00ff00ff00ff00ff) |
    (x.rotate_right(24) & 0xff00ff00ff00ff00)
}
1430