1 //! SHA-256 `x86`/`x86_64` backend
2 
3 #![allow(clippy::many_single_char_names)]
4 
5 #[cfg(target_arch = "x86")]
6 use core::arch::x86::*;
7 #[cfg(target_arch = "x86_64")]
8 use core::arch::x86_64::*;
9 
/// Derives the next four SHA-256 message-schedule words.
///
/// `v0`, `v1`, `v2` and `v3` hold the sixteen most recent schedule words,
/// oldest group first, four 32-bit words per vector with the oldest word of
/// each group in the lowest lane. The returned vector holds the four words
/// that follow them, in the same lane order.
///
/// # Safety
///
/// The executing CPU must support the instructions behind the intrinsics used
/// here (`sha`, `sse2` and `ssse3`). In this file the function is only reached
/// through `digest_blocks`, which enables those target features.
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
    // w[i] + sigma0(w[i + 1]) for the four words held in `v0`.
    let t1 = _mm_sha256msg1_epu32(v0, v1);
    // w[i + 9]: the upper three lanes of `v2` followed by the lowest lane of `v3`.
    let t2 = _mm_alignr_epi8(v3, v2, 4);
    let t3 = _mm_add_epi32(t1, t2);
    // Adds sigma1(w[i + 14]) and completes the four new words.
    _mm_sha256msg2_epu32(t3, v3)
}
16 
/// Runs four SHA-256 rounds in place on the packed state `$abef` / `$cdgh`.
///
/// `$rest` is a vector of four message-schedule words and `$i` selects the
/// matching row of round constants in `crate::consts::K32X4`.
macro_rules! rounds4 {
    ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
        let row = crate::consts::K32X4[$i];
        // Message words plus round constants, one sum per lane.
        let wk = _mm_add_epi32(
            $rest,
            _mm_set_epi32(row[0] as i32, row[1] as i32, row[2] as i32, row[3] as i32),
        );
        // First two rounds consume the two low lanes of `wk`; the result is
        // written over `$cdgh`, so the two names swap roles for the next call.
        $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, wk);
        // Last two rounds need the two high lanes moved down (0x0E).
        $abef = _mm_sha256rnds2_epu32($abef, $cdgh, _mm_shuffle_epi32(wk, 0x0E));
    }};
}
27 
/// Extends the message schedule by four words and runs four rounds on them.
///
/// The new words are computed by `schedule` from the four vectors passed
/// after `$cdgh`, assigned to the fifth vector argument, and then handed to
/// `rounds4!` together with the constant-row index given last.
macro_rules! schedule_rounds4 {
    (
        $abef:ident, $cdgh:ident,
        $m0:expr, $m1:expr, $m2:expr, $m3:expr, $next:expr,
        $row:expr
    ) => {{
        $next = schedule($m0, $m1, $m2, $m3);
        rounds4!($abef, $cdgh, $next, $row);
    }};
}
38 
39 // we use unaligned loads with `__m128i` pointers
40 #[allow(clippy::cast_ptr_alignment)]
41 #[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]])42 unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
43     #[allow(non_snake_case)]
44     let MASK: __m128i = _mm_set_epi64x(
45         0x0C0D_0E0F_0809_0A0Bu64 as i64,
46         0x0405_0607_0001_0203u64 as i64,
47     );
48 
49     let state_ptr = state.as_ptr() as *const __m128i;
50     let dcba = _mm_loadu_si128(state_ptr.add(0));
51     let efgh = _mm_loadu_si128(state_ptr.add(1));
52 
53     let cdab = _mm_shuffle_epi32(dcba, 0xB1);
54     let efgh = _mm_shuffle_epi32(efgh, 0x1B);
55     let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
56     let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);
57 
58     for block in blocks {
59         let abef_save = abef;
60         let cdgh_save = cdgh;
61 
62         let data_ptr = block.as_ptr() as *const __m128i;
63         let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK);
64         let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK);
65         let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK);
66         let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK);
67         let mut w4;
68 
69         rounds4!(abef, cdgh, w0, 0);
70         rounds4!(abef, cdgh, w1, 1);
71         rounds4!(abef, cdgh, w2, 2);
72         rounds4!(abef, cdgh, w3, 3);
73         schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
74         schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
75         schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
76         schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
77         schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
78         schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
79         schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
80         schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
81         schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
82         schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
83         schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
84         schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
85 
86         abef = _mm_add_epi32(abef, abef_save);
87         cdgh = _mm_add_epi32(cdgh, cdgh_save);
88     }
89 
90     let feba = _mm_shuffle_epi32(abef, 0x1B);
91     let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
92     let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
93     let hgef = _mm_alignr_epi8(dchg, feba, 8);
94 
95     let state_ptr_mut = state.as_mut_ptr() as *mut __m128i;
96     _mm_storeu_si128(state_ptr_mut.add(0), dcba);
97     _mm_storeu_si128(state_ptr_mut.add(1), hgef);
98 }
99 
// Defines the `shani_cpuid` module that `compress` queries at runtime; the
// feature list matches the target features `digest_blocks` is compiled with.
cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
101 
compress(state: &mut [u32; 8], blocks: &[[u8; 64]])102 pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
103     // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
104     // after stabilization
105     if shani_cpuid::get() {
106         unsafe {
107             digest_blocks(state, blocks);
108         }
109     } else {
110         super::soft::compress(state, blocks);
111     }
112 }
113