#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};

pub const DEGREE: usize = 4;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.

#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}
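
// An illustrative sketch of the "complicated/shuffles version" mentioned
// above, for comparison only; it is not used in this file. Because rot16 and
// rot8 rotate by whole bytes, SSSE3's `_mm_shuffle_epi8` can do each of them
// with a single byte permute instead of two shifts and an OR. The helper
// names and shuffle constants below are our own, not part of the crate.
#[allow(dead_code)]
#[inline(always)]
unsafe fn rot16_shuffle(a: __m128i) -> __m128i {
    // Rotate each 32-bit lane right by 16 bits by permuting its bytes.
    _mm_shuffle_epi8(
        a,
        _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13),
    )
}

#[allow(dead_code)]
#[inline(always)]
unsafe fn rot8_shuffle(a: __m128i) -> __m128i {
    // Rotate each 32-bit lane right by 8 bits by permuting its bytes.
    _mm_shuffle_epi8(
        a,
        _mm_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12),
    )
}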

#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}
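
// Taken together, one call to g1 followed by one call to g2 applies the full
// BLAKE3 G function (the BLAKE2s-style quarter-round with rotations 16, 12,
// 8, and 7) to all four columns or all four diagonals at once.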

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
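// For example, `_MM_SHUFFLE!(2, 1, 0, 3)` evaluates to
// `(2 << 6) | (1 << 4) | (0 << 2) | 3`, i.e. 0x93, which tells
// _mm_shuffle_epi32 to place lane 3 in position 0, followed by lanes 0, 1,
// and 2.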
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}

macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
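
// For comparison, a sketch of the conventional diagonalization that keeps
// row0 fixed and rotates row1 instead, as in typical BLAKE2s SIMD code. It is
// not used in this file; it is included only to illustrate the row1-stays-put
// optimization described above, and the helper name is ours.
#[allow(dead_code)]
#[inline(always)]
unsafe fn diagonalize_conventional(row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE!(0, 3, 2, 1));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(1, 0, 3, 2));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(2, 1, 0, 3));
}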

#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); //  6  4  2  0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); //  7  5  3  1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10  8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10  8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11  9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11  9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}

#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
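    // For example (illustrative values only): if the four input vectors hold
    // the rows [0 1 2 3], [4 5 6 7], [8 9 10 11], [12 13 14 15], then after
    // transposition they hold the columns [0 4 8 12], [1 5 9 13],
    // [2 6 10 14], [3 7 11 15]. The test_transpose test below checks exactly
    // this case.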
    let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

    // Interleave 64-bit lanes.
    let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
    let mask = if increment_counter.yes() { !0 } else { 0 };
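    // When incrementing, `mask` is all ones, so lane i below gets the low and
    // high words of counter + i; otherwise `mask` is zero and all four lanes
    // carry the unmodified counter.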
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}

#[target_feature(enable = "sse4.1")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        block_flags = flags;
    }

    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}

#[target_feature(enable = "sse4.1")]
unsafe fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv); // x86 is little-endian
}

#[target_feature(enable = "sse4.1")]
pub unsafe fn hash_many<const N: usize>(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_transpose() {
        if !crate::platform::sse41_detected() {
            return;
        }

        #[target_feature(enable = "sse4.1")]
        unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    #[test]
    fn test_compress() {
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    #[test]
    fn test_hash_many() {
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}