#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};

pub const DEGREE: usize = 4;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
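//
// For reference, an illustrative sketch of the shuffles approach for rot16. It
// relies on SSSE3's `_mm_shuffle_epi8`, which isn't available in this
// SSE2-only module, so it's shown here only as a comment:
//
//     unsafe fn rot16_shuffle(a: __m128i) -> __m128i {
//         _mm_shuffle_epi8(
//             a,
//             _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2),
//         )
//     }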

#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}

#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
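
// For example, `_MM_SHUFFLE!(2, 1, 0, 3)` expands to 0b10_01_00_11 (0x93).
// Used with `_mm_shuffle_epi32`, destination lanes 0 through 3 take source
// lanes 3, 0, 1, and 2 respectively, which is the lane rotation used in
// `diagonalize` below.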

macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}

// SSE2 replacement for SSE4.1's `_mm_blend_epi16`: take each 16-bit lane from
// `b` where the corresponding bit of `imm8` is set, and from `a` otherwise.
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    let mut mask = _mm_set1_epi16(imm8 as i16);
    mask = _mm_and_si128(mask, bits);
    mask = _mm_cmpeq_epi16(mask, bits);
    _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}

#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); //  6  4  2  0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); //  7  5  3  1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10  8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10  8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11  9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11  9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

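// For extended (XOF) output, the full 16-word state is returned rather than
// just the chaining value: the first half matches what `compress_in_place`
// writes back to the CV, and the second half is the upper state rows XORed
// with the input CV, which is how output blocks beyond 32 bytes are encoded.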
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}

#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
    let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

    // Interleave 64-bit lanes.
    let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
    // When incrementing, `mask` is all ones, so lane i gets `counter + i`;
    // otherwise every lane gets the same `counter` value.
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}

#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        block_flags = flags;
    }

    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}

#[target_feature(enable = "sse2")]
unsafe fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv); // x86 is little-endian
}

#[target_feature(enable = "sse2")]
pub unsafe fn hash_many<const N: usize>(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_transpose() {
        if !crate::platform::sse2_detected() {
            return;
        }

        #[target_feature(enable = "sse2")]
        unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

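    // A small sanity-check sketch (new here, with illustrative names): each
    // rotNN helper should agree with the scalar u32::rotate_right on every
    // 32-bit lane, checked via lane 0 of a splatted vector.
    #[test]
    fn test_rotations() {
        if !crate::platform::sse2_detected() {
            return;
        }

        #[target_feature(enable = "sse2")]
        unsafe fn rotations_wrapper(x: u32) -> [u32; 4] {
            let v = set1(x);
            [
                _mm_cvtsi128_si32(rot16(v)) as u32,
                _mm_cvtsi128_si32(rot12(v)) as u32,
                _mm_cvtsi128_si32(rot8(v)) as u32,
                _mm_cvtsi128_si32(rot7(v)) as u32,
            ]
        }

        let x: u32 = 0x0123_89AB;
        let rotated = unsafe { rotations_wrapper(x) };
        assert_eq!(
            rotated,
            [
                x.rotate_right(16),
                x.rotate_right(12),
                x.rotate_right(8),
                x.rotate_right(7),
            ]
        );
    }
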
    #[test]
    fn test_compress() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    #[test]
    fn test_hash_many() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
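
    // A small sanity-check sketch (new here, with illustrative names):
    // `blend_epi16` emulates SSE4.1's `_mm_blend_epi16`, taking each 16-bit
    // lane from `b` where the corresponding bit of `imm8` is set and from `a`
    // otherwise.
    #[test]
    fn test_blend_epi16() {
        if !crate::platform::sse2_detected() {
            return;
        }

        #[target_feature(enable = "sse2")]
        unsafe fn blend_wrapper() -> [u16; 8] {
            let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            let b = _mm_setr_epi16(100, 101, 102, 103, 104, 105, 106, 107);
            core::mem::transmute(blend_epi16(a, b, 0xCC))
        }

        // 0xCC = 0b1100_1100, so lanes 2, 3, 6, and 7 come from `b`.
        assert_eq!(
            unsafe { blend_wrapper() },
            [0, 1, 102, 103, 4, 5, 106, 107]
        );
    }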
}