#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
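
// The degree of SIMD parallelism: a 128-bit vector holds four 32-bit words,
// so hash4 below processes four inputs at a time.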
pub const DEGREE: usize = 4;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.

#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}
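
// For reference, a sketch of the "shuffles" alternative for the two
// byte-aligned rotations (16 and 8): SSSE3's _mm_shuffle_epi8 can rotate the
// bytes of each 32-bit lane in one instruction instead of two shifts and an
// or, e.g.:
//
//     unsafe fn rot16_shuffle(a: __m128i) -> __m128i {
//         _mm_shuffle_epi8(
//             a,
//             _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2),
//         )
//     }
//
// It's unused here for the reasons cited above.

// g1 and g2 are the two halves of the BLAKE3 g function, vectorized across
// the state: each row vector holds one word from each of the four columns,
// so a single call mixes all four columns (or, between diagonalize and
// undiagonalize, all four diagonals) at once.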
#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
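
// For example, _MM_SHUFFLE!(2, 1, 0, 3) evaluates to 0b10_01_00_11. With
// _mm_shuffle_epi32, destination lane 0 takes source lane 3, lane 1 takes
// lane 0, lane 2 takes lane 1, and lane 3 takes lane 2: a one-lane rotation.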

macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
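
// Run the 7-round compression permutation and return the resulting 16-word
// state as four row vectors. Callers apply the output feed-forward
// themselves, which lets compress_in_place and compress_xof below share this
// body.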
#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = _mm_blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = _mm_blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}
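
// Compress a block and write the truncated 32-byte output back into cv. The
// feed-forward here xors the first half of the state with the second half.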
#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}
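
// Like compress_in_place, but return the full 64-byte output. The second half
// of the state is additionally xored with the input chaining value, as
// extended output requires.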
#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}
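
// One round of the transposed compression function used by hash4 below. The
// layout here is rotated relative to compress_pre: each of the 16 vectors in
// v holds the same state word from four independent inputs, so one call
// advances four compressions in parallel.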
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
    let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
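    // For input rows a, b, c, d, this gives ab_01 = [a0, b0, a1, b1],
    // cd_01 = [c0, d0, c1, d1], and so on. The 64-bit unpacks below then
    // produce the transposed rows, e.g. abcd_0 = [a0, b0, c0, d0].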

    // Interleave 64-bit lanes.
    let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}
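
// Load one message block from each of the four inputs and transpose the words
// into vector-major order: after this, vector i holds message word i from all
// four blocks.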
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
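    // Prefetch each input a few blocks ahead, overlapping memory latency with
    // the compression work on the current block.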
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}
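
// Build the low and high counter vectors for four inputs. When the counter
// increments per input, mask is all ones and lane i gets counter + i;
// otherwise mask is zero and all four lanes share the same counter.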
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}
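
// Hash `blocks` full blocks from each of four inputs in parallel, reading
// blocks * BLOCK_LEN bytes from each input pointer and writing the four
// 32-byte chaining values to out. The state stays transposed across the whole
// loop; only at the end is it transposed back into output bytes.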
#[target_feature(enable = "sse4.1")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        block_flags = flags;
    }

    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
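
// Serial fallback for a single input, used below when fewer than DEGREE
// inputs remain: compress one block at a time into a running chaining value.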
#[target_feature(enable = "sse4.1")]
unsafe fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv); // x86 is little-endian
}
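
// Hash any number of equal-length inputs: groups of DEGREE inputs go through
// hash4, and any remainder goes through hash1 one input at a time.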
#[target_feature(enable = "sse4.1")]
pub unsafe fn hash_many<const N: usize>(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_transpose() {
        if !crate::platform::sse41_detected() {
            return;
        }

        #[target_feature(enable = "sse4.1")]
        unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    #[test]
    fn test_compress() {
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    #[test]
    fn test_hash_many() {
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}