#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
    OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};

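// The degree of parallelism: each 128-bit SSE2 vector holds one 32-bit word
// from each of four independent states.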
pub const DEGREE: usize = 4;

#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm_loadu_si128(src as *const __m128i)
}

#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm_storeu_si128(dest as *mut __m128i, src)
}

#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
    _mm_add_epi32(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
    _mm_set1_epi32(x as i32)
}

#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
    _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.

#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}

#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}

#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}

#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}

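// The first half of the G function: mixes in one message word, then applies
// the 16-bit and 12-bit rotations.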
#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot16(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot12(*row1);
}

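// The second half of the G function: mixes in the second message word, then
// applies the 8-bit and 7-bit rotations.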
#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    *row0 = add(add(*row0, m), *row1);
    *row3 = xor(*row3, *row0);
    *row3 = rot8(*row3);
    *row2 = add(*row2, *row3);
    *row1 = xor(*row1, *row2);
    *row1 = rot7(*row1);
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}

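// A two-source shuffle of 32-bit lanes. SSE2 has no integer instruction for
// this, so the operands are routed through the floating-point shuffle.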
macro_rules! shuffle2 {
    ($a:expr, $b:expr, $c:expr) => {
        _mm_castps_si128(_mm_shuffle_ps(
            _mm_castsi128_ps($a),
            _mm_castsi128_ps($b),
            $c,
        ))
    };
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

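// Undoes the rotations applied by diagonalize, restoring the column layout.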
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}

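// An SSE2 emulation of the SSE4.1 _mm_blend_epi16 instruction: 16-bit lanes
// of `b` selected by set bits of `imm8` replace the corresponding lanes of
// `a`.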
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    let mut mask = _mm_set1_epi16(imm8 as i16);
    mask = _mm_and_si128(mask, bits);
    mask = _mm_cmpeq_epi16(mask, bits);
    _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}

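// The shared core of compress_in_place and compress_xof: load the state, run
// all seven rounds over a single block, and return the four rows before the
// final feed-forward.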
#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

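// Compresses one block directly into the chaining value. This is the
// truncated form of the compression function, keeping only the first eight
// output words.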
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

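// The extended form of the compression function, returning all 64 output
// bytes. The first 32 bytes match compress_in_place; the last 32 bytes are
// the third and fourth rows XORed with the input chaining value.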
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    core::mem::transmute([row0, row1, row2, row3])
}

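// One round of the compression function applied to four transposed states at
// once. Each v[i] and m[i] holds the same word index from all four states, so
// the column step and then the diagonal step become straight-line vector code.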
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

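    // The second half of the round mixes along the diagonals: v[0] pairs with
    // v[5], v[10], and v[15], and so on, instead of rotating the rows.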
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
    // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
    // 22/33. Note that this doesn't split the vector into two lanes, as the
    // AVX2 counterparts do.
    let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
    let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
    let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
    let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

    // Interleave 64-bit lanes.
    let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
    let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
    let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
    let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

    vecs[0] = abcd_0;
    vecs[1] = abcd_1;
    vecs[2] = abcd_2;
    vecs[3] = abcd_3;
}

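// Loads one message block from each of the four inputs and transposes the
// words, so that each returned vector holds the same word index from all four
// blocks.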
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
    let mut vecs = [
        loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
        loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
        loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
    ];
    for i in 0..DEGREE {
        _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
    }
    let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    transpose_vecs(squares.2);
    transpose_vecs(squares.3);
    vecs
}

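// Builds the low and high halves of the per-input counter values. When the
// counter isn't incremented, the mask zeroes the per-lane offsets so every
// lane gets the same value.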
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
    let mask = if increment_counter.yes() { !0 } else { 0 };
    (
        set4(
            counter_low(counter + (mask & 0)),
            counter_low(counter + (mask & 1)),
            counter_low(counter + (mask & 2)),
            counter_low(counter + (mask & 3)),
        ),
        set4(
            counter_high(counter + (mask & 0)),
            counter_high(counter + (mask & 1)),
            counter_high(counter + (mask & 2)),
            counter_high(counter + (mask & 3)),
        ),
    )
}

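// Hashes `blocks` full blocks from each of the four inputs in parallel,
// writing the four 32-byte chaining values to `out`.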
#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
    inputs: &[*const u8; DEGREE],
    blocks: usize,
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8; DEGREE * OUT_LEN],
) {
    let mut h_vecs = [
        set1(key[0]),
        set1(key[1]),
        set1(key[2]),
        set1(key[3]),
        set1(key[4]),
        set1(key[5]),
        set1(key[6]),
        set1(key[7]),
    ];
    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
    let mut block_flags = flags | flags_start;

    for block in 0..blocks {
        if block + 1 == blocks {
            block_flags |= flags_end;
        }
        let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
        let block_flags_vec = set1(block_flags as u32);
        let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);

        // The transposed compression function. Note that inlining this
        // manually here improves compile times by a lot, compared to factoring
        // it out into its own function and making it #[inline(always)]. Just
        // guessing, it might have something to do with loop unrolling.
        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            counter_low_vec,
            counter_high_vec,
            block_len_vec,
            block_flags_vec,
        ];
        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        h_vecs[0] = xor(v[0], v[8]);
        h_vecs[1] = xor(v[1], v[9]);
        h_vecs[2] = xor(v[2], v[10]);
        h_vecs[3] = xor(v[3], v[11]);
        h_vecs[4] = xor(v[4], v[12]);
        h_vecs[5] = xor(v[5], v[13]);
        h_vecs[6] = xor(v[6], v[14]);
        h_vecs[7] = xor(v[7], v[15]);

        block_flags = flags;
    }

    let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
    transpose_vecs(squares.0);
    transpose_vecs(squares.1);
    // The first four vecs now contain the first half of each output, and the
    // second four vecs contain the second half of each output.
    storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
    storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
    storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
    storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
    storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
    storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
    storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
    storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}

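// Hashes a single input of whole blocks serially. This is the fallback used
// by hash_many when fewer than DEGREE inputs remain.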
#[target_feature(enable = "sse2")]
unsafe fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = core::mem::transmute(cv); // x86 is little-endian
}

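// Hashes any number of equal-length inputs: groups of DEGREE inputs go
// through hash4, and the remainder goes through hash1 one input at a time.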
#[target_feature(enable = "sse2")]
pub unsafe fn hash_many<const N: usize>(
    mut inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    mut out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
        // Safe because the layout of arrays is guaranteed, and because the
        // `blocks` count is determined statically from the argument type.
        let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
        let blocks = N / BLOCK_LEN;
        hash4(
            input_ptrs,
            blocks,
            key,
            counter,
            increment_counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(out, 0, DEGREE * OUT_LEN),
        );
        if increment_counter.yes() {
            counter += DEGREE as u64;
        }
        inputs = &inputs[DEGREE..];
        out = &mut out[DEGREE * OUT_LEN..];
    }
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_transpose() {
        if !crate::platform::sse2_detected() {
            return;
        }

        #[target_feature(enable = "sse2")]
        unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
            transpose_vecs(vecs);
        }

        let mut matrix = [[0 as u32; DEGREE]; DEGREE];
        for i in 0..DEGREE {
            for j in 0..DEGREE {
                matrix[i][j] = (i * DEGREE + j) as u32;
            }
        }

        unsafe {
            let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
            transpose_wrapper(&mut vecs);
            matrix = core::mem::transmute(vecs);
        }

        for i in 0..DEGREE {
            for j in 0..DEGREE {
                // Reversed indexes from above.
                assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
            }
        }
    }

    #[test]
    fn test_compress() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    #[test]
    fn test_hash_many() {
        if !crate::platform::sse2_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}