//! Interfaces for hashing multiple inputs at once, using SIMD more
//! efficiently.
//!
//! The throughput of these interfaces is comparable to BLAKE2bp, about twice
//! the throughput of regular BLAKE2b when AVX2 is available.
//!
//! These interfaces can accept any number of inputs, and the implementation
//! does its best to parallelize them. In general, the more inputs you can pass
//! in at once the better. If you need to batch your inputs in smaller groups,
//! see the [`degree`](fn.degree.html) function for a good batch size.
//!
//! The implementation keeps working in parallel even when inputs are of
//! different lengths, by managing a working set of jobs whose input isn't yet
//! exhausted. However, if one or two inputs are much longer than the others,
//! and they're encountered only at the end, there might not be any remaining
//! work to parallelize them with. In this case, sorting the inputs
//! longest-first can improve parallelism.
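//!
//! A minimal sketch of that longest-first approach (the inputs here are
//! purely illustrative):
//!
//! ```
//! use blake2b_simd::{Params, many::{HashManyJob, hash_many}};
//!
//! let params = Params::new();
//! let mut inputs = [
//!     &b"short"[..],
//!     &b"a much longer input than all of the others"[..],
//!     &b"medium length"[..],
//! ];
//! // Sort longest-first, so the long input is parallelized with the others
//! // from the start instead of finishing alone at the end.
//! inputs.sort_unstable_by_key(|input| core::cmp::Reverse(input.len()));
//!
//! let mut jobs = [
//!     HashManyJob::new(&params, inputs[0]),
//!     HashManyJob::new(&params, inputs[1]),
//!     HashManyJob::new(&params, inputs[2]),
//! ];
//! hash_many(jobs.iter_mut());
//!
//! for (input, job) in inputs.iter().zip(jobs.iter()) {
//!     assert_eq!(params.hash(input), job.to_hash());
//! }
//! ```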
//!
//! # Example
//!
//! ```
//! use blake2b_simd::{blake2b, State, many::update_many};
//!
//! let mut states = [
//!     State::new(),
//!     State::new(),
//!     State::new(),
//!     State::new(),
//! ];
//!
//! let inputs = [
//!     &b"foo"[..],
//!     &b"bar"[..],
//!     &b"baz"[..],
//!     &b"bing"[..],
//! ];
//!
//! update_many(states.iter_mut().zip(inputs.iter()));
//!
//! for (state, input) in states.iter_mut().zip(inputs.iter()) {
//!     assert_eq!(blake2b(input), state.finalize());
//! }
//! ```

use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
use crate::state_words_to_bytes;
use crate::Count;
use crate::Hash;
use crate::Params;
use crate::State;
use crate::Word;
use crate::BLOCKBYTES;
use arrayref::array_mut_ref;
use arrayvec::ArrayVec;
use core::fmt;

/// The largest possible value of [`degree`](fn.degree.html) on the target
/// platform.
///
/// Note that this constant reflects the parallelism degree supported by this
/// crate, so it will change over time as support is added or removed. For
/// example, when Rust stabilizes AVX-512 support and this crate adds an
/// AVX-512 implementation, this constant will double on x86 targets. If that
/// implementation is an optional feature (e.g. because it's nightly-only), the
/// value of this constant will depend on that optional feature also.
pub const MAX_DEGREE: usize = guts::MAX_DEGREE;

/// The parallelism degree of the implementation, detected at runtime. If you
/// hash your inputs in small batches, making the batch size a multiple of
/// `degree` will generally give good performance.
///
/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
/// hashes in parallel, so `degree` returns 4 on that machine. If you call
/// [`hash_many`] with only three inputs, that's not enough to use the AVX2
/// implementation, and your average throughput will be lower. Likewise if you
/// call it with five inputs of equal length, the first four will be hashed in
/// parallel with AVX2, but the last one will have to be hashed by itself, and
/// again your average throughput will be lower.
///
/// As noted in the module level docs, performance is more complicated if your
/// inputs are of different lengths. When parallelizing long and short inputs
/// together, the longer ones will have bytes left over, and the implementation
/// will try to parallelize those leftover bytes with subsequent inputs. The
/// more inputs available in that case, the more the implementation will be
/// able to parallelize.
///
/// If you need a constant batch size, for example to collect inputs in an
/// array, see [`MAX_DEGREE`].
///
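/// # Example
///
/// A minimal sketch of batching by `degree()` (the batch size strategy and the
/// `Vec` collection here are illustrative, not requirements of this API):
///
/// ```
/// use blake2b_simd::{Params, many::{degree, hash_many, HashManyJob}};
///
/// let params = Params::new();
/// let inputs = [&b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..]];
///
/// // Hash the inputs in batches of `degree()` jobs each.
/// for batch in inputs.chunks(degree()) {
///     let mut jobs: Vec<HashManyJob> = batch
///         .iter()
///         .map(|input| HashManyJob::new(&params, input))
///         .collect();
///     hash_many(jobs.iter_mut());
///     for (input, job) in batch.iter().zip(jobs.iter()) {
///         assert_eq!(params.hash(input), job.to_hash());
///     }
/// }
/// ```
///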
/// [`hash_many`]: fn.hash_many.html
/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
pub fn degree() -> usize {
    guts::Implementation::detect().degree()
}

type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>;

#[inline(always)]
fn fill_jobs_vec<'a, 'b>(
    jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
    vec: &mut JobsVec<'a, 'b>,
    target_len: usize,
) {
    while vec.len() < target_len {
        if let Some(job) = jobs_iter.next() {
            vec.push(job);
        } else {
            break;
        }
    }
}

#[inline(always)]
fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
    // Iterate backwards so that removal doesn't cause an out-of-bounds panic.
    for i in (0..num_jobs).rev() {
        // Note that is_empty() is only valid because we know all these jobs
        // have been run at least once. Otherwise we could confuse the empty
        // input for a finished job, which would be incorrect.
        //
        // Avoid a panic branch here in release mode.
        debug_assert!(vec.len() > i);
        if vec.len() > i && vec[i].input.is_empty() {
            // Note that calling pop_at() repeatedly has some overhead, because
            // later elements need to be shifted up. However, the JobsVec is
            // small, and this approach guarantees that jobs are encountered in
            // order.
            vec.pop_at(i);
        }
    }
}

pub(crate) fn compress_many<'a, 'b, I>(
    jobs: I,
    imp: Implementation,
    finalize: Finalize,
    stride: Stride,
) where
    I: IntoIterator<Item = Job<'a, 'b>>,
{
    // Fuse is important for correctness, since each of these blocks tries to
    // advance the iterator, even if a previous block emptied it.
    let mut jobs_iter = jobs.into_iter().fuse();
    let mut jobs_vec = JobsVec::new();

    if imp.degree() >= 4 {
        loop {
            fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
            if jobs_vec.len() < 4 {
                break;
            }
            let jobs_array = array_mut_ref!(jobs_vec, 0, 4);
            imp.compress4_loop(jobs_array, finalize, stride);
            evict_finished(&mut jobs_vec, 4);
        }
    }

    if imp.degree() >= 2 {
        loop {
            fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
            if jobs_vec.len() < 2 {
                break;
            }
            let jobs_array = array_mut_ref!(jobs_vec, 0, 2);
            imp.compress2_loop(jobs_array, finalize, stride);
            evict_finished(&mut jobs_vec, 2);
        }
    }

    for job in jobs_vec.into_iter().chain(jobs_iter) {
        let Job {
            input,
            words,
            count,
            last_node,
        } = job;
        imp.compress1_loop(input, words, count, last_node, finalize, stride);
    }
}

/// Update any number of `State` objects at once.
///
/// # Example
///
/// ```
/// use blake2b_simd::{blake2b, State, many::update_many};
///
/// let mut states = [
///     State::new(),
///     State::new(),
///     State::new(),
///     State::new(),
/// ];
///
/// let inputs = [
///     &b"foo"[..],
///     &b"bar"[..],
///     &b"baz"[..],
///     &b"bing"[..],
/// ];
///
/// update_many(states.iter_mut().zip(inputs.iter()));
///
/// for (state, input) in states.iter_mut().zip(inputs.iter()) {
///     assert_eq!(blake2b(input), state.finalize());
/// }
/// ```
pub fn update_many<'a, 'b, I, T>(pairs: I)
where
    I: IntoIterator<Item = (&'a mut State, &'b T)>,
    T: 'b + AsRef<[u8]> + ?Sized,
{
    // Get the guts::Implementation from the first state, if any.
    let mut peekable_pairs = pairs.into_iter().peekable();
    let implementation = if let Some((state, _)) = peekable_pairs.peek() {
        state.implementation
    } else {
        // No work items, just short circuit.
        return;
    };

    // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
    // where there's not actually any work to do (e.g. because there's not much
    // input and it's all just going in the State buffer).
    let jobs = peekable_pairs.flat_map(|(state, input_t)| {
        let mut input = input_t.as_ref();
        // For each pair, if the State has some input in its buffer, try to
        // finish that buffer. If there wasn't enough input to do that --
        // or if the input was empty to begin with -- skip this pair.
        state.compress_buffer_if_possible(&mut input);
        if input.is_empty() {
            return None;
        }
        // Now we know the buffer is empty and there's more input. Make sure we
        // buffer the final block, because update() doesn't finalize.
        let mut last_block_start = input.len() - 1;
        last_block_start -= last_block_start % BLOCKBYTES;
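        // E.g. with BLOCKBYTES = 128: a 300-byte input gives
        // last_block_start = 299 - (299 % 128) = 256, so bytes 0..256 are
        // compressed now and bytes 256..300 stay in the buffer. If the input
        // is an exact multiple of BLOCKBYTES, the final full block is what
        // gets buffered.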
        let (blocks, last_block) = input.split_at(last_block_start);
        state.buf[..last_block.len()].copy_from_slice(last_block);
        state.buflen = last_block.len() as u8;
        // Finally, if the full blocks slice is non-empty, prepare that job for
        // compression, and bump the State count.
        if blocks.is_empty() {
            None
        } else {
            let count = state.count;
            state.count = state.count.wrapping_add(blocks.len() as Count);
            Some(Job {
                input: blocks,
                words: &mut state.words,
                count,
                last_node: state.last_node,
            })
        }
    });

    // Run all the Jobs in the iterator.
    compress_many(jobs, implementation, Finalize::No, Stride::Serial);
}

/// A job for the [`hash_many`] function. After calling [`hash_many`] on a
/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
/// to get the result.
///
/// [`hash_many`]: fn.hash_many.html
/// [`to_hash`]: struct.HashManyJob.html#method.to_hash
#[derive(Clone)]
pub struct HashManyJob<'a> {
    words: [Word; 8],
    count: Count,
    last_node: LastNode,
    hash_length: u8,
    input: &'a [u8],
    finished: bool,
    implementation: guts::Implementation,
}

impl<'a> HashManyJob<'a> {
    /// Construct a new `HashManyJob` from a set of hashing parameters and an
    /// input.
    #[inline]
    pub fn new(params: &Params, input: &'a [u8]) -> Self {
        let mut words = params.to_words();
        let mut count = 0;
        let mut finished = false;
        // If we have key bytes, compress them into the state words. If there's
        // no additional input, this compression needs to finalize and set
        // finished=true.
        if params.key_length > 0 {
            let mut finalization = Finalize::No;
            if input.is_empty() {
                finalization = Finalize::Yes;
                finished = true;
            }
            params.implementation.compress1_loop(
                &params.key_block,
                &mut words,
                0,
                params.last_node,
                finalization,
                Stride::Serial,
            );
            count = BLOCKBYTES as Count;
        }
        Self {
            words,
            count,
            last_node: params.last_node,
            hash_length: params.hash_length,
            input,
            finished,
            implementation: params.implementation,
        }
    }

    /// Get the hash from a finished job. If you call this before calling
    /// [`hash_many`], it will panic in debug mode.
    ///
    /// [`hash_many`]: fn.hash_many.html
    #[inline]
    pub fn to_hash(&self) -> Hash {
        debug_assert!(self.finished, "job hasn't been run yet");
        Hash {
            bytes: state_words_to_bytes(&self.words),
            len: self.hash_length,
        }
    }
}

impl<'a> fmt::Debug for HashManyJob<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // NB: Don't print the words. Leaking them would allow length extension.
        write!(
            f,
            "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
            self.count,
            self.hash_length,
            self.last_node.yes(),
            self.input.len(),
        )
    }
}

/// Hash any number of complete inputs all at once.
///
/// This is slightly more efficient than using `update_many` with `State`
/// objects, because it doesn't need to do any buffering.
///
/// Running `hash_many` on the same `HashManyJob` object more than once has no
/// effect.
///
/// # Example
///
/// ```
/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
///
/// let inputs = [
///     &b"foo"[..],
///     &b"bar"[..],
///     &b"baz"[..],
///     &b"bing"[..],
/// ];
///
/// let mut params = Params::new();
/// params.hash_length(16);
///
/// let mut jobs = [
///     HashManyJob::new(&params, inputs[0]),
///     HashManyJob::new(&params, inputs[1]),
///     HashManyJob::new(&params, inputs[2]),
///     HashManyJob::new(&params, inputs[3]),
/// ];
///
/// hash_many(jobs.iter_mut());
///
/// for (input, job) in inputs.iter().zip(jobs.iter()) {
///     let expected = params.hash(input);
///     assert_eq!(expected, job.to_hash());
/// }
/// ```
pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
where
    'b: 'a,
    I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
{
    // Get the guts::Implementation from the first job, if any.
    let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
    let implementation = if let Some(job) = peekable_jobs.peek() {
        job.implementation
    } else {
        // No work items, just short circuit.
        return;
    };

    // In the jobs iterator, skip HashManyJobs that have already been run. This
    // is less because we actually expect callers to call hash_many twice
    // (though they're allowed to if they want), and more because
    // HashManyJob::new might need to finalize if there are key bytes but no
    // input. Tying the job lifetime to the Params reference is an alternative,
    // but I've found it too constraining in practice. We could also put key
    // bytes in every HashManyJob, but that would add unnecessary storage and
    // zeroing for all callers.
    let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished);
    let jobs = unfinished_jobs.map(|j| {
        j.finished = true;
        Job {
            input: j.input,
            words: &mut j.words,
            count: j.count,
            last_node: j.last_node,
        }
    });
    compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::guts;
    use crate::paint_test_input;
    use crate::BLOCKBYTES;
    use arrayvec::ArrayVec;

    #[test]
    fn test_degree() {
        assert!(degree() <= MAX_DEGREE);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("avx2") {
                assert!(degree() >= 4);
            }
            if is_x86_feature_detected!("sse4.1") {
                assert!(degree() >= 2);
            }
        }
    }

    #[test]
    fn test_hash_many() {
        // Use a length of inputs that will exercise all of the power-of-two loops.
        const LEN: usize = 2 * guts::MAX_DEGREE - 1;

        // Rerun LEN inputs LEN different times, with the empty input starting in a
        // different spot each time.
        let mut input = [0; LEN * BLOCKBYTES];
        paint_test_input(&mut input);
        for start_offset in 0..LEN {
            let mut inputs: [&[u8]; LEN] = [&[]; LEN];
            for i in 0..LEN {
                let chunks = (i + start_offset) % LEN;
                inputs[i] = &input[..chunks * BLOCKBYTES];
            }

            let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
            for i in 0..LEN {
                let mut p = Params::new();
                p.node_offset(i as u64);
                if i % 2 == 1 {
                    p.last_node(true);
                    p.key(b"foo");
                }
                params.push(p);
            }

            let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new();
            for i in 0..LEN {
                jobs.push(HashManyJob::new(&params[i], inputs[i]));
            }

            hash_many(&mut jobs);

            // Check the outputs.
            for i in 0..LEN {
                let expected = params[i].hash(inputs[i]);
                assert_eq!(expected, jobs[i].to_hash());
            }
        }
    }

    #[test]
    fn test_update_many() {
        // Use a length of inputs that will exercise all of the power-of-two loops.
        const LEN: usize = 2 * guts::MAX_DEGREE - 1;

        // Rerun LEN inputs LEN different times, with the empty input starting in a
        // different spot each time.
        let mut input = [0; LEN * BLOCKBYTES];
        paint_test_input(&mut input);
        for start_offset in 0..LEN {
            let mut inputs: [&[u8]; LEN] = [&[]; LEN];
            for i in 0..LEN {
                let chunks = (i + start_offset) % LEN;
                inputs[i] = &input[..chunks * BLOCKBYTES];
            }

            let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
            for i in 0..LEN {
                let mut p = Params::new();
                p.node_offset(i as u64);
                if i % 2 == 1 {
                    p.last_node(true);
                    p.key(b"foo");
                }
                params.push(p);
            }

            let mut states: ArrayVec<[State; LEN]> = ArrayVec::new();
            for i in 0..LEN {
                states.push(params[i].to_state());
            }

            // Run each input twice through, to exercise buffering.
            update_many(states.iter_mut().zip(inputs.iter()));
            update_many(states.iter_mut().zip(inputs.iter()));

            // Check the outputs.
            for i in 0..LEN {
                let mut reference_state = params[i].to_state();
                // Again, run the input twice.
                reference_state.update(inputs[i]);
                reference_state.update(inputs[i]);
                assert_eq!(reference_state.finalize(), states[i].finalize());
                assert_eq!(2 * inputs[i].len() as Count, states[i].count());
            }
        }
    }
}