1 //! BLAKE2bp, a variant of BLAKE2b that uses SIMD more efficiently.
2 //!
3 //! The AVX2 implementation of BLAKE2bp is about twice as fast that of BLAKE2b.
4 //! However, note that it's a different hash function, and it gives a different
5 //! hash from BLAKE2b for the same input.
6 //!
7 //! # Example
8 //!
9 //! ```
10 //! use blake2b_simd::blake2bp;
11 //!
12 //! let hash = blake2bp::Params::new()
13 //!     .hash_length(16)
14 //!     .key(b"The Magic Words are Squeamish Ossifrage")
15 //!     .to_state()
16 //!     .update(b"foo")
17 //!     .update(b"bar")
18 //!     .update(b"baz")
19 //!     .finalize();
20 //! assert_eq!("e69c7d2c42a5ac14948772231c68c552", &hash.to_hex());
21 //! ```
22 
23 use crate::guts::{Finalize, Implementation, Job, LastNode, Stride};
24 use crate::many;
25 use crate::Count;
26 use crate::Hash;
27 use crate::Word;
28 use crate::BLOCKBYTES;
29 use crate::KEYBYTES;
30 use crate::OUTBYTES;
31 use core::cmp;
32 use core::fmt;
33 use core::mem::size_of;
34 
35 #[cfg(feature = "std")]
36 use std;
37 
/// The BLAKE2bp fanout: the number of leaf hashes computed in parallel.
pub(crate) const DEGREE: usize = 4;
39 
40 /// Compute the BLAKE2bp hash of a slice of bytes all at once, using default
41 /// parameters.
42 ///
43 /// # Example
44 ///
45 /// ```
46 /// # use blake2b_simd::blake2bp::blake2bp;
47 /// let expected = "8ca9ccee7946afcb686fe7556628b5ba1bf9a691da37ca58cd049354d99f3704\
48 ///                 2c007427e5f219b9ab5063707ec6823872dee413ee014b4d02f2ebb6abb5f643";
49 /// let hash = blake2bp(b"foo");
50 /// assert_eq!(expected, &hash.to_hex());
51 /// ```
blake2bp(input: &[u8]) -> Hash52 pub fn blake2bp(input: &[u8]) -> Hash {
53     Params::new().hash(input)
54 }
55 
56 /// A parameter builder for BLAKE2bp, just like the [`Params`](../struct.Params.html) type for
57 /// BLAKE2b.
58 ///
59 /// This builder only supports configuring the hash length and a secret key. This matches the
60 /// options provided by the [reference
61 /// implementation](https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2.h#L162-L165).
62 ///
63 /// # Example
64 ///
65 /// ```
66 /// use blake2b_simd::blake2bp;
67 /// let mut state = blake2bp::Params::new().hash_length(32).to_state();
68 /// ```
#[derive(Clone)]
pub struct Params {
    // Requested digest length in bytes, 1..=OUTBYTES.
    hash_length: u8,
    // Number of meaningful bytes in `key` below, 0..=KEYBYTES. Zero means unkeyed.
    key_length: u8,
    // Key material, zero-padded out to KEYBYTES.
    key: [u8; KEYBYTES],
    // The SIMD backend selected when this Params was constructed.
    implementation: Implementation,
}
76 
impl Params {
    /// Equivalent to `Params::default()`.
    pub fn new() -> Self {
        Self {
            hash_length: OUTBYTES as u8,
            key_length: 0,
            key: [0; KEYBYTES],
            implementation: Implementation::detect(),
        }
    }

    // Expand these BLAKE2bp parameters into the initial state words of the
    // four leaf nodes and of the root node, using the BLAKE2 tree-hashing
    // parameter fields (fanout, max depth, node offset/depth, inner hash
    // length) on the underlying BLAKE2b Params builder.
    fn to_words(&self) -> ([[Word; 8]; DEGREE], [Word; 8]) {
        let mut base_params = crate::Params::new();
        base_params
            .hash_length(self.hash_length as usize)
            .key(&self.key[..self.key_length as usize])
            .fanout(DEGREE as u8)
            .max_depth(2)
            .max_leaf_length(0)
            // Note that inner_hash_length is always OUTBYTES, regardless of the hash_length
            // parameter. This isn't documented in the spec, but it matches the behavior of the
            // reference implementation: https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L55
            .inner_hash_length(OUTBYTES);
        // Each leaf differs from the base parameters only in its node offset.
        let leaf_words = |worker_index| {
            base_params
                .clone()
                .node_offset(worker_index)
                .node_depth(0)
                // Note that setting the last_node flag here has no effect,
                // because it isn't included in the state words.
                .to_words()
        };
        let leaf_words = [leaf_words(0), leaf_words(1), leaf_words(2), leaf_words(3)];
        let root_words = base_params
            .clone()
            .node_offset(0)
            .node_depth(1)
            // Note that setting the last_node flag here has no effect, because
            // it isn't included in the state words. Also note that because
            // we're only preserving its state words, the root node won't hash
            // any key bytes.
            .to_words();
        (leaf_words, root_words)
    }

    /// Hash an input all at once with these parameters.
    pub fn hash(&self, input: &[u8]) -> Hash {
        // If there's a key, just fall back to using the State.
        if self.key_length > 0 {
            return self.to_state().update(input).finalize();
        }
        let (mut leaf_words, mut root_words) = self.to_words();
        // Hash each leaf in parallel. With Stride::Parallel, leaf i starts at
        // block i and consumes every DEGREE-th block of the input.
        let jobs = leaf_words.iter_mut().enumerate().map(|(i, words)| {
            // Clamp the start offset so that short inputs hand the later
            // leaves an empty input rather than panicking on a bad slice.
            let input_start = cmp::min(input.len(), i * BLOCKBYTES);
            Job {
                input: &input[input_start..],
                words,
                count: 0,
                last_node: if i == DEGREE - 1 {
                    LastNode::Yes
                } else {
                    LastNode::No
                },
            }
        });
        many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);
        // Hash each leaf into the root.
        finalize_root_words(
            &leaf_words,
            &mut root_words,
            self.hash_length,
            self.implementation,
        )
    }

    /// Construct a BLAKE2bp `State` object based on these parameters.
    pub fn to_state(&self) -> State {
        State::with_params(self)
    }

    /// Set the length of the final hash, from 1 to `OUTBYTES` (64). Apart from controlling the
    /// length of the final `Hash`, this is also associated data, and changing it will result in a
    /// totally different hash.
    ///
    /// # Panics
    ///
    /// Panics if `length` is zero or greater than `OUTBYTES`.
    pub fn hash_length(&mut self, length: usize) -> &mut Self {
        assert!(
            1 <= length && length <= OUTBYTES,
            "Bad hash length: {}",
            length
        );
        self.hash_length = length as u8;
        self
    }

    /// Use a secret key, so that BLAKE2bp acts as a MAC. The maximum key length is `KEYBYTES`
    /// (64). An empty key is equivalent to having no key at all.
    ///
    /// # Panics
    ///
    /// Panics if `key` is longer than `KEYBYTES`.
    pub fn key(&mut self, key: &[u8]) -> &mut Self {
        assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len());
        self.key_length = key.len() as u8;
        // Zero the whole array first, so a shorter key overwrites any longer
        // key that was set previously.
        self.key = [0; KEYBYTES];
        self.key[..key.len()].copy_from_slice(key);
        self
    }
}
181 
182 impl Default for Params {
default() -> Self183     fn default() -> Self {
184         Self::new()
185     }
186 }
187 
188 impl fmt::Debug for Params {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result189     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
190         write!(
191             f,
192             "Params {{ hash_length: {}, key_length: {} }}",
193             self.hash_length,
194             // NB: Don't print the key itself. Debug shouldn't leak secrets.
195             self.key_length,
196         )
197     }
198 }
199 
200 /// An incremental hasher for BLAKE2bp, just like the [`State`](../struct.State.html) type for
201 /// BLAKE2b.
202 ///
203 /// # Example
204 ///
205 /// ```
206 /// use blake2b_simd::blake2bp;
207 ///
208 /// let mut state = blake2bp::State::new();
209 /// state.update(b"foo");
210 /// state.update(b"bar");
211 /// let hash = state.finalize();
212 ///
213 /// let expected = "e654427b6ef02949471712263e59071abbb6aa94855674c1daeed6cfaf127c33\
214 ///                 dfa3205f7f7f71e4f0673d25fa82a368488911f446bccd323af3ab03f53e56e5";
215 /// assert_eq!(expected, &hash.to_hex());
216 /// ```
#[derive(Clone)]
pub struct State {
    // In-progress state words for each of the four leaves.
    leaf_words: [[Word; 8]; DEGREE],
    // Initial state words for the root node; the root's input is only
    // compressed at finalize time.
    root_words: [Word; 8],
    // Note that this buffer is twice as large as what compress4 needs. That guarantees that we
    // have enough input when we compress to know we don't need to finalize any of the leaves.
    buf: [u8; 2 * DEGREE * BLOCKBYTES],
    // Number of valid bytes at the front of `buf`.
    buf_len: u16,
    // Note that this is the *per-leaf* count.
    count: Count,
    // Requested digest length in bytes, 1..=OUTBYTES.
    hash_length: u8,
    // SIMD backend selected when the State was constructed.
    implementation: Implementation,
    // Whether a key block was buffered; affects the accounting in count().
    is_keyed: bool,
}
231 
impl State {
    /// Equivalent to `State::default()` or `Params::default().to_state()`.
    pub fn new() -> Self {
        Self::with_params(&Params::default())
    }

    fn with_params(params: &Params) -> Self {
        let (leaf_words, root_words) = params.to_words();

        // If a key is set, initialize the buffer to contain the key bytes. Note
        // that only the leaves hash key bytes. The root doesn't, even though
        // the key length is still set in its parameters. Again this isn't
        // documented in the spec, but it matches the behavior of the reference
        // implementation:
        // https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L128
        // This particular behavior (though not the inner hash length behavior
        // above) is also corroborated by the official test vectors; see
        // tests/vector_tests.rs.
        let mut buf = [0; 2 * DEGREE * BLOCKBYTES];
        let mut buf_len = 0;
        if params.key_length > 0 {
            // Each leaf gets one full block containing the zero-padded key.
            for i in 0..DEGREE {
                let keybytes = &params.key[..params.key_length as usize];
                buf[i * BLOCKBYTES..][..keybytes.len()].copy_from_slice(keybytes);
                buf_len = BLOCKBYTES * DEGREE;
            }
        }

        Self {
            leaf_words,
            root_words,
            buf,
            buf_len: buf_len as u16,
            count: 0, // count gets updated in self.compress()
            hash_length: params.hash_length,
            implementation: params.implementation,
            is_keyed: params.key_length > 0,
        }
    }

    // Copy as much of `input` as fits into the internal buffer, and advance
    // the caller's input slice past the bytes consumed.
    fn fill_buf(&mut self, input: &mut &[u8]) {
        let take = cmp::min(self.buf.len() - self.buf_len as usize, input.len());
        self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
        self.buf_len += take as u16;
        *input = &input[take..];
    }

    // Compress interleaved input into all four leaves at once, without
    // finalizing any of them, and advance the per-leaf count.
    fn compress_to_leaves(
        leaves: &mut [[Word; 8]; DEGREE],
        input: &[u8],
        count: &mut Count,
        implementation: Implementation,
    ) {
        // Input is assumed to be an even number of blocks for each leaf. Since
        // we're not finalizing, debug asserts will fire otherwise.
        let jobs = leaves.iter_mut().enumerate().map(|(i, words)| {
            Job {
                input: &input[i * BLOCKBYTES..],
                words,
                count: *count,
                last_node: LastNode::No, // irrelevant when not finalizing
            }
        });
        many::compress_many(jobs, implementation, Finalize::No, Stride::Parallel);
        // Note that count is the bytes input *per-leaf*.
        *count = count.wrapping_add((input.len() / DEGREE) as Count);
    }

    /// Add input to the hash. You can call `update` any number of times.
    pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
        // If we have a partial buffer, try to complete it. If we complete it and there's more
        // input waiting, we need to compress to make more room. However, because we need to be
        // sure that *none* of the leaves would need to be finalized as part of this round of
        // compression, we need to buffer more than we would for BLAKE2b.
        if self.buf_len > 0 {
            self.fill_buf(&mut input);
            // The buffer is large enough for two compressions. If we've filled
            // the buffer and there's still more input coming, then we have to
            // do at least one compression. If there's enough input still
            // coming that all the leaves are guaranteed to get more, do both
            // compressions in the buffer. Otherwise, do just one and shift the
            // back half of the buffer to the front.
            if !input.is_empty() {
                if input.len() > (DEGREE - 1) * BLOCKBYTES {
                    // Enough input coming to do both compressions.
                    Self::compress_to_leaves(
                        &mut self.leaf_words,
                        &self.buf,
                        &mut self.count,
                        self.implementation,
                    );
                    self.buf_len = 0;
                } else {
                    // Only enough input coming for one compression.
                    Self::compress_to_leaves(
                        &mut self.leaf_words,
                        &self.buf[..DEGREE * BLOCKBYTES],
                        &mut self.count,
                        self.implementation,
                    );
                    self.buf_len = (DEGREE * BLOCKBYTES) as u16;
                    // Shift the uncompressed back half to the front.
                    let (buf_front, buf_back) = self.buf.split_at_mut(DEGREE * BLOCKBYTES);
                    buf_front.copy_from_slice(buf_back);
                }
            }
        }

        // Now we directly compress as much input as possible, without copying
        // it into the buffer. We need to make sure we buffer at least one byte
        // for each of the leaves, so that we know we don't need to finalize
        // them.
        let needed_tail = (DEGREE - 1) * BLOCKBYTES + 1;
        let mut bulk_bytes = input.len().saturating_sub(needed_tail);
        // Round down to a whole number of blocks per leaf, as
        // compress_to_leaves requires.
        bulk_bytes -= bulk_bytes % (DEGREE * BLOCKBYTES);
        if bulk_bytes > 0 {
            Self::compress_to_leaves(
                &mut self.leaf_words,
                &input[..bulk_bytes],
                &mut self.count,
                self.implementation,
            );
            input = &input[bulk_bytes..];
        }

        // Buffer any remaining input, to be either compressed or finalized in
        // a subsequent call.
        self.fill_buf(&mut input);
        debug_assert_eq!(0, input.len());
        self
    }

    /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
    /// times will give the same result. It's also possible to `update` with more input in between.
    pub fn finalize(&self) -> Hash {
        // Hash whatever's remaining in the buffer and finalize the leaves.
        // Work on copies of the state words, so that `self` is untouched and
        // finalize stays idempotent.
        let buf_len = self.buf_len as usize;
        let mut leaves_copy = self.leaf_words;
        let jobs = leaves_copy
            .iter_mut()
            .enumerate()
            .map(|(leaf_index, leaf_words)| {
                // Clamp the start so leaves with no buffered bytes get an
                // empty input slice.
                let input = &self.buf[cmp::min(leaf_index * BLOCKBYTES, buf_len)..buf_len];
                Job {
                    input,
                    words: leaf_words,
                    count: self.count,
                    last_node: if leaf_index == DEGREE - 1 {
                        LastNode::Yes
                    } else {
                        LastNode::No
                    },
                }
            });
        many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);

        // Concatenate each leaf into the root and hash that.
        let mut root_words_copy = self.root_words;
        finalize_root_words(
            &leaves_copy,
            &mut root_words_copy,
            self.hash_length,
            self.implementation,
        )
    }

    /// Return the total number of bytes input so far.
    ///
    /// Note that `count` doesn't include the bytes of the key block, if any.
    /// It's exactly the total number of input bytes fed to `update`.
    pub fn count(&self) -> Count {
        // Remember that self.count is *per-leaf*, and that buffered bytes
        // haven't been counted yet.
        let mut ret = self
            .count
            .wrapping_mul(DEGREE as Count)
            .wrapping_add(self.buf_len as Count);
        if self.is_keyed {
            // Subtract the key block, which was buffered but isn't input.
            ret -= (DEGREE * BLOCKBYTES) as Count;
        }
        ret
    }
}
413 
#[cfg(feature = "std")]
impl std::io::Write for State {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // Hashing always consumes the whole buffer, so this never reports a
        // partial write.
        let consumed = buf.len();
        self.update(buf);
        Ok(consumed)
    }

    fn flush(&mut self) -> std::io::Result<()> {
        // Nothing is held at the io::Write layer, so flushing is a no-op.
        Ok(())
    }
}
425 
426 impl fmt::Debug for State {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result427     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
428         write!(
429             f,
430             "State {{ count: {}, hash_length: {} }}",
431             self.count(),
432             self.hash_length,
433         )
434     }
435 }
436 
437 impl Default for State {
default() -> Self438     fn default() -> Self {
439         Self::with_params(&Params::default())
440     }
441 }
442 
443 // Compress each of the four finalized hashes into the root words as input,
444 // using two compressions. Note that even if a future version of this
445 // implementation supports the hash_length parameter and sets it as associated
446 // data for all nodes, this step must still use the untruncated output of each
447 // leaf. Note also that, as mentioned above, the root node doesn't hash any key
448 // bytes.
finalize_root_words( leaf_words: &[[Word; 8]; DEGREE], root_words: &mut [Word; 8], hash_length: u8, imp: Implementation, ) -> Hash449 fn finalize_root_words(
450     leaf_words: &[[Word; 8]; DEGREE],
451     root_words: &mut [Word; 8],
452     hash_length: u8,
453     imp: Implementation,
454 ) -> Hash {
455     debug_assert_eq!(OUTBYTES, 8 * size_of::<Word>());
456     let mut block = [0; DEGREE * OUTBYTES];
457     for (word, chunk) in leaf_words
458         .iter()
459         .flat_map(|words| words.iter())
460         .zip(block.chunks_exact_mut(size_of::<Word>()))
461     {
462         chunk.copy_from_slice(&word.to_le_bytes());
463     }
464     imp.compress1_loop(
465         &block,
466         root_words,
467         0,
468         LastNode::Yes,
469         Finalize::Yes,
470         Stride::Serial,
471     );
472     Hash {
473         bytes: crate::state_words_to_bytes(&root_words),
474         len: hash_length,
475     }
476 }
477 
// Test helper: force the portable (non-SIMD) implementation, so the test
// suite can exercise both code paths regardless of the CPU features detected
// at runtime.
pub(crate) fn force_portable(params: &mut Params) {
    params.implementation = Implementation::portable();
}
481 
#[cfg(test)]
pub(crate) mod test {
    use super::*;
    use crate::paint_test_input;

    // This is a simple reference implementation without the complicated buffering or parameter
    // support of the real implementation. We need this because the official test vectors don't
    // include any inputs large enough to exercise all the branches in the buffering logic.
    fn blake2bp_reference(input: &[u8]) -> Hash {
        // Build the four leaf states with the BLAKE2bp tree parameters.
        let mut leaves = arrayvec::ArrayVec::<[_; DEGREE]>::new();
        for leaf_index in 0..DEGREE {
            leaves.push(
                crate::Params::new()
                    .fanout(DEGREE as u8)
                    .max_depth(2)
                    .node_offset(leaf_index as u64)
                    .inner_hash_length(OUTBYTES)
                    .to_state(),
            );
        }
        leaves[DEGREE - 1].set_last_node(true);
        // Distribute input blocks round-robin across the leaves.
        for (i, chunk) in input.chunks(BLOCKBYTES).enumerate() {
            leaves[i % DEGREE].update(chunk);
        }
        // The root hashes the concatenation of the untruncated leaf outputs.
        let mut root = crate::Params::new()
            .fanout(DEGREE as u8)
            .max_depth(2)
            .node_depth(1)
            .inner_hash_length(OUTBYTES)
            .last_node(true)
            .to_state();
        for leaf in &mut leaves {
            root.update(leaf.finalize().as_bytes());
        }
        root.finalize()
    }

    #[test]
    fn test_against_reference() {
        let mut buf = [0; 21 * BLOCKBYTES];
        paint_test_input(&mut buf);
        // - 8 blocks is just enough to fill the double buffer.
        // - 9 blocks triggers the "perform one compression on the double buffer" case.
        // - 11 blocks is the largest input where only one compression may be performed, on the
        //   first half of the buffer, because there's not enough input to avoid needing to
        //   finalize the second half.
        // - 12 blocks triggers the "perform both compressions in the double buffer" case.
        // - 15 blocks is the largest input where, after compressing 8 blocks from the buffer,
        //   there's not enough input to hash directly from memory.
        // - 16 blocks triggers "after emptying the buffer, hash directly from memory".
        for num_blocks in 0..=20 {
            for &extra in &[0, 1, BLOCKBYTES - 1] {
                for &portable in &[false, true] {
                    // eprintln!("\ncase -----");
                    // dbg!(num_blocks);
                    // dbg!(extra);
                    // dbg!(portable);

                    // First hash the input all at once, as a sanity check.
                    let mut params = Params::new();
                    if portable {
                        force_portable(&mut params);
                    }
                    let input = &buf[..num_blocks * BLOCKBYTES + extra];
                    let expected = blake2bp_reference(&input);
                    let mut state = params.to_state();
                    let found = state.update(input).finalize();
                    assert_eq!(expected, found);

                    // Then, do it again, but buffer 1 byte of input first. That causes the buffering
                    // branch to trigger.
                    let mut state = params.to_state();
                    let maybe_one = cmp::min(1, input.len());
                    state.update(&input[..maybe_one]);
                    assert_eq!(maybe_one as Count, state.count());
                    // Do a throwaway finalize here to check for idempotency.
                    state.finalize();
                    state.update(&input[maybe_one..]);
                    assert_eq!(input.len() as Count, state.count());
                    let found = state.finalize();
                    assert_eq!(expected, found);

                    // Finally, do it again with the all-at-once interface.
                    assert_eq!(expected, blake2bp(input));
                }
            }
        }
    }
}
571