// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>

//! An implementation of 4-way vectorized 32-bit field arithmetic using
//! AVX2.
//!
//! The `FieldElement2625x4` struct provides a vector of four field
//! elements, implemented using AVX2 operations.  Its API is designed
//! to abstract away the platform-dependent details, so that point
//! arithmetic can be implemented only in terms of a vector of field
//! elements.
//!
//! At this level, the API is optimized for speed and not safety.  The
//! `FieldElement2625x4` does not always perform reductions.  The pre-
//! and post-conditions on the bounds of the coefficients are
//! documented for each method, but it is the caller's responsibility
//! to ensure that there are no overflows.
25 
26 #![allow(non_snake_case)]
27 
// Blend masks for `_mm256_blend_epi32`, selecting the 32-bit lanes that
// belong to each of the four field elements A, B, C, D.
//
// Each coefficient vector is laid out (see `new`/`split`) as
// (a_2i, b_2i, a_{2i+1}, b_{2i+1}, c_2i, d_2i, c_{2i+1}, d_{2i+1}),
// so A occupies 32-bit lanes 0 and 2, B lanes 1 and 3, C lanes 4 and 6,
// and D lanes 5 and 7.
const A_LANES: u8 = 0b0000_0101;
const B_LANES: u8 = 0b0000_1010;
const C_LANES: u8 = 0b0101_0000;
const D_LANES: u8 = 0b1010_0000;

// Blend masks selecting the same elements when the vector is viewed as
// four 64-bit lanes (each 64-bit lane spans two 32-bit blend bits), as
// produced by `unpack_pair`: lane 0 = A, 1 = B, 2 = C, 3 = D.
#[allow(unused)]
const A_LANES64: u8 = 0b00_00_00_11;
#[allow(unused)]
const B_LANES64: u8 = 0b00_00_11_00;
#[allow(unused)]
const C_LANES64: u8 = 0b00_11_00_00;
#[allow(unused)]
const D_LANES64: u8 = 0b11_00_00_00;
41 
42 use core::ops::{Add, Mul, Neg};
43 use packed_simd::{i32x8, u32x8, u64x4, IntoBits};
44 
45 use backend::vector::avx2::constants::{P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO};
46 use backend::serial::u64::field::FieldElement51;
47 
/// Unpack 32-bit lanes into 64-bit lanes:
/// ```ascii,no_run
/// (a0, b0, a1, b1, c0, d0, c1, d1)
/// ```
/// into
/// ```ascii,no_run
/// (a0, 0, b0, 0, c0, 0, d0, 0)
/// (a1, 0, b1, 0, c1, 0, d1, 0)
/// ```
#[inline(always)]
fn unpack_pair(src: u32x8) -> (u32x8, u32x8) {
    let a: u32x8;
    let b: u32x8;
    let zero = i32x8::new(0, 0, 0, 0, 0, 0, 0, 0);
    unsafe {
        use core::arch::x86_64::_mm256_unpackhi_epi32;
        use core::arch::x86_64::_mm256_unpacklo_epi32;
        // SAFETY: these intrinsics require AVX2, which this (AVX2)
        // backend presumes is available.
        //
        // vpunpckldq / vpunpckhdq interleave `src` with zeros within
        // each 128-bit half, so each 32-bit lane is zero-extended to
        // 64 bits without any cross-lane traffic.
        a = _mm256_unpacklo_epi32(src.into_bits(), zero.into_bits()).into_bits();
        b = _mm256_unpackhi_epi32(src.into_bits(), zero.into_bits()).into_bits();
    }
    (a, b)
}
70 
/// Repack 64-bit lanes into 32-bit lanes:
/// ```ascii,no_run
/// (a0, 0, b0, 0, c0, 0, d0, 0)
/// (a1, 0, b1, 0, c1, 0, d1, 0)
/// ```
/// into
/// ```ascii,no_run
/// (a0, b0, a1, b1, c0, d0, c1, d1)
/// ```
///
/// Inverse of `unpack_pair` (assuming the high halves of the 64-bit
/// lanes are zero, which holds for reduced intermediate values).
#[inline(always)]
fn repack_pair(x: u32x8, y: u32x8) -> u32x8 {
    unsafe {
        use core::arch::x86_64::_mm256_blend_epi32;
        use core::arch::x86_64::_mm256_shuffle_epi32;

        // SAFETY: AVX2 intrinsics; this module is the AVX2 backend.

        // Input: x = (a0, 0, b0, 0, c0, 0, d0, 0)
        // Input: y = (a1, 0, b1, 0, c1, 0, d1, 0)

        // In-lane shuffles compact the nonzero words of each input
        // into opposite halves of each 128-bit lane...
        let x_shuffled = _mm256_shuffle_epi32(x.into_bits(), 0b11_01_10_00);
        let y_shuffled = _mm256_shuffle_epi32(y.into_bits(), 0b10_00_11_01);

        // x' = (a0, b0,  0,  0, c0, d0,  0,  0)
        // y' = ( 0,  0, a1, b1,  0,  0, c1, d1)

        // ...so a single blend can merge them, discarding the zeros.
        return _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into_bits();
    }
}
98 
/// The `Lanes` enum represents a subset of the lanes `A,B,C,D` of a
/// `FieldElement2625x4`.
///
/// It's used to specify blend operations without
/// having to know details about the data layout of the
/// `FieldElement2625x4`.
///
/// Only the lane subsets actually needed by the point arithmetic are
/// represented, not all fifteen nonempty possibilities.
#[derive(Copy, Clone, Debug)]
pub enum Lanes {
    C,
    D,
    AB,
    AC,
    CD,
    AD,
    BC,
    ABCD,
}
116 
/// The `Shuffle` enum represents a shuffle of a `FieldElement2625x4`.
///
/// The enum variants are named by what they do to a vector \\(
/// (A,B,C,D) \\); for instance, `Shuffle::BADC` turns \\( (A, B, C,
/// D) \\) into \\( (B, A, D, C) \\).
///
/// Each variant corresponds to a fixed lane permutation (see
/// `FieldElement2625x4::shuffle`), implemented with a single `vpermd`.
#[derive(Copy, Clone, Debug)]
pub enum Shuffle {
    AAAA,
    BBBB,
    CACA,
    DBBD,
    ADDA,
    CBCB,
    ABAB,
    BADC,
    BACD,
    ABDC,
}
135 
/// A vector of four field elements.
///
/// Each operation on a `FieldElement2625x4` has documented effects on
/// the bounds of the coefficients.  This API is designed for speed
/// and not safety; it is the caller's responsibility to ensure that
/// the post-conditions of one operation are compatible with the
/// pre-conditions of the next.
///
/// Data layout (established by `new` and recovered by `split`): the
/// four elements A, B, C, D are stored in radix \\(2^{25.5}\\) (ten
/// alternating 26-/25-bit limbs each), with the \\(i\\)-th vector
/// holding
/// ```ascii,no_run
/// (a_2i, b_2i, a_2i+1, b_2i+1, c_2i, d_2i, c_2i+1, d_2i+1)
/// ```
/// — the odd limbs are pre-swapped next to their even partners so
/// that splitting needs no cross-lane shuffle.
#[derive(Clone, Copy, Debug)]
pub struct FieldElement2625x4(pub(crate) [u32x8; 5]);
145 
146 use subtle::Choice;
147 use subtle::ConditionallySelectable;
148 
impl ConditionallySelectable for FieldElement2625x4 {
    /// Constant-time selection: returns `a` when `choice == 0` and
    /// `b` when `choice == 1`, without branching on the secret bit.
    fn conditional_select(
        a: &FieldElement2625x4,
        b: &FieldElement2625x4,
        choice: Choice,
    ) -> FieldElement2625x4 {
        // Expand the choice bit to an all-zeros / all-ones word:
        // -(0) = 0x00000000, -(1) = 0xffffffff.
        let mask = (-(choice.unwrap_u8() as i32)) as u32;
        let mask_vec = u32x8::splat(mask);
        // a ^ (mask & (a ^ b)) yields a when mask is zero and b when
        // mask is all-ones — the standard branch-free select.
        FieldElement2625x4([
            a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
            a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
            a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
            a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
            a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
        ])
    }

    /// Constant-time assignment: overwrites `self` with `other` when
    /// `choice == 1`, leaves it unchanged when `choice == 0`.
    fn conditional_assign(
        &mut self,
        other: &FieldElement2625x4,
        choice: Choice,
    ) {
        // Same masked-xor trick as `conditional_select`, in place.
        let mask = (-(choice.unwrap_u8() as i32)) as u32;
        let mask_vec = u32x8::splat(mask);
        self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
        self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
        self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
        self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
        self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
    }
}
180 
181 impl FieldElement2625x4 {
    /// Split this vector into an array of four (serial) field
    /// elements.
    ///
    /// Inverse of `new`: each 51-bit output limb is recombined from a
    /// low 26-bit half-limb and a high half-limb shifted up by 26.
    pub fn split(&self) -> [FieldElement51; 4] {
        let mut out = [FieldElement51::zero(); 4];
        for i in 0..5 {
            let a_2i   = self.0[i].extract(0) as u64; //
            let b_2i   = self.0[i].extract(1) as u64; //
            let a_2i_1 = self.0[i].extract(2) as u64; // `.
            let b_2i_1 = self.0[i].extract(3) as u64; //  | pre-swapped to avoid
            let c_2i   = self.0[i].extract(4) as u64; //  | a cross lane shuffle
            let d_2i   = self.0[i].extract(5) as u64; // .'
            let c_2i_1 = self.0[i].extract(6) as u64; //
            let d_2i_1 = self.0[i].extract(7) as u64; //

            // Recombine radix-2^25.5 half-limbs into radix-2^51 limbs.
            out[0].0[i] = a_2i + (a_2i_1 << 26);
            out[1].0[i] = b_2i + (b_2i_1 << 26);
            out[2].0[i] = c_2i + (c_2i_1 << 26);
            out[3].0[i] = d_2i + (d_2i_1 << 26);
        }

        out
    }
204 
    /// Rearrange the elements of this vector according to `control`.
    ///
    /// The `control` parameter should be a compile-time constant, so
    /// that when this function is inlined, LLVM is able to lower the
    /// shuffle using an immediate.
    #[inline]
    pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 {
        // Permute one coefficient vector; the same permutation is
        // applied to all five vectors below.
        #[inline(always)]
        fn shuffle_lanes(x: u32x8, control: Shuffle) -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_permutevar8x32_epi32;

                // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
                //
                // Each index vector maps the source lanes
                // (a0, b0, a1, b1, c0, d0, c1, d1) onto the named
                // element pattern, preserving the pre-swapped layout.
                let c: u32x8 = match control {
                    Shuffle::AAAA => u32x8::new(0, 0, 2, 2, 0, 0, 2, 2),
                    Shuffle::BBBB => u32x8::new(1, 1, 3, 3, 1, 1, 3, 3),
                    Shuffle::CACA => u32x8::new(4, 0, 6, 2, 4, 0, 6, 2),
                    Shuffle::DBBD => u32x8::new(5, 1, 7, 3, 1, 5, 3, 7),
                    Shuffle::ADDA => u32x8::new(0, 5, 2, 7, 5, 0, 7, 2),
                    Shuffle::CBCB => u32x8::new(4, 1, 6, 3, 4, 1, 6, 3),
                    Shuffle::ABAB => u32x8::new(0, 1, 2, 3, 0, 1, 2, 3),
                    Shuffle::BADC => u32x8::new(1, 0, 3, 2, 5, 4, 7, 6),
                    Shuffle::BACD => u32x8::new(1, 0, 3, 2, 4, 5, 6, 7),
                    Shuffle::ABDC => u32x8::new(0, 1, 2, 3, 5, 4, 7, 6),
                };
                // Note that this gets turned into a generic LLVM
                // shuffle-by-constants, which can be lowered to a simpler
                // instruction than a generic permute.
                _mm256_permutevar8x32_epi32(x.into_bits(), c.into_bits()).into_bits()
            }
        }

        FieldElement2625x4([
            shuffle_lanes(self.0[0], control),
            shuffle_lanes(self.0[1], control),
            shuffle_lanes(self.0[2], control),
            shuffle_lanes(self.0[3], control),
            shuffle_lanes(self.0[4], control),
        ])
    }
244 
    /// Blend `self` with `other`, taking lanes specified in `control` from `other`.
    ///
    /// The `control` parameter should be a compile-time constant, so
    /// that this function can be inlined and LLVM can lower it to a
    /// blend instruction using an immediate.
    #[inline]
    pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 {
        // Blend one coefficient vector; applied to all five below.
        #[inline(always)]
        fn blend_lanes(x: u32x8, y: u32x8, control: Lanes) -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;

                // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
                //
                // This would be much cleaner if we could factor out the match
                // statement on the control. Unfortunately, rustc forgets
                // constant-info very quickly, so we can't even write
                // ```
                // match control {
                //     Lanes::C => {
                //         let imm = C_LANES as i32;
                //         _mm256_blend_epi32(..., imm)
                // ```
                // let alone
                // ```
                // let imm = match control {
                //     Lanes::C => C_LANES as i32,
                // }
                // _mm256_blend_epi32(..., imm)
                // ```
                // even though both of these would be constant-folded by LLVM
                // at a lower level (as happens in the shuffle implementation,
                // which does not require a shuffle immediate but *is* lowered
                // to immediate shuffles anyways).
                match control {
                    Lanes::C => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), C_LANES as i32).into_bits()
                    }
                    Lanes::D => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), D_LANES as i32).into_bits()
                    }
                    Lanes::AD => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | D_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::AB => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | B_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::AC => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | C_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::CD => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (C_LANES | D_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::BC => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (B_LANES | C_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::ABCD => _mm256_blend_epi32(
                        x.into_bits(),
                        y.into_bits(),
                        (A_LANES | B_LANES | C_LANES | D_LANES) as i32,
                    ).into_bits(),
                }
            }
        }

        FieldElement2625x4([
            blend_lanes(self.0[0], other.0[0], control),
            blend_lanes(self.0[1], other.0[1], control),
            blend_lanes(self.0[2], other.0[2], control),
            blend_lanes(self.0[3], other.0[3], control),
            blend_lanes(self.0[4], other.0[4], control),
        ])
    }
321 
322     /// Construct a vector of zeros.
zero() -> FieldElement2625x4323     pub fn zero() -> FieldElement2625x4 {
324         FieldElement2625x4([u32x8::splat(0); 5])
325     }
326 
    /// Convenience wrapper around `new(x,x,x,x)`.
    ///
    /// Broadcasts a single serial field element into all four lanes;
    /// the result is reduced by `new` (bounded with \\( b < 0.0002 \\)).
    pub fn splat(x: &FieldElement51) -> FieldElement2625x4 {
        FieldElement2625x4::new(x, x, x, x)
    }
331 
    /// Create a `FieldElement2625x4` from four `FieldElement51`s.
    ///
    /// # Postconditions
    ///
    /// The resulting `FieldElement2625x4` is bounded with \\( b < 0.0002 \\).
    pub fn new(
        x0: &FieldElement51,
        x1: &FieldElement51,
        x2: &FieldElement51,
        x3: &FieldElement51,
    ) -> FieldElement2625x4 {
        let mut buf = [u32x8::splat(0); 5];
        let low_26_bits = (1 << 26) - 1;
        for i in 0..5 {
            // Split each 51-bit limb into a low 26-bit half-limb and a
            // high (nominally 25-bit) half-limb.
            let a_2i   = (x0.0[i] & low_26_bits) as u32;
            let a_2i_1 = (x0.0[i] >> 26) as u32;
            let b_2i   = (x1.0[i] & low_26_bits) as u32;
            let b_2i_1 = (x1.0[i] >> 26) as u32;
            let c_2i   = (x2.0[i] & low_26_bits) as u32;
            let c_2i_1 = (x2.0[i] >> 26) as u32;
            let d_2i   = (x3.0[i] & low_26_bits) as u32;
            let d_2i_1 = (x3.0[i] >> 26) as u32;

            // Lane order matches `split`: odd half-limbs pre-swapped
            // next to their even partners to avoid cross-lane shuffles.
            buf[i] = u32x8::new(a_2i, b_2i, a_2i_1, b_2i_1, c_2i, d_2i, c_2i_1, d_2i_1);
        }

        // We don't know that the original `FieldElement51`s were
        // fully reduced, so the odd limbs may exceed 2^25.
        // Reduce them to be sure.
        FieldElement2625x4(buf).reduce()
    }
363 
    /// Given \\((A,B,C,D)\\), compute \\((-A,-B,-C,-D)\\), without
    /// performing a reduction.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 0.999 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 1 \\).
    #[inline]
    pub fn negate_lazy(&self) -> FieldElement2625x4 {
        // Negation is computed as (2*p - self), congruent to -self
        // mod p.  The lowest limb of 2*p differs from the rest, hence
        // the separate `P_TIMES_2_LO` constant for coefficient 0.
        //
        // The limbs of self are bounded with b < 0.999, while the
        // smallest limb of 2*p is 67108845 > 2^{26+0.9999}, so
        // underflows are not possible.
        FieldElement2625x4([
            P_TIMES_2_LO - self.0[0],
            P_TIMES_2_HI - self.0[1],
            P_TIMES_2_HI - self.0[2],
            P_TIMES_2_HI - self.0[3],
            P_TIMES_2_HI - self.0[4],
        ])
    }
387 
388     /// Given `self = (A,B,C,D)`, compute `(B - A, B + A, D - C, D + C)`.
389     ///
390     /// # Preconditions
391     ///
392     /// The coefficients of `self` must be bounded with \\( b < 0.01 \\).
393     ///
394     /// # Postconditions
395     ///
396     /// The coefficients of the result are bounded with \\( b < 1.6 \\).
397     #[inline]
diff_sum(&self) -> FieldElement2625x4398     pub fn diff_sum(&self) -> FieldElement2625x4 {
399         // tmp1 = (B, A, D, C)
400         let tmp1 = self.shuffle(Shuffle::BADC);
401         // tmp2 = (-A, B, -C, D)
402         let tmp2 = self.blend(self.negate_lazy(), Lanes::AC);
403         // (B - A, B + A, D - C, D + C) bounded with b < 1.6
404         tmp1 + tmp2
405     }
406 
    /// Reduce this vector of field elements \\(\mathrm{mod} p\\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.0002 \\).
    #[inline]
    pub fn reduce(&self) -> FieldElement2625x4 {
        // Even half-limbs hold 26 bits and odd half-limbs 25; the
        // shift/mask lanes follow the (lo, lo, hi, hi, ...) layout.
        let shifts = i32x8::new(26, 26, 25, 25, 26, 26, 25, 25);
        let masks = u32x8::new(
            (1 << 26) - 1,
            (1 << 26) - 1,
            (1 << 25) - 1,
            (1 << 25) - 1,
            (1 << 26) - 1,
            (1 << 26) - 1,
            (1 << 25) - 1,
            (1 << 25) - 1,
        );

        // Let c(x) denote the carryout of the coefficient x.
        //
        // Given    (   x0,    y0,    x1,    y1,    z0,    w0,    z1,    w1),
        // compute  (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0)).
        //
        // The carryouts are bounded by 2^(32 - 25) = 2^7.
        let rotated_carryout = |v: u32x8| -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_srlv_epi32;
                use core::arch::x86_64::_mm256_shuffle_epi32;

                // SAFETY: AVX2 intrinsics; this is the AVX2 backend.
                let c = _mm256_srlv_epi32(v.into_bits(), shifts.into_bits());
                _mm256_shuffle_epi32(c, 0b01_00_11_10).into_bits()
            }
        };

        // Combine (lo, lo, lo, lo, lo, lo, lo, lo)
        //    with (hi, hi, hi, hi, hi, hi, hi, hi)
        //      to (lo, lo, hi, hi, lo, lo, hi, hi)
        //
        // This allows combining carryouts, e.g.,
        //
        // lo  (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0))
        // hi  (c(x3), c(y3), c(x2), c(y2), c(z3), c(w3), c(z2), c(w2))
        // ->  (c(x1), c(y1), c(x2), c(y2), c(z1), c(w1), c(z2), c(w2))
        //
        // which is exactly the vector of carryins for
        //
        //     (   x2,    y2,    x3,    y3,    z2,    w2,    z3,    w3).
        //
        let combine = |v_lo: u32x8, v_hi: u32x8| -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;
                // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
                _mm256_blend_epi32(v_lo.into_bits(), v_hi.into_bits(), 0b11_00_11_00).into_bits()
            }
        };

        let mut v = self.0;

        // One pass of the carry chain: mask each vector's limbs and
        // add the carryins produced by the previous vector.
        let c10 = rotated_carryout(v[0]);
        v[0] = (v[0] & masks) + combine(u32x8::splat(0), c10);

        let c32 = rotated_carryout(v[1]);
        v[1] = (v[1] & masks) + combine(c10, c32);

        let c54 = rotated_carryout(v[2]);
        v[2] = (v[2] & masks) + combine(c32, c54);

        let c76 = rotated_carryout(v[3]);
        v[3] = (v[3] & masks) + combine(c54, c76);

        let c98 = rotated_carryout(v[4]);
        v[4] = (v[4] & masks) + combine(c76, c98);

        // The carry out of the top limb wraps around to limb 0,
        // multiplied by 19 (since 2^255 = 19 mod p).
        let c9_19: u32x8 = unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;
            use core::arch::x86_64::_mm256_shuffle_epi32;

            // SAFETY: AVX2 intrinsics; this is the AVX2 backend.
            //
            // Need to rearrange c98, since vpmuludq uses the low
            // 32-bits of each 64-bit lane to compute the product:
            //
            // c98       = (c(x9), c(y9), c(x8), c(y8), c(z9), c(w9), c(z8), c(w8));
            // c9_spread = (c(x9), c(x8), c(y9), c(y8), c(z9), c(z8), c(w9), c(w8)).
            let c9_spread = _mm256_shuffle_epi32(c98.into_bits(), 0b11_01_10_00);

            // Since the carryouts are bounded by 2^7, their products with 19
            // are bounded by 2^11.25.  This means that
            //
            // c9_19_spread = (19*c(x9), 0, 19*c(y9), 0, 19*c(z9), 0, 19*c(w9), 0).
            let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into_bits());

            // Unshuffle:
            // c9_19 = (19*c(x9), 19*c(y9), 0, 0, 19*c(z9), 19*c(w9), 0, 0).
            _mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into_bits()
        };

        // Add the final carryin.
        v[0] = v[0] + c9_19;

        // Each output coefficient has exactly one carryin, which is
        // bounded by 2^11.25, so they are bounded as
        //
        // c_even < 2^26 + 2^11.25 < 26.00006 < 2^{26+b}
        // c_odd  < 2^25 + 2^11.25 < 25.0001  < 2^{25+b}
        //
        // where b = 0.0002.
        FieldElement2625x4(v)
    }
514 
    /// Given an array of wide coefficients, reduce them to a `FieldElement2625x4`.
    ///
    /// The ten `u64x4` inputs are the unreduced radix-2^25.5
    /// coefficients (one 64-bit lane per element A,B,C,D), as
    /// produced by the multiplication and squaring routines.
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    #[inline]
    fn reduce64(mut z: [u64x4; 10]) -> FieldElement2625x4 {
        // These aren't const because splat isn't a const fn
        let LOW_25_BITS: u64x4 = u64x4::splat((1 << 25) - 1);
        let LOW_26_BITS: u64x4 = u64x4::splat((1 << 26) - 1);

        // Carry the value from limb i = 0..8 to limb i+1
        let carry = |z: &mut [u64x4; 10], i: usize| {
            debug_assert!(i < 9);
            if i % 2 == 0 {
                // Even limbs have 26 bits
                z[i + 1] = z[i + 1] + (z[i] >> 26);
                z[i] = z[i] & LOW_26_BITS;
            } else {
                // Odd limbs have 25 bits
                z[i + 1] = z[i + 1] + (z[i] >> 25);
                z[i] = z[i] & LOW_25_BITS;
            }
        };

        // Perform two halves of the carry chain in parallel.
        carry(&mut z, 0); carry(&mut z, 4);
        carry(&mut z, 1); carry(&mut z, 5);
        carry(&mut z, 2); carry(&mut z, 6);
        carry(&mut z, 3); carry(&mut z, 7);
        // Since z[3] < 2^64, c < 2^(64-25) = 2^39,
        // so    z[4] < 2^26 + 2^39 < 2^39.0002
        carry(&mut z, 4); carry(&mut z, 8);
        // Now z[4] < 2^26
        // and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)

        // Last carry has a multiplication by 19.  In the serial case we
        // do a 64-bit multiplication by 19, but here we want to do a
        // 32-bit multiplication.  However, if we only know z[9] < 2^64,
        // the carry is bounded as c < 2^(64-25) = 2^39, which is too
        // big.  To ensure c < 2^32, we would need z[9] < 2^57.
        // Instead, we split the carry in two, with c = c_0 + c_1*2^26.

        let c = z[9] >> 25;
        z[9] = z[9] & LOW_25_BITS;
        let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26;
        let mut c1: u64x4 = c >> 26;         // c1 < 2^(39-26) = 2^13;

        unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;
            // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
            // vpmuludq multiplies the low 32 bits of each 64-bit lane,
            // which is exactly where c0 and c1 live after the masking
            // and shifting above.
            let x19 = u64x4::splat(19);
            c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25
            c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25
        }

        z[0] = z[0] + c0; // z0 < 2^26 + 2^30.25 < 2^30.33
        z[1] = z[1] + c1; // z1 < 2^25 + 2^17.25 < 2^25.0067
        carry(&mut z, 0); // z0 < 2^26, z1 < 2^25.0067 + 2^4.33 = 2^25.007

        // The output coefficients are bounded with
        //
        // b = 0.007  for z[1]
        // b = 0.0004 for z[5]
        // b = 0      for other z[i].
        //
        // So the packed result is bounded with b = 0.007.
        FieldElement2625x4([
            repack_pair(z[0].into_bits(), z[1].into_bits()),
            repack_pair(z[2].into_bits(), z[3].into_bits()),
            repack_pair(z[4].into_bits(), z[5].into_bits()),
            repack_pair(z[6].into_bits(), z[7].into_bits()),
            repack_pair(z[8].into_bits(), z[9].into_bits()),
        ])
    }
589 
    /// Square this field element, and negate the result's \\(D\\) value.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 1.5 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    pub fn square_and_negate_D(&self) -> FieldElement2625x4 {
        // 32x32 -> 64-bit lanewise product (vpmuludq).
        #[inline(always)]
        fn m(x: u32x8, y: u32x8) -> u64x4 {
            use core::arch::x86_64::_mm256_mul_epu32;
            // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        // Same product, but the result is viewed as u32x8 (the high
        // words hold the upper product halves).
        #[inline(always)]
        fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
            use core::arch::x86_64::_mm256_mul_epu32;
            // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);

        // Unpack the ten radix-2^25.5 coefficients into 64-bit lanes.
        let (x0, x1) = unpack_pair(self.0[0]);
        let (x2, x3) = unpack_pair(self.0[1]);
        let (x4, x5) = unpack_pair(self.0[2]);
        let (x6, x7) = unpack_pair(self.0[3]);
        let (x8, x9) = unpack_pair(self.0[4]);

        // Precompute doubled coefficients and 19-multiples used by the
        // schoolbook squaring formulas below.
        let x0_2   = x0 << 1;
        let x1_2   = x1 << 1;
        let x2_2   = x2 << 1;
        let x3_2   = x3 << 1;
        let x4_2   = x4 << 1;
        let x5_2   = x5 << 1;
        let x6_2   = x6 << 1;
        let x7_2   = x7 << 1;

        let x5_19  = m_lo(v19, x5);
        let x6_19  = m_lo(v19, x6);
        let x7_19  = m_lo(v19, x7);
        let x8_19  = m_lo(v19, x8);
        let x9_19  = m_lo(v19, x9);

        // Schoolbook squaring with the wraparound terms folded in via
        // the precomputed 19-multiples (2^255 = 19 mod p).
        let mut z0 = m(x0,  x0) + m(x2_2,x8_19) + m(x4_2,x6_19) + ((m(x1_2,x9_19) +  m(x3_2,x7_19) +    m(x5,x5_19)) << 1);
        let mut z1 = m(x0_2,x1) + m(x3_2,x8_19) + m(x5_2,x6_19) +                  ((m(x2,x9_19)   +    m(x4,x7_19)) << 1);
        let mut z2 = m(x0_2,x2) + m(x1_2,x1)    + m(x4_2,x8_19) + m(x6,x6_19)    + ((m(x3_2,x9_19) +  m(x5_2,x7_19)) << 1);
        let mut z3 = m(x0_2,x3) + m(x1_2,x2)    + m(x5_2,x8_19) +                  ((m(x4,x9_19)   +    m(x6,x7_19)) << 1);
        let mut z4 = m(x0_2,x4) + m(x1_2,x3_2)  + m(x2,  x2)    + m(x6_2,x8_19)  + ((m(x5_2,x9_19) +    m(x7,x7_19)) << 1);
        let mut z5 = m(x0_2,x5) + m(x1_2,x4)    + m(x2_2,x3)    + m(x7_2,x8_19)                    +  ((m(x6,x9_19)) << 1);
        let mut z6 = m(x0_2,x6) + m(x1_2,x5_2)  + m(x2_2,x4)    + m(x3_2,x3) + m(x8,x8_19)        + ((m(x7_2,x9_19)) << 1);
        let mut z7 = m(x0_2,x7) + m(x1_2,x6)    + m(x2_2,x5)    + m(x3_2,x4)                      +   ((m(x8,x9_19)) << 1);
        let mut z8 = m(x0_2,x8) + m(x1_2,x7_2)  + m(x2_2,x6)    + m(x3_2,x5_2) + m(x4,x4)         +   ((m(x9,x9_19)) << 1);
        let mut z9 = m(x0_2,x9) + m(x1_2,x8)    + m(x2_2,x7)    + m(x3_2,x6) + m(x4_2,x5);

        // The biggest z_i is bounded as z_i < 249*2^(51 + 2*b);
        // if b < 1.5 we get z_i < 4485585228861014016.
        //
        // The limbs of the multiples of p are bounded above by
        //
        // 0x3fffffff << 37 = 9223371899415822336 < 2^63
        //
        // and below by
        //
        // 0x1fffffff << 37 = 4611685880988434432
        //                  > 4485585228861014016
        //
        // So these multiples of p are big enough to avoid underflow
        // in subtraction, and small enough to fit within u64
        // with room for a carry.

        let low__p37 = u64x4::splat(0x3ffffed << 37);
        let even_p37 = u64x4::splat(0x3ffffff << 37);
        let odd__p37 = u64x4::splat(0x1ffffff << 37);

        // Replace the D lane (64-bit lane 3) of x with (p_multiple - x),
        // leaving the A, B, C lanes untouched.
        let negate_D = |x: u64x4, p: u64x4| -> u64x4 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;
                // SAFETY: AVX2 intrinsic; this is the AVX2 backend.
                _mm256_blend_epi32(x.into_bits(), (p - x).into_bits(), D_LANES64 as i32).into_bits()
            }
        };

        z0 = negate_D(z0, low__p37);
        z1 = negate_D(z1, odd__p37);
        z2 = negate_D(z2, even_p37);
        z3 = negate_D(z3, odd__p37);
        z4 = negate_D(z4, even_p37);
        z5 = negate_D(z5, odd__p37);
        z6 = negate_D(z6, even_p37);
        z7 = negate_D(z7, odd__p37);
        z8 = negate_D(z8, even_p37);
        z9 = negate_D(z9, odd__p37);

        FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
    }
686 }
687 
impl Neg for FieldElement2625x4 {
    type Output = FieldElement2625x4;

    /// Negate this field element, performing a reduction.
    ///
    /// If the coefficients are known to be small, use `negate_lazy`
    /// to avoid performing a reduction.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 4.0 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.0002 \\).
    #[inline]
    fn neg(self) -> FieldElement2625x4 {
        // Compute (16*p - self), congruent to -self mod p; the larger
        // multiple of p (vs. negate_lazy's 2*p) accommodates the
        // looser b < 4.0 precondition without underflow.  The lowest
        // limb of 16*p differs from the rest, hence `P_TIMES_16_LO`.
        FieldElement2625x4([
            P_TIMES_16_LO - self.0[0],
            P_TIMES_16_HI - self.0[1],
            P_TIMES_16_HI - self.0[2],
            P_TIMES_16_HI - self.0[3],
            P_TIMES_16_HI - self.0[4],
        ]).reduce()
    }
}
714 
715 impl Add<FieldElement2625x4> for FieldElement2625x4 {
716     type Output = FieldElement2625x4;
717     /// Add two `FieldElement2625x4`s, without performing a reduction.
718     #[inline]
add(self, rhs: FieldElement2625x4) -> FieldElement2625x4719     fn add(self, rhs: FieldElement2625x4) -> FieldElement2625x4 {
720         FieldElement2625x4([
721             self.0[0] + rhs.0[0],
722             self.0[1] + rhs.0[1],
723             self.0[2] + rhs.0[2],
724             self.0[3] + rhs.0[3],
725             self.0[4] + rhs.0[4],
726         ])
727     }
728 }
729 
impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 {
    type Output = FieldElement2625x4;
    /// Perform a multiplication by a vector of small constants.
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    ///
    /// NOTE(review): the scalars must be small enough that the
    /// 32x32->64-bit products stay within `reduce64`'s input bounds;
    /// the required bound on the scalars is not checked here — confirm
    /// at call sites.
    #[inline]
    fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 {
        unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;

            // SAFETY: AVX2 intrinsic; this is the AVX2 backend.

            // Place each scalar in the low 32 bits of a 64-bit lane
            // (lane order A, B, C, D), matching the unpacked operands.
            let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0);

            // Widen the ten 32-bit coefficient vectors to 64-bit lanes.
            let (b0, b1) = unpack_pair(self.0[0]);
            let (b2, b3) = unpack_pair(self.0[1]);
            let (b4, b5) = unpack_pair(self.0[2]);
            let (b6, b7) = unpack_pair(self.0[3]);
            let (b8, b9) = unpack_pair(self.0[4]);

            // Multiply each coefficient by its element's scalar, then
            // carry/reduce the wide results back to packed form.
            FieldElement2625x4::reduce64([
                _mm256_mul_epu32(b0.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b1.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b2.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b3.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b4.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b5.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b6.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b7.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b8.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b9.into_bits(), consts.into_bits()).into_bits(),
            ])
        }
    }
}
765 
impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
    type Output = FieldElement2625x4;
    /// Multiply `self` by `rhs`.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 2.5 \\).
    ///
    /// The coefficients of `rhs` must be bounded with \\( b < 1.75 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    ///
    fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 {
        // Widening multiply of the even 32-bit lanes (vpmuludq): each
        // product occupies a full 64-bit lane, used for the z_i sums.
        #[inline(always)]
        fn m(x: u32x8, y: u32x8) -> u64x4 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        // Same vpmuludq, reinterpreted as u32 lanes: the low 32 bits of
        // each 64-bit product land in the even lanes, so the result is
        // already in operand format — valid only when each product fits in
        // 32 bits (see the bound comments at the y*19 call sites below).
        #[inline(always)]
        fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        // Unpack each limb vector into even-lane coefficients:
        // x0..x9 are the ten alternating 26/25-bit digits of self,
        // y0..y9 those of rhs.
        let (x0, x1) = unpack_pair(self.0[0]);
        let (x2, x3) = unpack_pair(self.0[1]);
        let (x4, x5) = unpack_pair(self.0[2]);
        let (x6, x7) = unpack_pair(self.0[3]);
        let (x8, x9) = unpack_pair(self.0[4]);

        let (y0, y1) = unpack_pair(rhs.0[0]);
        let (y2, y3) = unpack_pair(rhs.0[1]);
        let (y4, y5) = unpack_pair(rhs.0[2]);
        let (y6, y7) = unpack_pair(rhs.0[3]);
        let (y8, y9) = unpack_pair(rhs.0[4]);

        let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);

        // Precompute 19*y_i for the inline reduction of high-limb products
        // (2^255 = 19 mod p); these must fit in a u32, hence b_y < 1.752.
        let y1_19 = m_lo(v19, y1); // This fits in a u32
        let y2_19 = m_lo(v19, y2); // iff 26 + b + lg(19) < 32
        let y3_19 = m_lo(v19, y3); // if  b < 32 - 26 - 4.248 = 1.752
        let y4_19 = m_lo(v19, y4);
        let y5_19 = m_lo(v19, y5);
        let y6_19 = m_lo(v19, y6);
        let y7_19 = m_lo(v19, y7);
        let y8_19 = m_lo(v19, y8);
        let y9_19 = m_lo(v19, y9);

        // Doubled odd digits of x, used where both indices are odd (the
        // odd/odd cross terms carry an extra factor of 2 in this radix).
        let x1_2 = x1 + x1; // This fits in a u32 iff 25 + b + 1 < 32
        let x3_2 = x3 + x3; //                    iff b < 6
        let x5_2 = x5 + x5;
        let x7_2 = x7 + x7;
        let x9_2 = x9 + x9;

        // Schoolbook multiplication with inline reduction: z_k collects all
        // x_i*y_j with i+j = k (mod 10), the wrapped terms scaled by 19.
        let z0 = m(x0,y0) + m(x1_2,y9_19) + m(x2,y8_19) + m(x3_2,y7_19) + m(x4,y6_19) + m(x5_2,y5_19) + m(x6,y4_19) + m(x7_2,y3_19) + m(x8,y2_19) + m(x9_2,y1_19);
        let z1 = m(x0,y1) +   m(x1,y0)    + m(x2,y9_19) +   m(x3,y8_19) + m(x4,y7_19) +   m(x5,y6_19) + m(x6,y5_19) +   m(x7,y4_19) + m(x8,y3_19) + m(x9,y2_19);
        let z2 = m(x0,y2) + m(x1_2,y1)    + m(x2,y0)    + m(x3_2,y9_19) + m(x4,y8_19) + m(x5_2,y7_19) + m(x6,y6_19) + m(x7_2,y5_19) + m(x8,y4_19) + m(x9_2,y3_19);
        let z3 = m(x0,y3) +   m(x1,y2)    + m(x2,y1)    +   m(x3,y0)    + m(x4,y9_19) +   m(x5,y8_19) + m(x6,y7_19) +   m(x7,y6_19) + m(x8,y5_19) + m(x9,y4_19);
        let z4 = m(x0,y4) + m(x1_2,y3)    + m(x2,y2)    + m(x3_2,y1)    + m(x4,y0)    + m(x5_2,y9_19) + m(x6,y8_19) + m(x7_2,y7_19) + m(x8,y6_19) + m(x9_2,y5_19);
        let z5 = m(x0,y5) +   m(x1,y4)    + m(x2,y3)    +   m(x3,y2)    + m(x4,y1)    +   m(x5,y0)    + m(x6,y9_19) +   m(x7,y8_19) + m(x8,y7_19) + m(x9,y6_19);
        let z6 = m(x0,y6) + m(x1_2,y5)    + m(x2,y4)    + m(x3_2,y3)    + m(x4,y2)    + m(x5_2,y1)    + m(x6,y0)    + m(x7_2,y9_19) + m(x8,y8_19) + m(x9_2,y7_19);
        let z7 = m(x0,y7) +   m(x1,y6)    + m(x2,y5)    +   m(x3,y4)    + m(x4,y3)    +   m(x5,y2)    + m(x6,y1)    +   m(x7,y0)    + m(x8,y9_19) + m(x9,y8_19);
        let z8 = m(x0,y8) + m(x1_2,y7)    + m(x2,y6)    + m(x3_2,y5)    + m(x4,y4)    + m(x5_2,y3)    + m(x6,y2)    + m(x7_2,y1)    + m(x8,y0)    + m(x9_2,y9_19);
        let z9 = m(x0,y9) +   m(x1,y8)    + m(x2,y7)    +   m(x3,y6)    + m(x4,y5)    +   m(x5,y4)    + m(x6,y3)    +   m(x7,y2)    + m(x8,y1)    + m(x9,y0);

        // The bounds on z[i] are the same as in the serial 32-bit code
        // and the comment below is copied from there:

        // How big is the contribution to z[i+j] from x[i], y[j]?
        //
        // Using the bounds above, we get:
        //
        // i even, j even:   x[i]*y[j] <   2^(26+b)*2^(26+b) = 2*2^(51+2*b)
        // i  odd, j even:   x[i]*y[j] <   2^(25+b)*2^(26+b) = 1*2^(51+2*b)
        // i even, j  odd:   x[i]*y[j] <   2^(26+b)*2^(25+b) = 1*2^(51+2*b)
        // i  odd, j  odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
        //
        // We perform inline reduction mod p by replacing 2^255 by 19
        // (since 2^255 - 19 = 0 mod p).  This adds a factor of 19, so
        // we get the bounds (z0 is the biggest one, but calculated for
        // posterity here in case finer estimation is needed later):
        //
        //  z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
        //  z1 < ( 1 +  1   + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
        //  z2 < ( 2 +  1   +  2   + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
        //  z3 < ( 1 +  1   +  1   +  1   + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
        //  z4 < ( 2 +  1   +  2   +  1   +  2   + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
        //  z5 < ( 1 +  1   +  1   +  1   +  1   +  1   + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) =  82*2^(51 + 2*b)
        //  z6 < ( 2 +  1   +  2   +  1   +  2   +  1   +  2   + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) =  87*2^(51 + 2*b)
        //  z7 < ( 1 +  1   +  1   +  1   +  1   +  1   +  1   +  1   + 1*19 + 1*19 )*2^(51 + 2b) =  46*2^(51 + 2*b)
        //  z8 < ( 2 +  1   +  2   +  1   +  2   +  1   +  2   +  1   +  2   + 1*19 )*2^(51 + 2b) =  33*2^(51 + 2*b)
        //  z9 < ( 1 +  1   +  1   +  1   +  1   +  1   +  1   +  1   +  1   +  1   )*2^(51 + 2b) =  10*2^(51 + 2*b)
        //
        // So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64
        //                         if b < 2.5.

        // In fact this bound is slightly sloppy, since it treats both
        // inputs x and y as being bounded by the same parameter b,
        // while they are in fact bounded by b_x and b_y, and we
        // already require that b_y < 1.75 in order to fit the
        // multiplications by 19 into a u32.  The tighter bound on b_y
        // means we could get a tighter bound on the outputs, or a
        // looser bound on b_x.
        FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
    }
}
875 
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn scale_by_curve_constants() {
        // Start from (1, 1, 1, 1) and scale each lane by a different
        // small constant via `Mul<(u32, u32, u32, u32)>`.
        let mut vec = FieldElement2625x4::splat(&FieldElement51::one());

        vec = vec * (121666, 121666, 2 * 121666, 2 * 121665);

        let lanes = vec.split();
        assert_eq!(lanes[0], FieldElement51([121666, 0, 0, 0, 0]));
        assert_eq!(lanes[1], FieldElement51([121666, 0, 0, 0, 0]));
        assert_eq!(lanes[2], FieldElement51([2 * 121666, 0, 0, 0, 0]));
        assert_eq!(lanes[3], FieldElement51([2 * 121665, 0, 0, 0, 0]));
    }

    #[test]
    fn diff_sum_vs_serial() {
        let a = FieldElement51([10000, 10001, 10002, 10003, 10004]);
        let b = FieldElement51([10100, 10101, 10102, 10103, 10104]);
        let c = FieldElement51([10200, 10201, 10202, 10203, 10204]);
        let d = FieldElement51([10300, 10301, 10302, 10303, 10304]);

        // diff_sum maps (A, B, C, D) to (B - A, B + A, D - C, D + C);
        // check each lane against the serial implementation.
        let result = FieldElement2625x4::new(&a, &b, &c, &d).diff_sum().split();

        assert_eq!(result[0], &b - &a);
        assert_eq!(result[1], &b + &a);
        assert_eq!(result[2], &d - &c);
        assert_eq!(result[3], &d + &c);
    }

    #[test]
    fn square_vs_serial() {
        let a = FieldElement51([10000, 10001, 10002, 10003, 10004]);
        let b = FieldElement51([10100, 10101, 10102, 10103, 10104]);
        let c = FieldElement51([10200, 10201, 10202, 10203, 10204]);
        let d = FieldElement51([10300, 10301, 10302, 10303, 10304]);

        // square_and_negate_D squares all four lanes and negates lane D.
        let squared = FieldElement2625x4::new(&a, &b, &c, &d)
            .square_and_negate_D()
            .split();

        assert_eq!(squared[0], &a * &a);
        assert_eq!(squared[1], &b * &b);
        assert_eq!(squared[2], &c * &c);
        assert_eq!(squared[3], -&(&d * &d));
    }

    #[test]
    fn multiply_vs_serial() {
        let a = FieldElement51([10000, 10001, 10002, 10003, 10004]);
        let b = FieldElement51([10100, 10101, 10102, 10103, 10104]);
        let c = FieldElement51([10200, 10201, 10202, 10203, 10204]);
        let d = FieldElement51([10300, 10301, 10302, 10303, 10304]);

        // Multiplying a vector by itself should square each lane.
        let vec = FieldElement2625x4::new(&a, &b, &c, &d);
        let vec_copy = vec.clone();

        let product = (&vec * &vec_copy).split();

        assert_eq!(product[0], &a * &a);
        assert_eq!(product[1], &b * &b);
        assert_eq!(product[2], &c * &c);
        assert_eq!(product[3], &d * &d);
    }

    #[test]
    fn test_unpack_repack_pair() {
        // Two small coefficients packed into the low limb of each lane, so
        // the first limb vector exercises both halves of unpack_pair.
        let a = FieldElement51([10000 + (10001 << 26), 0, 0, 0, 0]);
        let b = FieldElement51([10100 + (10101 << 26), 0, 0, 0, 0]);
        let c = FieldElement51([10200 + (10201 << 26), 0, 0, 0, 0]);
        let d = FieldElement51([10300 + (10301 << 26), 0, 0, 0, 0]);

        let vec = FieldElement2625x4::new(&a, &b, &c, &d);
        let packed = vec.0[0];

        let (lo, hi) = unpack_pair(packed);

        assert_eq!(lo, u32x8::new(10000, 0, 10100, 0, 10200, 0, 10300, 0));
        assert_eq!(hi, u32x8::new(10001, 0, 10101, 0, 10201, 0, 10301, 0));

        // Repacking the two halves must round-trip to the original vector.
        assert_eq!(repack_pair(lo, hi), packed);
    }

    #[test]
    fn new_split_roundtrips() {
        let inputs = [
            FieldElement51::from_bytes(&[0x10; 32]),
            FieldElement51::from_bytes(&[0x11; 32]),
            FieldElement51::from_bytes(&[0x12; 32]),
            FieldElement51::from_bytes(&[0x13; 32]),
        ];

        // Packing four serial elements and splitting them back must be
        // the identity on every lane.
        let vec = FieldElement2625x4::new(&inputs[0], &inputs[1], &inputs[2], &inputs[3]);
        let outputs = vec.split();

        for i in 0..4 {
            assert_eq!(inputs[i], outputs[i]);
        }
    }
}
986