1 // -*- mode: rust; -*-
2 //
3 // This file is part of curve25519-dalek.
4 // Copyright (c) 2016-2021 isis lovecruft
5 // Copyright (c) 2016-2019 Henry de Valence
6 // See LICENSE for licensing information.
7 //
8 // Authors:
9 // - isis agora lovecruft <isis@patternsinthevoid.net>
10 // - Henry de Valence <hdevalence@hdevalence.ca>
11
12 //! An implementation of 4-way vectorized 32bit field arithmetic using
13 //! AVX2.
14 //!
15 //! The `FieldElement2625x4` struct provides a vector of four field
16 //! elements, implemented using AVX2 operations. Its API is designed
17 //! to abstract away the platform-dependent details, so that point
18 //! arithmetic can be implemented only in terms of a vector of field
19 //! elements.
20 //!
21 //! At this level, the API is optimized for speed and not safety. The
22 //! `FieldElement2625x4` does not always perform reductions. The pre-
23 //! and post-conditions on the bounds of the coefficients are
24 //! documented for each method, but it is the caller's responsibility
25 //! to ensure that there are no overflows.
26
27 #![allow(non_snake_case)]
28
// Blend masks for 32-bit lanes.
//
// A `FieldElement2625x4` interleaves its four elements A,B,C,D across
// the eight 32-bit lanes of each `u32x8` word as
// (a, b, a, b, c, d, c, d) (see `FieldElement2625x4::new`), so element
// A lives in lanes 0,2; B in lanes 1,3; C in lanes 4,6; D in lanes 5,7.
const A_LANES: u8 = 0b0000_0101;
const B_LANES: u8 = 0b0000_1010;
const C_LANES: u8 = 0b0101_0000;
const D_LANES: u8 = 0b1010_0000;

// Blend masks for 64-bit lanes, expressed as pairs of 32-bit lane bits.
//
// In the unpacked 64-bit form produced by `unpack_pair`, the elements
// occupy whole 64-bit lanes in order (a, b, c, d), i.e. 32-bit lane
// pairs (0,1), (2,3), (4,5), (6,7).
#[allow(unused)]
const A_LANES64: u8 = 0b00_00_00_11;
#[allow(unused)]
const B_LANES64: u8 = 0b00_00_11_00;
#[allow(unused)]
const C_LANES64: u8 = 0b00_11_00_00;
#[allow(unused)]
const D_LANES64: u8 = 0b11_00_00_00;
42
43 use core::ops::{Add, Mul, Neg};
44 use packed_simd::{i32x8, u32x8, u64x4, IntoBits};
45
46 use backend::vector::avx2::constants::{P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO};
47 use backend::serial::u64::field::FieldElement51;
48
49 /// Unpack 32-bit lanes into 64-bit lanes:
50 /// ```ascii,no_run
51 /// (a0, b0, a1, b1, c0, d0, c1, d1)
52 /// ```
53 /// into
54 /// ```ascii,no_run
55 /// (a0, 0, b0, 0, c0, 0, d0, 0)
56 /// (a1, 0, b1, 0, c1, 0, d1, 0)
57 /// ```
58 #[inline(always)]
unpack_pair(src: u32x8) -> (u32x8, u32x8)59 fn unpack_pair(src: u32x8) -> (u32x8, u32x8) {
60 let a: u32x8;
61 let b: u32x8;
62 let zero = i32x8::new(0, 0, 0, 0, 0, 0, 0, 0);
63 unsafe {
64 use core::arch::x86_64::_mm256_unpackhi_epi32;
65 use core::arch::x86_64::_mm256_unpacklo_epi32;
66 a = _mm256_unpacklo_epi32(src.into_bits(), zero.into_bits()).into_bits();
67 b = _mm256_unpackhi_epi32(src.into_bits(), zero.into_bits()).into_bits();
68 }
69 (a, b)
70 }
71
72 /// Repack 64-bit lanes into 32-bit lanes:
73 /// ```ascii,no_run
74 /// (a0, 0, b0, 0, c0, 0, d0, 0)
75 /// (a1, 0, b1, 0, c1, 0, d1, 0)
76 /// ```
77 /// into
78 /// ```ascii,no_run
79 /// (a0, b0, a1, b1, c0, d0, c1, d1)
80 /// ```
81 #[inline(always)]
repack_pair(x: u32x8, y: u32x8) -> u32x882 fn repack_pair(x: u32x8, y: u32x8) -> u32x8 {
83 unsafe {
84 use core::arch::x86_64::_mm256_blend_epi32;
85 use core::arch::x86_64::_mm256_shuffle_epi32;
86
87 // Input: x = (a0, 0, b0, 0, c0, 0, d0, 0)
88 // Input: y = (a1, 0, b1, 0, c1, 0, d1, 0)
89
90 let x_shuffled = _mm256_shuffle_epi32(x.into_bits(), 0b11_01_10_00);
91 let y_shuffled = _mm256_shuffle_epi32(y.into_bits(), 0b10_00_11_01);
92
93 // x' = (a0, b0, 0, 0, c0, d0, 0, 0)
94 // y' = ( 0, 0, a1, b1, 0, 0, c1, d1)
95
96 return _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into_bits();
97 }
98 }
99
/// The `Lanes` enum represents a subset of the lanes `A,B,C,D` of a
/// `FieldElement2625x4`.
///
/// It's used to specify blend operations without
/// having to know details about the data layout of the
/// `FieldElement2625x4`.
///
/// Only the lane subsets actually needed by the blend callers are
/// represented; other combinations would be easy to add if required.
#[derive(Copy, Clone, Debug)]
pub enum Lanes {
    C,
    D,
    AB,
    AC,
    CD,
    AD,
    BC,
    ABCD,
}
117
/// The `Shuffle` enum represents a shuffle of a `FieldElement2625x4`.
///
/// The enum variants are named by what they do to a vector \\(
/// (A,B,C,D) \\); for instance, `Shuffle::BADC` turns \\( (A, B, C,
/// D) \\) into \\( (B, A, D, C) \\).
///
/// The mapping from each variant to a concrete lane permutation is
/// defined by `shuffle_lanes` inside `FieldElement2625x4::shuffle`.
#[derive(Copy, Clone, Debug)]
pub enum Shuffle {
    AAAA,
    BBBB,
    CACA,
    DBBD,
    ADDA,
    CBCB,
    ABAB,
    BADC,
    BACD,
    ABDC,
}
136
/// A vector of four field elements.
///
/// Each operation on a `FieldElement2625x4` has documented effects on
/// the bounds of the coefficients. This API is designed for speed
/// and not safety; it is the caller's responsibility to ensure that
/// the post-conditions of one operation are compatible with the
/// pre-conditions of the next.
// Layout: five u32x8 words; word i holds the radix-2^25.5 limbs
// (2i, 2i+1) of all four elements, interleaved as
// (a_2i, b_2i, a_{2i+1}, b_{2i+1}, c_2i, d_2i, c_{2i+1}, d_{2i+1})
// (see `FieldElement2625x4::new` and `split`).
#[derive(Clone, Copy, Debug)]
pub struct FieldElement2625x4(pub(crate) [u32x8; 5]);
146
147 use subtle::Choice;
148 use subtle::ConditionallySelectable;
149
impl ConditionallySelectable for FieldElement2625x4 {
    /// Select `a` if `choice == 0`, or `b` if `choice == 1`, without
    /// branching on the (possibly secret) `choice` bit.
    fn conditional_select(
        a: &FieldElement2625x4,
        b: &FieldElement2625x4,
        choice: Choice,
    ) -> FieldElement2625x4 {
        // Expand the 0/1 choice into an all-zeros or all-ones mask;
        // then a ^ (mask & (a ^ b)) is a when mask == 0 and b when
        // mask == !0.  No secret-dependent branches are taken.
        let mask = (-(choice.unwrap_u8() as i32)) as u32;
        let mask_vec = u32x8::splat(mask);
        FieldElement2625x4([
            a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
            a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
            a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
            a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
            a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
        ])
    }

    /// Overwrite `self` with `other` iff `choice == 1`, using the same
    /// branchless masking technique as `conditional_select`.
    fn conditional_assign(
        &mut self,
        other: &FieldElement2625x4,
        choice: Choice,
    ) {
        let mask = (-(choice.unwrap_u8() as i32)) as u32;
        let mask_vec = u32x8::splat(mask);
        self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
        self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
        self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
        self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
        self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
    }
}
181
impl FieldElement2625x4 {
    /// Split this vector into an array of four (serial) field
    /// elements.
    pub fn split(&self) -> [FieldElement51; 4] {
        let mut out = [FieldElement51::zero(); 4];
        for i in 0..5 {
            let a_2i = self.0[i].extract(0) as u64; //
            let b_2i = self.0[i].extract(1) as u64; //
            let a_2i_1 = self.0[i].extract(2) as u64; // `.
            let b_2i_1 = self.0[i].extract(3) as u64; // | pre-swapped to avoid
            let c_2i = self.0[i].extract(4) as u64; // | a cross lane shuffle
            let d_2i = self.0[i].extract(5) as u64; // .'
            let c_2i_1 = self.0[i].extract(6) as u64; //
            let d_2i_1 = self.0[i].extract(7) as u64; //

            // Recombine the 26-bit even limb and the limb above it
            // into one 51-bit limb of the serial representation.
            out[0].0[i] = a_2i + (a_2i_1 << 26);
            out[1].0[i] = b_2i + (b_2i_1 << 26);
            out[2].0[i] = c_2i + (c_2i_1 << 26);
            out[3].0[i] = d_2i + (d_2i_1 << 26);
        }

        out
    }

    /// Rearrange the elements of this vector according to `control`.
    ///
    /// The `control` parameter should be a compile-time constant, so
    /// that when this function is inlined, LLVM is able to lower the
    /// shuffle using an immediate.
    #[inline]
    pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 {
        #[inline(always)]
        fn shuffle_lanes(x: u32x8, control: Shuffle) -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_permutevar8x32_epi32;

                // Each index vector selects, for every output lane, the
                // source lane to copy, taking the (a, b, a, b, c, d, c, d)
                // interleaving into account.
                let c: u32x8 = match control {
                    Shuffle::AAAA => u32x8::new(0, 0, 2, 2, 0, 0, 2, 2),
                    Shuffle::BBBB => u32x8::new(1, 1, 3, 3, 1, 1, 3, 3),
                    Shuffle::CACA => u32x8::new(4, 0, 6, 2, 4, 0, 6, 2),
                    Shuffle::DBBD => u32x8::new(5, 1, 7, 3, 1, 5, 3, 7),
                    Shuffle::ADDA => u32x8::new(0, 5, 2, 7, 5, 0, 7, 2),
                    Shuffle::CBCB => u32x8::new(4, 1, 6, 3, 4, 1, 6, 3),
                    Shuffle::ABAB => u32x8::new(0, 1, 2, 3, 0, 1, 2, 3),
                    Shuffle::BADC => u32x8::new(1, 0, 3, 2, 5, 4, 7, 6),
                    Shuffle::BACD => u32x8::new(1, 0, 3, 2, 4, 5, 6, 7),
                    Shuffle::ABDC => u32x8::new(0, 1, 2, 3, 5, 4, 7, 6),
                };
                // Note that this gets turned into a generic LLVM
                // shuffle-by-constants, which can be lowered to a simpler
                // instruction than a generic permute.
                _mm256_permutevar8x32_epi32(x.into_bits(), c.into_bits()).into_bits()
            }
        }

        FieldElement2625x4([
            shuffle_lanes(self.0[0], control),
            shuffle_lanes(self.0[1], control),
            shuffle_lanes(self.0[2], control),
            shuffle_lanes(self.0[3], control),
            shuffle_lanes(self.0[4], control),
        ])
    }

    /// Blend `self` with `other`, taking lanes specified in `control` from `other`.
    ///
    /// The `control` parameter should be a compile-time constant, so
    /// that this function can be inlined and LLVM can lower it to a
    /// blend instruction using an immediate.
    #[inline]
    pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 {
        #[inline(always)]
        fn blend_lanes(x: u32x8, y: u32x8, control: Lanes) -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;

                // This would be much cleaner if we could factor out the match
                // statement on the control. Unfortunately, rustc forgets
                // constant-info very quickly, so we can't even write
                // ```
                // match control {
                //     Lanes::C => {
                //         let imm = C_LANES as i32;
                //         _mm256_blend_epi32(..., imm)
                // ```
                // let alone
                // ```
                // let imm = match control {
                //     Lanes::C => C_LANES as i32,
                // }
                // _mm256_blend_epi32(..., imm)
                // ```
                // even though both of these would be constant-folded by LLVM
                // at a lower level (as happens in the shuffle implementation,
                // which does not require a shuffle immediate but *is* lowered
                // to immediate shuffles anyways).
                match control {
                    Lanes::C => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), C_LANES as i32).into_bits()
                    }
                    Lanes::D => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), D_LANES as i32).into_bits()
                    }
                    Lanes::AD => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | D_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::AB => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | B_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::AC => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | C_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::CD => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (C_LANES | D_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::BC => {
                        _mm256_blend_epi32(x.into_bits(), y.into_bits(), (B_LANES | C_LANES) as i32)
                            .into_bits()
                    }
                    Lanes::ABCD => _mm256_blend_epi32(
                        x.into_bits(),
                        y.into_bits(),
                        (A_LANES | B_LANES | C_LANES | D_LANES) as i32,
                    ).into_bits(),
                }
            }
        }

        FieldElement2625x4([
            blend_lanes(self.0[0], other.0[0], control),
            blend_lanes(self.0[1], other.0[1], control),
            blend_lanes(self.0[2], other.0[2], control),
            blend_lanes(self.0[3], other.0[3], control),
            blend_lanes(self.0[4], other.0[4], control),
        ])
    }

    /// Construct a vector of zeros.
    pub fn zero() -> FieldElement2625x4 {
        FieldElement2625x4([u32x8::splat(0); 5])
    }

    /// Convenience wrapper around `new(x,x,x,x)`.
    pub fn splat(x: &FieldElement51) -> FieldElement2625x4 {
        FieldElement2625x4::new(x, x, x, x)
    }

    /// Create a `FieldElement2625x4` from four `FieldElement51`s.
    ///
    /// # Postconditions
    ///
    /// The resulting `FieldElement2625x4` is bounded with \\( b < 0.0002 \\).
    pub fn new(
        x0: &FieldElement51,
        x1: &FieldElement51,
        x2: &FieldElement51,
        x3: &FieldElement51,
    ) -> FieldElement2625x4 {
        let mut buf = [u32x8::splat(0); 5];
        let low_26_bits = (1 << 26) - 1;
        for i in 0..5 {
            // Split each 51-bit limb into a low 26-bit limb and the
            // remaining high bits.
            let a_2i = (x0.0[i] & low_26_bits) as u32;
            let a_2i_1 = (x0.0[i] >> 26) as u32;
            let b_2i = (x1.0[i] & low_26_bits) as u32;
            let b_2i_1 = (x1.0[i] >> 26) as u32;
            let c_2i = (x2.0[i] & low_26_bits) as u32;
            let c_2i_1 = (x2.0[i] >> 26) as u32;
            let d_2i = (x3.0[i] & low_26_bits) as u32;
            let d_2i_1 = (x3.0[i] >> 26) as u32;

            buf[i] = u32x8::new(a_2i, b_2i, a_2i_1, b_2i_1, c_2i, d_2i, c_2i_1, d_2i_1);
        }

        // We don't know that the original `FieldElement51`s were
        // fully reduced, so the odd limbs may exceed 2^25.
        // Reduce them to be sure.
        FieldElement2625x4(buf).reduce()
    }

    /// Given \\((A,B,C,D)\\), compute \\((-A,-B,-C,-D)\\), without
    /// performing a reduction.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 0.999 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 1 \\).
    #[inline]
    pub fn negate_lazy(&self) -> FieldElement2625x4 {
        // The limbs of self are bounded with b < 0.999, while the
        // smallest limb of 2*p is 67108845 > 2^{26+0.9999}, so
        // underflows are not possible.
        FieldElement2625x4([
            P_TIMES_2_LO - self.0[0],
            P_TIMES_2_HI - self.0[1],
            P_TIMES_2_HI - self.0[2],
            P_TIMES_2_HI - self.0[3],
            P_TIMES_2_HI - self.0[4],
        ])
    }

    /// Given `self = (A,B,C,D)`, compute `(B - A, B + A, D - C, D + C)`.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 0.01 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 1.6 \\).
    #[inline]
    pub fn diff_sum(&self) -> FieldElement2625x4 {
        // tmp1 = (B, A, D, C)
        let tmp1 = self.shuffle(Shuffle::BADC);
        // tmp2 = (-A, B, -C, D)
        let tmp2 = self.blend(self.negate_lazy(), Lanes::AC);
        // (B - A, B + A, D - C, D + C) bounded with b < 1.6
        tmp1 + tmp2
    }

    /// Reduce this vector of field elements \\(\mathrm{mod} p\\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.0002 \\).
    #[inline]
    pub fn reduce(&self) -> FieldElement2625x4 {
        // Even limbs carry 26 bits, odd limbs carry 25 bits; the shift
        // and mask vectors below follow the per-lane limb sizes.
        let shifts = i32x8::new(26, 26, 25, 25, 26, 26, 25, 25);
        let masks = u32x8::new(
            (1 << 26) - 1,
            (1 << 26) - 1,
            (1 << 25) - 1,
            (1 << 25) - 1,
            (1 << 26) - 1,
            (1 << 26) - 1,
            (1 << 25) - 1,
            (1 << 25) - 1,
        );

        // Let c(x) denote the carryout of the coefficient x.
        //
        // Given (   x0,    y0,    x1,    y1,    z0,    w0,    z1,    w1),
        // compute (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0)).
        //
        // The carryouts are bounded by 2^(32 - 25) = 2^7.
        let rotated_carryout = |v: u32x8| -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_srlv_epi32;
                use core::arch::x86_64::_mm256_shuffle_epi32;

                let c = _mm256_srlv_epi32(v.into_bits(), shifts.into_bits());
                _mm256_shuffle_epi32(c, 0b01_00_11_10).into_bits()
            }
        };

        // Combine (lo, lo, lo, lo, lo, lo, lo, lo)
        // with    (hi, hi, hi, hi, hi, hi, hi, hi)
        // to      (lo, lo, hi, hi, lo, lo, hi, hi)
        //
        // This allows combining carryouts, e.g.,
        //
        // lo  (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0))
        // hi  (c(x3), c(y3), c(x2), c(y2), c(z3), c(w3), c(z2), c(w2))
        // ->  (c(x1), c(y1), c(x2), c(y2), c(z1), c(w1), c(z2), c(w2))
        //
        // which is exactly the vector of carryins for
        //
        //     (   x2,    y2,    x3,    y3,    z2,    w2,    z3,    w3).
        //
        let combine = |v_lo: u32x8, v_hi: u32x8| -> u32x8 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;
                _mm256_blend_epi32(v_lo.into_bits(), v_hi.into_bits(), 0b11_00_11_00).into_bits()
            }
        };

        let mut v = self.0;

        // Carry chain: mask each word down to its limb size and add
        // the carries produced by the previous word.
        let c10 = rotated_carryout(v[0]);
        v[0] = (v[0] & masks) + combine(u32x8::splat(0), c10);

        let c32 = rotated_carryout(v[1]);
        v[1] = (v[1] & masks) + combine(c10, c32);

        let c54 = rotated_carryout(v[2]);
        v[2] = (v[2] & masks) + combine(c32, c54);

        let c76 = rotated_carryout(v[3]);
        v[3] = (v[3] & masks) + combine(c54, c76);

        let c98 = rotated_carryout(v[4]);
        v[4] = (v[4] & masks) + combine(c76, c98);

        let c9_19: u32x8 = unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;
            use core::arch::x86_64::_mm256_shuffle_epi32;

            // Need to rearrange c98, since vpmuludq uses the low
            // 32-bits of each 64-bit lane to compute the product:
            //
            // c98       = (c(x9), c(y9), c(x8), c(y8), c(z9), c(w9), c(z8), c(w8));
            // c9_spread = (c(x9), c(x8), c(y9), c(y8), c(z9), c(z8), c(w9), c(w8)).
            let c9_spread = _mm256_shuffle_epi32(c98.into_bits(), 0b11_01_10_00);

            // Since the carryouts are bounded by 2^7, their products with 19
            // are bounded by 2^11.25. This means that
            //
            // c9_19_spread = (19*c(x9), 0, 19*c(y9), 0, 19*c(z9), 0, 19*c(w9), 0).
            let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into_bits());

            // Unshuffle:
            // c9_19 = (19*c(x9), 19*c(y9), 0, 0, 19*c(z9), 19*c(w9), 0, 0).
            _mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into_bits()
        };

        // Add the final carryin.
        v[0] = v[0] + c9_19;

        // Each output coefficient has exactly one carryin, which is
        // bounded by 2^11.25, so they are bounded as
        //
        // c_even < 2^26 + 2^11.25 < 2^26.00006 < 2^{26+b}
        // c_odd  < 2^25 + 2^11.25 < 2^25.0001  < 2^{25+b}
        //
        // where b = 0.0002.
        FieldElement2625x4(v)
    }

    /// Given an array of wide coefficients, reduce them to a `FieldElement2625x4`.
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    #[inline]
    fn reduce64(mut z: [u64x4; 10]) -> FieldElement2625x4 {
        // These aren't const because splat isn't a const fn
        let LOW_25_BITS: u64x4 = u64x4::splat((1 << 25) - 1);
        let LOW_26_BITS: u64x4 = u64x4::splat((1 << 26) - 1);

        // Carry the value from limb i = 0..8 to limb i+1
        let carry = |z: &mut [u64x4; 10], i: usize| {
            debug_assert!(i < 9);
            if i % 2 == 0 {
                // Even limbs have 26 bits
                z[i + 1] = z[i + 1] + (z[i] >> 26);
                z[i] = z[i] & LOW_26_BITS;
            } else {
                // Odd limbs have 25 bits
                z[i + 1] = z[i + 1] + (z[i] >> 25);
                z[i] = z[i] & LOW_25_BITS;
            }
        };

        // Perform two halves of the carry chain in parallel.
        carry(&mut z, 0); carry(&mut z, 4);
        carry(&mut z, 1); carry(&mut z, 5);
        carry(&mut z, 2); carry(&mut z, 6);
        carry(&mut z, 3); carry(&mut z, 7);
        // Since z[3] < 2^64, c < 2^(64-25) = 2^39,
        // so    z[4] < 2^26 + 2^39 < 2^39.0002
        carry(&mut z, 4); carry(&mut z, 8);
        // Now z[4] < 2^26
        // and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)

        // Last carry has a multiplication by 19.  In the serial case we
        // do a 64-bit multiplication by 19, but here we want to do a
        // 32-bit multiplication.  However, if we only know z[9] < 2^64,
        // the carry is bounded as c < 2^(64-25) = 2^39, which is too
        // big.  To ensure c < 2^32, we would need z[9] < 2^57.
        // Instead, we split the carry in two, with c = c_0 + c_1*2^26.

        let c = z[9] >> 25;
        z[9] = z[9] & LOW_25_BITS;
        let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26;
        let mut c1: u64x4 = c >> 26;         // c1 < 2^(39-26) = 2^13;

        unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;
            let x19 = u64x4::splat(19);
            c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25
            c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25
        }

        z[0] = z[0] + c0; // z0 < 2^26 + 2^30.25 < 2^30.33
        z[1] = z[1] + c1; // z1 < 2^25 + 2^17.25 < 2^25.0067
        carry(&mut z, 0); // z0 < 2^26, z1 < 2^25.0067 + 2^4.33 = 2^25.007

        // The output coefficients are bounded with
        //
        // b = 0.007  for z[1]
        // b = 0.0004 for z[5]
        // b = 0      for other z[i].
        //
        // So the packed result is bounded with b = 0.007.
        FieldElement2625x4([
            repack_pair(z[0].into_bits(), z[1].into_bits()),
            repack_pair(z[2].into_bits(), z[3].into_bits()),
            repack_pair(z[4].into_bits(), z[5].into_bits()),
            repack_pair(z[6].into_bits(), z[7].into_bits()),
            repack_pair(z[8].into_bits(), z[9].into_bits()),
        ])
    }

    /// Square this field element, and negate the result's \\(D\\) value.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 1.5 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    pub fn square_and_negate_D(&self) -> FieldElement2625x4 {
        // Full 32x32 -> 64-bit products of the even 32-bit lanes (vpmuludq).
        #[inline(always)]
        fn m(x: u32x8, y: u32x8) -> u64x4 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        // Same products, reinterpreted as u32x8; useful when the products
        // are small enough that the odd (high) lanes are zero.
        #[inline(always)]
        fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);

        let (x0, x1) = unpack_pair(self.0[0]);
        let (x2, x3) = unpack_pair(self.0[1]);
        let (x4, x5) = unpack_pair(self.0[2]);
        let (x6, x7) = unpack_pair(self.0[3]);
        let (x8, x9) = unpack_pair(self.0[4]);

        let x0_2 = x0 << 1;
        let x1_2 = x1 << 1;
        let x2_2 = x2 << 1;
        let x3_2 = x3 << 1;
        let x4_2 = x4 << 1;
        let x5_2 = x5 << 1;
        let x6_2 = x6 << 1;
        let x7_2 = x7 << 1;

        let x5_19 = m_lo(v19, x5);
        let x6_19 = m_lo(v19, x6);
        let x7_19 = m_lo(v19, x7);
        let x8_19 = m_lo(v19, x8);
        let x9_19 = m_lo(v19, x9);

        let mut z0 = m(x0, x0) + m(x2_2,x8_19) + m(x4_2,x6_19) + ((m(x1_2,x9_19) + m(x3_2,x7_19) + m(x5,x5_19)) << 1);
        let mut z1 = m(x0_2,x1) + m(x3_2,x8_19) + m(x5_2,x6_19) + ((m(x2,x9_19) + m(x4,x7_19)) << 1);
        let mut z2 = m(x0_2,x2) + m(x1_2,x1) + m(x4_2,x8_19) + m(x6,x6_19) + ((m(x3_2,x9_19) + m(x5_2,x7_19)) << 1);
        let mut z3 = m(x0_2,x3) + m(x1_2,x2) + m(x5_2,x8_19) + ((m(x4,x9_19) + m(x6,x7_19)) << 1);
        let mut z4 = m(x0_2,x4) + m(x1_2,x3_2) + m(x2, x2) + m(x6_2,x8_19) + ((m(x5_2,x9_19) + m(x7,x7_19)) << 1);
        let mut z5 = m(x0_2,x5) + m(x1_2,x4) + m(x2_2,x3) + m(x7_2,x8_19) + ((m(x6,x9_19)) << 1);
        let mut z6 = m(x0_2,x6) + m(x1_2,x5_2) + m(x2_2,x4) + m(x3_2,x3) + m(x8,x8_19) + ((m(x7_2,x9_19)) << 1);
        let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1);
        let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1);
        let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5);

        // The biggest z_i is bounded as z_i < 249*2^(51 + 2*b);
        // if b < 1.5 we get z_i < 4485585228861014016.
        //
        // The limbs of the multiples of p are bounded above by
        //
        // 0x3fffffff << 37 = 9223371899415822336 < 2^63
        //
        // and below by
        //
        // 0x1fffffff << 37 = 4611685880988434432
        //                  > 4485585228861014016
        //
        // So these multiples of p are big enough to avoid underflow
        // in subtraction, and small enough to fit within u64
        // with room for a carry.

        let low__p37 = u64x4::splat(0x3ffffed << 37);
        let even_p37 = u64x4::splat(0x3ffffff << 37);
        let odd__p37 = u64x4::splat(0x1ffffff << 37);

        // Replace the D lane (64-bit lane 3) of x by p - x, leaving the
        // other lanes untouched; this negates D mod p.
        let negate_D = |x: u64x4, p: u64x4| -> u64x4 {
            unsafe {
                use core::arch::x86_64::_mm256_blend_epi32;
                _mm256_blend_epi32(x.into_bits(), (p - x).into_bits(), D_LANES64 as i32).into_bits()
            }
        };

        z0 = negate_D(z0, low__p37);
        z1 = negate_D(z1, odd__p37);
        z2 = negate_D(z2, even_p37);
        z3 = negate_D(z3, odd__p37);
        z4 = negate_D(z4, even_p37);
        z5 = negate_D(z5, odd__p37);
        z6 = negate_D(z6, even_p37);
        z7 = negate_D(z7, odd__p37);
        z8 = negate_D(z8, even_p37);
        z9 = negate_D(z9, odd__p37);

        FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
    }
}
688
impl Neg for FieldElement2625x4 {
    type Output = FieldElement2625x4;

    /// Negate this field element, performing a reduction.
    ///
    /// If the coefficients are known to be small, use `negate_lazy`
    /// to avoid performing a reduction.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 4.0 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.0002 \\).
    #[inline]
    fn neg(self) -> FieldElement2625x4 {
        // Subtract from 16*p (elementwise), which is congruent to 0 mod p,
        // so the result is congruent to -self.  The b < 4 precondition
        // keeps each limb below the matching limb of 16*p, so no
        // underflow occurs; the final reduce() restores tight bounds.
        FieldElement2625x4([
            P_TIMES_16_LO - self.0[0],
            P_TIMES_16_HI - self.0[1],
            P_TIMES_16_HI - self.0[2],
            P_TIMES_16_HI - self.0[3],
            P_TIMES_16_HI - self.0[4],
        ]).reduce()
    }
}
715
716 impl Add<FieldElement2625x4> for FieldElement2625x4 {
717 type Output = FieldElement2625x4;
718 /// Add two `FieldElement2625x4`s, without performing a reduction.
719 #[inline]
add(self, rhs: FieldElement2625x4) -> FieldElement2625x4720 fn add(self, rhs: FieldElement2625x4) -> FieldElement2625x4 {
721 FieldElement2625x4([
722 self.0[0] + rhs.0[0],
723 self.0[1] + rhs.0[1],
724 self.0[2] + rhs.0[2],
725 self.0[3] + rhs.0[3],
726 self.0[4] + rhs.0[4],
727 ])
728 }
729 }
730
impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 {
    type Output = FieldElement2625x4;
    /// Perform a multiplication by a vector of small constants.
    ///
    /// Element A is scaled by `scalars.0`, B by `scalars.1`, C by
    /// `scalars.2`, and D by `scalars.3`.
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    #[inline]
    fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 {
        unsafe {
            use core::arch::x86_64::_mm256_mul_epu32;

            // Place each scalar in the even 32-bit lane of the 64-bit
            // lane holding the corresponding element, matching the
            // (a, _, b, _, c, _, d, _) layout produced by unpack_pair.
            let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0);

            let (b0, b1) = unpack_pair(self.0[0]);
            let (b2, b3) = unpack_pair(self.0[1]);
            let (b4, b5) = unpack_pair(self.0[2]);
            let (b6, b7) = unpack_pair(self.0[3]);
            let (b8, b9) = unpack_pair(self.0[4]);

            // vpmuludq gives the full 64-bit product of each unpacked
            // limb with its scalar; reduce64 then carries the wide
            // coefficients back into radix 2^25.5.
            FieldElement2625x4::reduce64([
                _mm256_mul_epu32(b0.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b1.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b2.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b3.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b4.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b5.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b6.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b7.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b8.into_bits(), consts.into_bits()).into_bits(),
                _mm256_mul_epu32(b9.into_bits(), consts.into_bits()).into_bits(),
            ])
        }
    }
}
766
impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
    type Output = FieldElement2625x4;
    /// Multiply `self` by `rhs`.
    ///
    /// # Preconditions
    ///
    /// The coefficients of `self` must be bounded with \\( b < 2.5 \\).
    ///
    /// The coefficients of `rhs` must be bounded with \\( b < 1.75 \\).
    ///
    /// # Postconditions
    ///
    /// The coefficients of the result are bounded with \\( b < 0.007 \\).
    ///
    fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 {
        // Full 32x32 -> 64-bit products of the even 32-bit lanes (vpmuludq).
        #[inline(always)]
        fn m(x: u32x8, y: u32x8) -> u64x4 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        // Same products, reinterpreted as u32x8; only valid when each
        // product fits in 32 bits (see the bound comments below), so
        // that the odd (high) lanes are zero.
        #[inline(always)]
        fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
            use core::arch::x86_64::_mm256_mul_epu32;
            unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
        }

        let (x0, x1) = unpack_pair(self.0[0]);
        let (x2, x3) = unpack_pair(self.0[1]);
        let (x4, x5) = unpack_pair(self.0[2]);
        let (x6, x7) = unpack_pair(self.0[3]);
        let (x8, x9) = unpack_pair(self.0[4]);

        let (y0, y1) = unpack_pair(rhs.0[0]);
        let (y2, y3) = unpack_pair(rhs.0[1]);
        let (y4, y5) = unpack_pair(rhs.0[2]);
        let (y6, y7) = unpack_pair(rhs.0[3]);
        let (y8, y9) = unpack_pair(rhs.0[4]);

        let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);

        let y1_19 = m_lo(v19, y1); // This fits in a u32
        let y2_19 = m_lo(v19, y2); // iff 26 + b + lg(19) < 32
        let y3_19 = m_lo(v19, y3); // if b < 32 - 26 - 4.248 = 1.752
        let y4_19 = m_lo(v19, y4);
        let y5_19 = m_lo(v19, y5);
        let y6_19 = m_lo(v19, y6);
        let y7_19 = m_lo(v19, y7);
        let y8_19 = m_lo(v19, y8);
        let y9_19 = m_lo(v19, y9);

        let x1_2 = x1 + x1; // This fits in a u32 iff 25 + b + 1 < 32
        let x3_2 = x3 + x3; //                    iff b < 6
        let x5_2 = x5 + x5;
        let x7_2 = x7 + x7;
        let x9_2 = x9 + x9;

        let z0 = m(x0,y0) + m(x1_2,y9_19) + m(x2,y8_19) + m(x3_2,y7_19) + m(x4,y6_19) + m(x5_2,y5_19) + m(x6,y4_19) + m(x7_2,y3_19) + m(x8,y2_19) + m(x9_2,y1_19);
        let z1 = m(x0,y1) + m(x1,y0) + m(x2,y9_19) + m(x3,y8_19) + m(x4,y7_19) + m(x5,y6_19) + m(x6,y5_19) + m(x7,y4_19) + m(x8,y3_19) + m(x9,y2_19);
        let z2 = m(x0,y2) + m(x1_2,y1) + m(x2,y0) + m(x3_2,y9_19) + m(x4,y8_19) + m(x5_2,y7_19) + m(x6,y6_19) + m(x7_2,y5_19) + m(x8,y4_19) + m(x9_2,y3_19);
        let z3 = m(x0,y3) + m(x1,y2) + m(x2,y1) + m(x3,y0) + m(x4,y9_19) + m(x5,y8_19) + m(x6,y7_19) + m(x7,y6_19) + m(x8,y5_19) + m(x9,y4_19);
        let z4 = m(x0,y4) + m(x1_2,y3) + m(x2,y2) + m(x3_2,y1) + m(x4,y0) + m(x5_2,y9_19) + m(x6,y8_19) + m(x7_2,y7_19) + m(x8,y6_19) + m(x9_2,y5_19);
        let z5 = m(x0,y5) + m(x1,y4) + m(x2,y3) + m(x3,y2) + m(x4,y1) + m(x5,y0) + m(x6,y9_19) + m(x7,y8_19) + m(x8,y7_19) + m(x9,y6_19);
        let z6 = m(x0,y6) + m(x1_2,y5) + m(x2,y4) + m(x3_2,y3) + m(x4,y2) + m(x5_2,y1) + m(x6,y0) + m(x7_2,y9_19) + m(x8,y8_19) + m(x9_2,y7_19);
        let z7 = m(x0,y7) + m(x1,y6) + m(x2,y5) + m(x3,y4) + m(x4,y3) + m(x5,y2) + m(x6,y1) + m(x7,y0) + m(x8,y9_19) + m(x9,y8_19);
        let z8 = m(x0,y8) + m(x1_2,y7) + m(x2,y6) + m(x3_2,y5) + m(x4,y4) + m(x5_2,y3) + m(x6,y2) + m(x7_2,y1) + m(x8,y0) + m(x9_2,y9_19);
        let z9 = m(x0,y9) + m(x1,y8) + m(x2,y7) + m(x3,y6) + m(x4,y5) + m(x5,y4) + m(x6,y3) + m(x7,y2) + m(x8,y1) + m(x9,y0);

        // The bounds on z[i] are the same as in the serial 32-bit code
        // and the comment below is copied from there:

        // How big is the contribution to z[i+j] from x[i], y[j]?
        //
        // Using the bounds above, we get:
        //
        // i even, j even:   x[i]*y[j] <   2^(26+b)*2^(26+b) = 2*2^(51+2*b)
        // i  odd, j even:   x[i]*y[j] <   2^(25+b)*2^(26+b) = 1*2^(51+2*b)
        // i even, j  odd:   x[i]*y[j] <   2^(26+b)*2^(25+b) = 1*2^(51+2*b)
        // i  odd, j  odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
        //
        // We perform inline reduction mod p by replacing 2^255 by 19
        // (since 2^255 - 19 = 0 mod p).  This adds a factor of 19, so
        // we get the bounds (z0 is the biggest one, but calculated for
        // posterity here in case finer estimation is needed later):
        //
        // z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
        // z1 < ( 1 +    1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
        // z2 < ( 2 +    1 +    2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
        // z3 < ( 1 +    1 +    1 +    1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
        // z4 < ( 2 +    1 +    2 +    1 +    2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
        // z5 < ( 1 +    1 +    1 +    1 +    1 +    1 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) =  82*2^(51 + 2*b)
        // z6 < ( 2 +    1 +    2 +    1 +    2 +    1 +    2 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) =  87*2^(51 + 2*b)
        // z7 < ( 1 +    1 +    1 +    1 +    1 +    1 +    1 +    1 + 1*19 + 1*19 )*2^(51 + 2b) =  46*2^(51 + 2*b)
        // z8 < ( 2 +    1 +    2 +    1 +    2 +    1 +    2 +    1 +    2 + 1*19 )*2^(51 + 2b) =  33*2^(51 + 2*b)
        // z9 < ( 1 +    1 +    1 +    1 +    1 +    1 +    1 +    1 +    1 +    1 )*2^(51 + 2b) =  10*2^(51 + 2*b)
        //
        // So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64
        //                         if b < 2.5.

        // In fact this bound is slightly sloppy, since it treats both
        // inputs x and y as being bounded by the same parameter b,
        // while they are in fact bounded by b_x and b_y, and we
        // already require that b_y < 1.75 in order to fit the
        // multiplications by 19 into a u32.  The tighter bound on b_y
        // means we could get a tighter bound on the outputs, or a
        // looser bound on b_x.
        FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
    }
}
876
#[cfg(test)]
mod test {
    use super::*;

    /// Scaling the all-ones vector by four small constants should put
    /// each constant in the low limb of the corresponding lane.
    #[test]
    fn scale_by_curve_constants() {
        let ones = FieldElement2625x4::splat(&FieldElement51::one());

        let scaled = ones * (121666, 121666, 2 * 121666, 2 * 121665);

        let lanes = scaled.split();
        let expected_low_limbs = [121666, 121666, 2 * 121666, 2 * 121665];
        for (lane, &low) in lanes.iter().zip(expected_low_limbs.iter()) {
            assert_eq!(*lane, FieldElement51([low, 0, 0, 0, 0]));
        }
    }

    /// `diff_sum` computes (B - A, B + A, D - C, D + C); check each lane
    /// against the serial field implementation.
    #[test]
    fn diff_sum_vs_serial() {
        let inputs = [
            FieldElement51([10000, 10001, 10002, 10003, 10004]),
            FieldElement51([10100, 10101, 10102, 10103, 10104]),
            FieldElement51([10200, 10201, 10202, 10203, 10204]),
            FieldElement51([10300, 10301, 10302, 10303, 10304]),
        ];

        let lanes = FieldElement2625x4::new(&inputs[0], &inputs[1], &inputs[2], &inputs[3])
            .diff_sum()
            .split();

        assert_eq!(lanes[0], &inputs[1] - &inputs[0]);
        assert_eq!(lanes[1], &inputs[1] + &inputs[0]);
        assert_eq!(lanes[2], &inputs[3] - &inputs[2]);
        assert_eq!(lanes[3], &inputs[3] + &inputs[2]);
    }

    /// `square_and_negate_D` squares all four lanes and negates the D lane;
    /// compare against serial squarings.
    #[test]
    fn square_vs_serial() {
        let inputs = [
            FieldElement51([10000, 10001, 10002, 10003, 10004]),
            FieldElement51([10100, 10101, 10102, 10103, 10104]),
            FieldElement51([10200, 10201, 10202, 10203, 10204]),
            FieldElement51([10300, 10301, 10302, 10303, 10304]),
        ];

        let lanes = FieldElement2625x4::new(&inputs[0], &inputs[1], &inputs[2], &inputs[3])
            .square_and_negate_D()
            .split();

        assert_eq!(lanes[0], &inputs[0] * &inputs[0]);
        assert_eq!(lanes[1], &inputs[1] * &inputs[1]);
        assert_eq!(lanes[2], &inputs[2] * &inputs[2]);
        assert_eq!(lanes[3], -&(&inputs[3] * &inputs[3]));
    }

    /// Lane-wise vector multiplication should agree with serial
    /// multiplication in every lane.
    #[test]
    fn multiply_vs_serial() {
        let inputs = [
            FieldElement51([10000, 10001, 10002, 10003, 10004]),
            FieldElement51([10100, 10101, 10102, 10103, 10104]),
            FieldElement51([10200, 10201, 10202, 10203, 10204]),
            FieldElement51([10300, 10301, 10302, 10303, 10304]),
        ];

        let vec = FieldElement2625x4::new(&inputs[0], &inputs[1], &inputs[2], &inputs[3]);
        let vec_copy = vec.clone();

        let lanes = (&vec * &vec_copy).split();

        for i in 0..4 {
            assert_eq!(lanes[i], &inputs[i] * &inputs[i]);
        }
    }

    /// Unpacking a register of packed 32-bit coefficients into even/odd
    /// 64-bit lanes and repacking must round-trip exactly.
    #[test]
    fn test_unpack_repack_pair() {
        // Each element packs two small coefficients into its low limb:
        // the even coefficient in bits 0..26, the odd one starting at bit 26.
        let elems = [
            FieldElement51([10000 + (10001 << 26), 0, 0, 0, 0]),
            FieldElement51([10100 + (10101 << 26), 0, 0, 0, 0]),
            FieldElement51([10200 + (10201 << 26), 0, 0, 0, 0]),
            FieldElement51([10300 + (10301 << 26), 0, 0, 0, 0]),
        ];

        let packed = FieldElement2625x4::new(&elems[0], &elems[1], &elems[2], &elems[3]).0[0];

        let (evens, odds) = unpack_pair(packed);

        assert_eq!(evens, u32x8::new(10000, 0, 10100, 0, 10200, 0, 10300, 0));
        assert_eq!(odds, u32x8::new(10001, 0, 10101, 0, 10201, 0, 10301, 0));

        // Repacking the unpacked halves must reproduce the source register.
        assert_eq!(repack_pair(evens, odds), packed);
    }

    /// Packing four serial field elements into a vector and splitting
    /// again must be the identity.
    #[test]
    fn new_split_roundtrips() {
        let elems = [
            FieldElement51::from_bytes(&[0x10; 32]),
            FieldElement51::from_bytes(&[0x11; 32]),
            FieldElement51::from_bytes(&[0x12; 32]),
            FieldElement51::from_bytes(&[0x13; 32]),
        ];

        let lanes = FieldElement2625x4::new(&elems[0], &elems[1], &elems[2], &elems[3]).split();

        for i in 0..4 {
            assert_eq!(elems[i], lanes[i]);
        }
    }
}
987