// donna_sse.cpp - written and placed in public domain by Jeffrey Walton
//                 This is an integration of Andrew Moon's public domain code.
//                 Also see https://github.com/floodyberry/curve25519-donna.

// This is an integration of Andrew Moon's public domain code. The port was
// clean, but it has one potential problem. The original code is C and relies
// upon unions. Accessing the inactive union member is undefined behavior in
// C++. That means copying the array into packedelem8.u is OK; but then using
// packedelem8.v in a calculation is UB. Fortunately most (all?) compilers
// take pity on C++ developers and compile the code. We will have to keep an
// eye on things or rewrite significant portions of this code.
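//
// To make the concern concrete, a minimal sketch of the pattern in question
// (simplified from the real types; illustrative only):
//
//   packedelem8 x;
//   std::memcpy(x.u, bytes, 16);   // the writes make x.u the active member
//   xmmi y = x.v;                  // reading the inactive member is UB in C++
//
// C (since C99) explicitly permits this kind of type punning through a
// union; C++ formally does not, even though compilers generally accept it.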

// If needed, see Moon's commit "Go back to ignoring 256th bit [sic]",
// https://github.com/floodyberry/curve25519-donna/commit/57a683d18721a658

#include "pch.h"

#include "config.h"
#include "donna.h"
#include "secblock.h"
#include "misc.h"

// The data is aligned, but Clang issues a warning based on the type
// and not the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
# pragma GCC diagnostic ignored "-Wunused-function"
#endif

#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4244)
#endif

// Squash MS LNK4221 and libtool warnings
extern const char DONNA_SSE_FNAME[] = __FILE__;

#if (CRYPTOPP_CURVE25519_SSE2)

#include "donna_sse.h"

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::sword32;
using CryptoPP::word64;
using CryptoPP::sword64;
using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;

// Bring in all the symbols from the SSE header
using namespace CryptoPP::Donna::ArchSSE;

/* Copy a bignum to another: out = in */
inline void
curve25519_copy(bignum25519 out, const bignum25519 in) {
    xmmi x0,x1,x2;
    x0 = _mm_load_si128((xmmi*)in + 0);
    x1 = _mm_load_si128((xmmi*)in + 1);
    x2 = _mm_load_si128((xmmi*)in + 2);
    _mm_store_si128((xmmi*)out + 0, x0);
    _mm_store_si128((xmmi*)out + 1, x1);
    _mm_store_si128((xmmi*)out + 2, x2);
}

/* Take a little-endian, 32-byte number and expand it into polynomial form */
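/* The ten limbs alternate 26- and 25-bit widths (a 2^25.5 radix): limb i
 * holds the bits of the value starting at bit ceil(25.5*i), giving offsets
 * 0, 26, 51, 77, 102, 128, 153, 179, 204 and 230. Limbs 10 and 11 are zero
 * padding so the array fills exactly three 128-bit registers. */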
inline void
curve25519_expand(bignum25519 out, const byte in[32]) {
    word32 x0,x1,x2,x3,x4,x5,x6,x7;

    x0 = *(word32 *)(in + 0);
    x1 = *(word32 *)(in + 4);
    x2 = *(word32 *)(in + 8);
    x3 = *(word32 *)(in + 12);
    x4 = *(word32 *)(in + 16);
    x5 = *(word32 *)(in + 20);
    x6 = *(word32 *)(in + 24);
    x7 = *(word32 *)(in + 28);

    out[0] = (                      x0       ) & reduce_mask_26;
    out[1] = ((((word64)x1 << 32) | x0) >> 26) & reduce_mask_25;
    out[2] = ((((word64)x2 << 32) | x1) >> 19) & reduce_mask_26;
    out[3] = ((((word64)x3 << 32) | x2) >> 13) & reduce_mask_25;
    out[4] = ((                     x3) >>  6) & reduce_mask_26;
    out[5] = (                      x4       ) & reduce_mask_25;
    out[6] = ((((word64)x5 << 32) | x4) >> 25) & reduce_mask_26;
    out[7] = ((((word64)x6 << 32) | x5) >> 19) & reduce_mask_25;
    out[8] = ((((word64)x7 << 32) | x6) >> 12) & reduce_mask_26;
    out[9] = ((                     x7) >>  6) & reduce_mask_25; /* ignore the top bit */

    out[10] = 0;
    out[11] = 0;
}

/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
inline void
curve25519_contract(byte out[32], const bignum25519 in) {
    ALIGN(16) bignum25519 f;

    curve25519_copy(f, in);

    #define carry_pass() \
        f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
        f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
        f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
        f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
        f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
        f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
        f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
        f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
        f[9] += f[8] >> 26; f[8] &= reduce_mask_26;

    #define carry_pass_full() \
        carry_pass() \
        f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;

    #define carry_pass_final() \
        carry_pass() \
        f[9] &= reduce_mask_25;

    carry_pass_full()
    carry_pass_full()

    /* now f is between 0 and 2^255-1, properly carried. */
    /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
    f[0] += 19;
    carry_pass_full()

    /* now between 19 and 2^255-1 in both cases, and offset by 19. */
    f[0] += (1 << 26) - 19;
    f[1] += (1 << 25) - 1;
    f[2] += (1 << 26) - 1;
    f[3] += (1 << 25) - 1;
    f[4] += (1 << 26) - 1;
    f[5] += (1 << 25) - 1;
    f[6] += (1 << 26) - 1;
    f[7] += (1 << 25) - 1;
    f[8] += (1 << 26) - 1;
    f[9] += (1 << 25) - 1;

    /* now between 2^255 and 2^256-20, and offset by 2^255. */
    carry_pass_final()
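
    /* The 25-bit mask in carry_pass_final drops bit 255, subtracting the
     * 2^255 offset introduced above. Net effect: case 1 ends at the input
     * value, case 2 at input - p, i.e. the canonical representative mod p
     * in both cases. */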

    #undef carry_pass
    #undef carry_pass_full
    #undef carry_pass_final

    *(word32 *)(out +  0) = ((f[0]      ) | (f[1] << 26));
    *(word32 *)(out +  4) = ((f[1] >>  6) | (f[2] << 19));
    *(word32 *)(out +  8) = ((f[2] >> 13) | (f[3] << 13));
    *(word32 *)(out + 12) = ((f[3] >> 19) | (f[4] <<  6));
    *(word32 *)(out + 16) = ((f[5]      ) | (f[6] << 25));
    *(word32 *)(out + 20) = ((f[6] >>  7) | (f[7] << 19));
    *(word32 *)(out + 24) = ((f[7] >> 13) | (f[8] << 12));
    *(word32 *)(out + 28) = ((f[8] >> 20) | (f[9] <<  6));
}

/*
 * Maybe swap the contents of two bignum25519 arrays (@a and @b), each 10
 * limbs (12 words) long. Perform the swap iff @iswap is 1; @iswap must be
 * 0 or 1 so the mask below is all-zeros or all-ones.
 */
inline void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
    const word32 swap = (word32)(-(sword32)iswap);
    xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
    xmmi mask = _mm_cvtsi32_si128(swap);
    mask = _mm_shuffle_epi32(mask, 0);
    a0 = _mm_load_si128((xmmi *)a + 0);
    a1 = _mm_load_si128((xmmi *)a + 1);
    a2 = _mm_load_si128((xmmi *)a + 2);
    b0 = _mm_load_si128((xmmi *)b + 0);
    b1 = _mm_load_si128((xmmi *)b + 1);
    b2 = _mm_load_si128((xmmi *)b + 2);
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    b2 = _mm_xor_si128(a2, b2);
    x0 = _mm_and_si128(b0, mask);
    x1 = _mm_and_si128(b1, mask);
    x2 = _mm_and_si128(b2, mask);
    x0 = _mm_xor_si128(x0, a0);
    x1 = _mm_xor_si128(x1, a1);
    x2 = _mm_xor_si128(x2, a2);
    a0 = _mm_xor_si128(x0, b0);
    a1 = _mm_xor_si128(x1, b1);
    a2 = _mm_xor_si128(x2, b2);
    _mm_store_si128((xmmi *)a + 0, x0);
    _mm_store_si128((xmmi *)a + 1, x1);
    _mm_store_si128((xmmi *)a + 2, x2);
    _mm_store_si128((xmmi *)b + 0, a0);
    _mm_store_si128((xmmi *)b + 1, a1);
    _mm_store_si128((xmmi *)b + 2, a2);
}
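
/* The same constant-time idea in scalar form (an illustrative sketch, not
 * code used here):
 *
 *   word32 mask = (word32)(-(sword32)bit); // bit in {0,1} -> 0 or all-ones
 *   word32 t = mask & (a ^ b);
 *   a ^= t; b ^= t;                        // swapped iff bit == 1
 *
 * The SSE version above performs the identical mask-and-xor dance 128 bits
 * at a time, with no branch on secret data.
 */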

/* interleave two bignums */
inline void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
    xmmi x0,x1,x2,z0,z1,z2;

    x0 = _mm_load_si128((xmmi *)(x + 0));
    x1 = _mm_load_si128((xmmi *)(x + 4));
    x2 = _mm_load_si128((xmmi *)(x + 8));
    z0 = _mm_load_si128((xmmi *)(z + 0));
    z1 = _mm_load_si128((xmmi *)(z + 4));
    z2 = _mm_load_si128((xmmi *)(z + 8));

    out[0].v = _mm_unpacklo_epi32(x0, z0);
    out[1].v = _mm_unpackhi_epi32(x0, z0);
    out[2].v = _mm_unpacklo_epi32(x1, z1);
    out[3].v = _mm_unpackhi_epi32(x1, z1);
    out[4].v = _mm_unpacklo_epi32(x2, z2);
}

/* split a packed bignum into its two parts */
inline void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
    _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v)                                                          );
    _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
    _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
    _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v)                                                          );
}

/* add two packed bignums */
inline void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    out[0].v = _mm_add_epi32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi32(r[1].v, s[1].v);
    out[2].v = _mm_add_epi32(r[2].v, s[2].v);
    out[3].v = _mm_add_epi32(r[3].v, s[3].v);
    out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

/* subtract two packed bignums */
inline void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    xmmi r0,r1,r2,r3,r4;
    xmmi s0,s1,s2,s3;
    xmmi c1,c2;

    r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v);
    r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v);
    r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v);
    r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v);
    r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v);
    r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

    s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */

    c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0,  _mm_slli_si128(c2, 8));

    out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    out[4].v = r4;                         /* 88 99 */
}

/* multiply two packed bignums */
inline void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
    xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
    xmmi c1,c2;

    out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
    out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
    r1_2 = _mm_slli_epi32(r[1].v, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
    r3_2 = _mm_slli_epi32(r[3].v, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v  , s[0].v))))))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2  , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));

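    /* Fold the wrapped products back: limb products whose indices sum past 9
       overflow 2^255, and 2^255 == 19 (mod p), so scale by 19. Odd-indexed
       limbs are 25 bits wide, and odd*odd products need an extra factor of 2
       in this mixed 26/25-bit radix, hence the doubled _2 variants. */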
    r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
    r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
    r1_2 = _mm_slli_epi32(r1, 1);
    r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
    r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
    r3_2 = _mm_slli_epi32(r3, 1);
    r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
    r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    r5_2 = _mm_slli_epi32(r5, 1);
    r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
    r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    r7_2 = _mm_slli_epi32(r7, 1);
    r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
    r9_2 = _mm_slli_epi32(r9, 1);

    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3  , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[8].v), _mm_mul_epu32(r8, s[9].v)));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

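    /* Carry two interleaved chains at once: c1 walks limbs 0->1->2->3->4 and
       c2 walks 4->5->6->7->8; the limb 9 overflow re-enters limb 0 scaled by
       19 (2^255 == 19 mod p), followed by one more settling pass. */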
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
                                       c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
                                       c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}

/* multiply two bignums */
void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r2,r4,r6,r8;
    xmmi r1,r3,r5,r7,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
    xmmi c1,c2,c3;

    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

    r0 = _mm_load_si128((xmmi*)r + 0);
    r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
    r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v));
    r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
    r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
    r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v));
    r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
    r4 = _mm_load_si128((xmmi*)r + 1);
    r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
    r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v));
    r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
    r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
    r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v));
    r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
    r8 = _mm_load_si128((xmmi*)r + 2);
    r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
    r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v));
    r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

    m01 = _mm_mul_epu32(r1,s01);
    m23 = _mm_mul_epu32(r1,s23);
    m45 = _mm_mul_epu32(r1,s45);
    m67 = _mm_mul_epu32(r1,s67);
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
    m89 = _mm_mul_epu32(r1,s89);
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

    /* shift up */
    m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

    r219 = _mm_mul_epu32(r2, packednineteen.v);
    r419 = _mm_mul_epu32(r4, packednineteen.v);
    r619 = _mm_mul_epu32(r6, packednineteen.v);
    r819 = _mm_mul_epu32(r8, packednineteen.v);
    r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
    r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
    r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
    r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
    r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);

    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);

    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}

typedef struct bignum25519mulprecomp_t {
    xmmi r0,r2,r4,r6,r8;
    xmmi r1,r3,r5,r7,r9;
    xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
} bignum25519mulprecomp;

/* precompute a constant to multiply by */
inline void
curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) {
    pre->r0 = _mm_load_si128((xmmi*)r + 0);
    pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1));
    pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v));
    pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2));
    pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3));
    pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v));
    pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0));
    pre->r4 = _mm_load_si128((xmmi*)r + 1);
    pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1));
    pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v));
    pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2));
    pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3));
    pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v));
    pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0));
    pre->r8 = _mm_load_si128((xmmi*)r + 2);
    pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1));
    pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v));
    pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0));

    pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v);
    pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v);
    pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v);
    pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v);
    pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v);
    pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v);
    pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v);
    pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v);
    pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v);
}


/* multiply a bignum by a pre-computed constant */
inline void
curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) {
    xmmi m01,m23,m45,m67,m89;
    xmmi m0123,m4567;
    xmmi s0123,s4567;
    xmmi s01,s23,s45,s67,s89;
    xmmi s12,s34,s56,s78,s9;
    xmmi r0,r1,r2,r3,r4,r5;
    xmmi c1,c2,c3;

    s0123 = _mm_load_si128((xmmi*)s + 0);
    s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    s4567 = _mm_load_si128((xmmi*)s + 1);
    s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    s89 = _mm_load_si128((xmmi*)s + 2);
    s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

    m01 = _mm_mul_epu32(r->r1,s01);
    m23 = _mm_mul_epu32(r->r1,s23);
    m45 = _mm_mul_epu32(r->r1,s45);
    m67 = _mm_mul_epu32(r->r1,s67);
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45));
    m89 = _mm_mul_epu32(r->r1,s89);
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01));

    /* shift up */
    m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9));
    m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89));
    m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45));
    m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67));
    m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89));
    m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9));

    r0 = _mm_unpacklo_epi64(m01, m45);
    r1 = _mm_unpackhi_epi64(m01, m45);
    r2 = _mm_unpacklo_epi64(m23, m67);
    r3 = _mm_unpackhi_epi64(m23, m67);
    r4 = _mm_unpacklo_epi64(m89, m89);
    r5 = _mm_unpackhi_epi64(m89, m89);

    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
    c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

    m0123 = _mm_unpacklo_epi32(r0, r1);
    m4567 = _mm_unpackhi_epi32(r0, r1);
    m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    m89 = _mm_unpackhi_epi32(r4, r5);

    _mm_store_si128((xmmi*)out + 0, m0123);
    _mm_store_si128((xmmi*)out + 1, m4567);
    _mm_store_si128((xmmi*)out + 2, m89);
}

/* square a bignum 'count' times */
#define curve25519_square(r,x) curve25519_square_times(r,x,1)

void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
    xmmi m01,m23,m45,m67,m89;
    xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    xmmi r0a,r1a,r2a,r3a,r7a,r9a;
    xmmi r0123,r4567;
    xmmi r01,r23,r45,r67,r6x,r89,r8x;
    xmmi r12,r34,r56,r78,r9x;
    xmmi r5619;
    xmmi c1,c2,c3;

    r0123 = _mm_load_si128((xmmi*)in + 0);
    r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
    r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
    r4567 = _mm_load_si128((xmmi*)in + 1);
    r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
    r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
    r89 = _mm_load_si128((xmmi*)in + 2);
    r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

    do {
        r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
        r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
        r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v));
        r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
        r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
        r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
        r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v));
        r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
        r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
        r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
        r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
        r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v));
        r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
        r5619 = _mm_mul_epu32(r56, packednineteen.v);
        r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
        r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
        r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
        r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
        r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
        r7 = _mm_mul_epu32(r7, packed3819.v);
        r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
        r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
        r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
        r8 = _mm_mul_epu32(r8, packednineteen.v);
        r9  = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
        r9x  = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
        r9 = _mm_mul_epu32(r9, packed3819.v);
        r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

        m01 = _mm_mul_epu32(r01, r0);
        m23 = _mm_mul_epu32(r23, r0a);
        m45 = _mm_mul_epu32(r45, r0a);
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
        r23 = _mm_slli_epi32(r23, 1);
        m67 = _mm_mul_epu32(r67, r0a);
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
        m89 = _mm_mul_epu32(r89, r0a);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
        r67 = _mm_slli_epi32(r67, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
        r45 = _mm_slli_epi32(r45, 1);

        r1 = _mm_slli_epi32(r1, 1);
        r3 = _mm_slli_epi32(r3, 1);
        r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v));
        r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v));

        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
        r34 = _mm_slli_epi32(r34, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
        r78 = _mm_slli_epi32(r78, 1);
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
        r56 = _mm_slli_epi32(r56, 1);

        m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
        m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
        m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
        m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
        m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
        m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

        r0 = _mm_unpacklo_epi64(m01, m45);
        r1 = _mm_unpackhi_epi64(m01, m45);
        r2 = _mm_unpacklo_epi64(m23, m67);
        r3 = _mm_unpackhi_epi64(m23, m67);
        r4 = _mm_unpacklo_epi64(m89, m89);
        r5 = _mm_unpackhi_epi64(m89, m89);

        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
        c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
        c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
        c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
        c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

        r01 = _mm_unpacklo_epi64(r0, r1);
        r45 = _mm_unpackhi_epi64(r0, r1);
        r23 = _mm_unpacklo_epi64(r2, r3);
        r67 = _mm_unpackhi_epi64(r2, r3);
        r89 = _mm_unpackhi_epi64(r4, r5);
    } while (--count);

    r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
    r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
    r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
    r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
    r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

    _mm_store_si128((xmmi*)r + 0, r0123);
    _mm_store_si128((xmmi*)r + 1, r4567);
    _mm_store_si128((xmmi*)r + 2, r89);
}

/* square two packed bignums */
inline void
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
    xmmi r0,r1,r2,r3;
    xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
    xmmi d5,d6,d7,d8,d9;
    xmmi c1,c2;

    r0 = r[0].v;
    r1 = r[1].v;
    r2 = r[2].v;
    r3 = r[3].v;

    out[0].v = _mm_mul_epu32(r0, r0);
    r0 = _mm_slli_epi32(r0, 1);
    out[1].v = _mm_mul_epu32(r0, r1);
    r1_2 = _mm_slli_epi32(r1, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2    ), _mm_mul_epu32(r1, r1_2));
    r1 = r1_2;
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3    ), _mm_mul_epu32(r1, r2  ));
    r3_2 = _mm_slli_epi32(r3, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2  ), _mm_mul_epu32(r2, r2)));
    r2 = _mm_slli_epi32(r2, 1);
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2  ))));
    r3 = r3_2;
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2  ), _mm_mul_epu32(r[4].v, r[4].v)))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2  )))));

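    /* 19 folds the high limbs back (2^255 == 19 mod p); 38 = 2*19 also
       carries the extra doubling that odd (25-bit) limb products pick up in
       the mixed radix. The cross-term doubling of squaring itself was
       already applied to r0..r3 above. */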
    d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
    d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
    d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

    r4_2 = _mm_slli_epi32(r[4].v, 1);
    r6_2 = _mm_slli_epi32(r[6].v, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1                   ), _mm_add_epi64(_mm_mul_epu32(d8, r2  ), _mm_add_epi64(_mm_mul_epu32(d7, r3    ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3  ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2  )))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3                   ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2  ), _mm_mul_epu32(d6, r[6].v)))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v               ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2                 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v               ), _mm_mul_epu32(d8, r7_2  )));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2                 ), _mm_mul_epu32(d8, r[8].v)));
    out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));

    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
                                       c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
                                       c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}

/* make [nqx+nqz,nqpqx+nqpqz], [nqpqx-nqpqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
inline void
curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) {
    primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0));
    primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2));
    primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0));
    primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2));
    primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0));
    primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2));
    primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0));
    primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2));
    primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0));
    primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2));
    primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1));
    primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3));
    primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1));
    primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3));
    primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1));
    primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3));
    primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1));
    primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3));
    primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1));
    primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3));
}

/* make [nqx+nqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
inline void
curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) {
    nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v);
    nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v);
    nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v);
    nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v);
    nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v);
    nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v);
    nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v);
    nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v);
    nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v);
    nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v);
}

/* compute [nqx+nqz,nqx-nqz] from nqx, nqz */
inline void
curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) {
    xmmi x0,x1,x2;
    xmmi z0,z1,z2;
    xmmi a0,a1,a2;
    xmmi s0,s1,s2;
    xmmi r0,r1;
    xmmi c1,c2;
    x0 = _mm_load_si128((xmmi*)nqx + 0);
    x1 = _mm_load_si128((xmmi*)nqx + 1);
    x2 = _mm_load_si128((xmmi*)nqx + 2);
    z0 = _mm_load_si128((xmmi*)nqz + 0);
    z1 = _mm_load_si128((xmmi*)nqz + 1);
    z2 = _mm_load_si128((xmmi*)nqz + 2);
    a0 = _mm_add_epi32(x0, z0);
    a1 = _mm_add_epi32(x1, z1);
    a2 = _mm_add_epi32(x2, z2);
    s0 = _mm_add_epi32(x0, packed2p0.v);
    s1 = _mm_add_epi32(x1, packed2p1.v);
    s2 = _mm_add_epi32(x2, packed2p2.v);
    s0 = _mm_sub_epi32(s0, z0);
    s1 = _mm_sub_epi32(s1, z1);
    s2 = _mm_sub_epi32(s2, z2);
    r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v);
    r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v);
    c1 = _mm_srli_epi32(r0, 26);
    c2 = _mm_srli_epi32(r1, 25);
    r0 = _mm_and_si128(r0, packedmask26.v);
    r1 = _mm_and_si128(r1, packedmask25.v);
    r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
    r1 = _mm_add_epi32(r1, c1);
    s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
    s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8));
    nq[0].v = _mm_unpacklo_epi64(a0, s0);
    nq[2].v = _mm_unpackhi_epi64(a0, s0);
    nq[4].v = _mm_unpacklo_epi64(a1, s1);
    nq[6].v = _mm_unpackhi_epi64(a1, s1);
    nq[8].v = _mm_unpacklo_epi64(a2, s2);
    nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1));
    nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1));
    nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1));
    nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1));
    nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1));
}


/* compute [x+z,x-z] from [x,z] */
inline void
curve25519_addsub_packed64(packedelem64 *r) {
    packed32bignum25519 x,z,add,sub;

    x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v);
    z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v);
    x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v);
    z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v);
    x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v);
    z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v);
    x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v);
    z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v);
    x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v);
    z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v);

    curve25519_add_packed32(add, x, z);
    curve25519_sub_packed32(sub, x, z);

    r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v);
    r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v);
    r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v);
    r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v);
    r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v);
    r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v);
    r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v);
    r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v);
    r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v);
    r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v);
}

/* compute [x,z] * [121666,121665] */
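/* 121666 = (A+2)/4 and 121665 = (A-2)/4, where A = 486662 is the Montgomery
 * coefficient of Curve25519 (y^2 = x^3 + A*x^2 + x); these are the constants
 * the ladder's doubling step requires. */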
inline void
curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) {
    xmmi c1,c2;

    out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v);
    out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v);
    out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v);
    out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v);
    out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v);
    out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v);
    out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v);
    out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v);
    out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v);
    out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v);

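    /* Carry chain: limbs alternate 26/25 bits, and a carry out of the top limb
       wraps around to limb 0 multiplied by 19, since 2^255 = 19 (mod 2^255 - 19).
       Limbs 0..3 and 4..7 are carried in parallel via c1 and c2. */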
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
                                       c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
                                       c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}

/* compute [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
inline void
curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) {
    packed32bignum25519 x, z, sub;
    packed64bignum25519 t, nqa, nqb;

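    /* Interleave the 32-bit limbs of sq (even lanes) and sq121665 (odd lanes)
       so the single packed 32-bit subtraction below computes sq.x-sq.z and
       sqscalar.x-sqscalar.z at the same time. */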
    x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4));
    z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4));
    x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4));
    z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4));
    x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4));
    z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4));
    x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4));
    z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4));
    x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4));
    z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4));

    curve25519_sub_packed32(sub, x, z);

    t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0));
    t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2));
    t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0));
    t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2));
    t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0));
    t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2));
    t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0));
    t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2));
    t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0));
    t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2));

    nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v);
    nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v);
    nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v);
    nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v);
    nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v);
    nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v);
    nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v);
    nqb[3].v = _mm_unpackhi_epi64(sq[3].v, t[3].v);
    nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v);
    nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v);
    nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v);
    nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v);
    nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v);
    nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v);
    nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v);
    nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v);
    nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v);
    nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v);
    nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v);
    nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v);

    curve25519_mul_packed64(nq, nqa, nqb);
}

/*
 * In:  b = z^(2^5   - 2^0)
 * Out: b = z^(2^250 - 2^0)
 * (the inline comments below track the exponent of z)
 */
void
curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
    ALIGN(16) bignum25519 t0,c;

    /* 2^5  - 2^0 */ /* b */
    /* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);
    /* 2^10 - 2^0 */ curve25519_mul(b, t0, b);
    /* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
    /* 2^20 - 2^0 */ curve25519_mul(c, t0, b);
    /* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
    /* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);
    /* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
    /* 2^50 - 2^0 */ curve25519_mul(b, t0, b);
    /* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);
    /* 2^100 - 2^0 */ curve25519_mul(c, t0, b);
    /* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
    /* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);
    /* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);
    /* 2^250 - 2^0 */ curve25519_mul(b, t0, b);
}

/*
 * Field inversion via Fermat's little theorem:
 * z^(p - 2) = z^(2^255 - 21) = 1/z (mod p), where p = 2^255 - 19
 */
void
curve25519_recip(bignum25519 out, const bignum25519 z) {
    ALIGN(16) bignum25519 a, t0, b;

    /* 2 */ curve25519_square(a, z); /* a = 2 */
    /* 8 */ curve25519_square_times(t0, a, 2);
    /* 9 */ curve25519_mul(b, t0, z); /* b = 9 */
    /* 11 */ curve25519_mul(a, b, a); /* a = 11 */
    /* 22 */ curve25519_square(t0, a);
    /* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);
    /* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);
    /* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);
    /* 2^255 - 21 */  curve25519_mul(out, b, a);
}

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(Donna)

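/* Scalar multiplication on Curve25519: computes sharedKey = secretKey * othersKey
   using an SSE2 Montgomery ladder, where othersKey is the peer's 32-byte
   little-endian public value. Always returns 0. A minimal usage sketch
   (buffer names are illustrative):

     byte mySecret[32], theirPublic[32], shared[32];
     // ... fill mySecret with 32 random bytes, theirPublic from the peer ...
     CryptoPP::Donna::curve25519_mult_SSE2(shared, mySecret, theirPublic);
*/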
int curve25519_mult_SSE2(byte sharedKey[32], const byte secretKey[32], const byte othersKey[32])
{
    FixedSizeSecBlock<byte, 32> e;
    for (size_t i = 0; i < 32; ++i)
        e[i] = secretKey[i];
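    /* Clamp the scalar per RFC 7748: clear the low 3 bits, clear bit 255, set bit 254 */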
    e[0] &= 0xf8; e[31] &= 0x7f; e[31] |= 0x40;

    ALIGN(16) bignum25519 nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;
    packed32bignum25519 qx, qz, pqz, pqx;
    packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
    bignum25519mulprecomp preq;
    size_t bit = 0;

    curve25519_expand(nqpqx, othersKey);
    curve25519_mul_precompute(&preq, nqpqx);

    /* Montgomery ladder: do bits 254..3 */
    for (size_t i = 254, lastbit = 0; i >= 3; i--) {
        bit = (e[i/8] >> (i & 7)) & 1;
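        /* Constant-time conditional swap. XOR-ing with lastbit defers the
           swap-back, so each ladder step performs at most one swap. */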
        curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
        curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
        lastbit = bit;

        curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
        curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */

        curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */
        curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */

        curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */
        curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */
        curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */
        curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */
        curve25519_untangle64(nqpqx, nqpqz, nqpq);
        curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */

        /* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */
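        /* because (x-z)*121665 + x = x*(121665+1) - z*121665 = x*121666 - z*121665 */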
        curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */
        curve25519_square_packed64(sq, nq); /* sq = nq^2 */
        curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
        curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
        curve25519_untangle64(nqx, nqz, nq);
    }

    /* It's possible to get rid of this swap by performing the swap at the bottom of
       the above loop instead of the top, but compilers seem to optimize better this way. */
    curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
    curve25519_swap_conditional(nqz, nqpqz, (word32)bit);

    /* do bits 2..0; clamping cleared these bits, so each step is a plain doubling */
    for (size_t i = 0; i < 3; i++) {
        curve25519_compute_nq(nq, nqx, nqz);
        curve25519_square_packed64(sq, nq); /* sq = nq^2 */
        curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
        curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
        curve25519_untangle64(nqx, nqz, nq);
    }

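    /* convert from projective (X:Z) to affine x = X/Z and serialize little-endian */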
    curve25519_recip(zmone, nqz);
    curve25519_mul(nqz, nqx, zmone);
    curve25519_contract(sharedKey, nqz);

    return 0;
}

NAMESPACE_END  // Donna
NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_CURVE25519_SSE2