/*
	Public domain by Andrew M. <liquidsun@gmail.com>
	See: https://github.com/floodyberry/curve25519-donna

	SSE2 curve25519 implementation
*/

#if defined(ED25519_SSE2)

#include <emmintrin.h>
typedef __m128i xmmi;

typedef union packedelem8_t {
	unsigned char u[16];
	xmmi v;
} packedelem8;

typedef union packedelem32_t {
	uint32_t u[4];
	xmmi v;
} packedelem32;

typedef union packedelem64_t {
	uint64_t u[2];
	xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];
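
/*
	Elements are stored in a mixed 26/25-bit radix: even-indexed limbs hold
	26 bits, odd-indexed limbs hold 25 bits, so a field element f represents
	f[0] + f[1]*2^26 + f[2]*2^51 + ... + f[9]*2^230 mod 2^255 - 19.
*/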

static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packednineteenone = {{19, 1}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2,19}};
static const packedelem64 packed9638 = {{19*4,19*2}};
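
/*
	2^255 = 19 mod p, so anything that wraps past limb 9 is folded back in
	multiplied by 19. 38 = 2*19 covers wrapped odd*odd limb products, which
	also pick up the radix doubling; packed3819/packed9638 carry a different
	factor in each lane for the squaring code below.
*/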

/* 121666,121665 */
static const packedelem64 packed121666121665 = {{121666, 121665}};

/* 2*(2^255 - 19) = 0 mod p */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};
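
/*
	Adding a multiple of p to the minuend before subtracting keeps every limb
	non-negative: 2*p is enough when the subtrahend is reduced, and 4*p
	(below) when it may be the unreduced result of a single addition.
*/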

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};

/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
	xmmi x0,x1,x2;
	x0 = _mm_load_si128((xmmi*)in + 0);
	x1 = _mm_load_si128((xmmi*)in + 1);
	x2 = _mm_load_si128((xmmi*)in + 2);
	_mm_store_si128((xmmi*)out + 0, x0);
	_mm_store_si128((xmmi*)out + 1, x1);
	_mm_store_si128((xmmi*)out + 2, x2);
}

/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}

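/* out = a + b, with the result carried back into 26/25-bit limbs
   (curve25519_add above leaves its sums uncarried) */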
#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

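	/*
		Carry chain over the interleaved limb pairs (0,4),(1,5),(2,6),(3,7)
		plus (-,8),(-,9): each limb's overflow moves into the next limb, the
		carry out of limb 3 is staged through the spare low lane into limb 4,
		and the carry out of limb 9 wraps back into limb 0 multiplied by 19.
		The same pattern reappears in the other reducing functions below.
	*/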
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2;
	xmmi r0,r1;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

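	/* only a partial carry (limbs 0..3, spilling into limb 4): enough to
	   keep the limbs bounded for subsequent operations, with the full
	   reduction deferred */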
	r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
	r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);

	c1 = _mm_srli_epi32(r0, 26);
	c2 = _mm_srli_epi32(r1, 25);
	r0 = _mm_and_si128(r0, packedmask26.v);
	r1 = _mm_and_si128(r1, packedmask25.v);
	r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
	r1 = _mm_add_epi32(r1, c1);

	a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
	a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));

	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}

DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed4p0.v);
	a1 = _mm_add_epi32(a1, packed4p1.v);
	a2 = _mm_add_epi32(a2, packed4p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

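/* out = -b, computed as (2*p - b) and then carried/reduced */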
DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = packed2p0.v;
	a1 = packed2p1.v;
	a2 = packed2p2.v;
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

/* Multiply two numbers: out = r * s */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	xmmi m01,m23,m45,m67,m89;
	xmmi m0123,m4567;
	xmmi s0123,s4567;
	xmmi s01,s23,s45,s67,s89;
	xmmi s12,s34,s56,s78,s9;
	xmmi r0,r2,r4,r6,r8;
	xmmi r1,r3,r5,r7,r9;
	xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
	xmmi c1,c2,c3;

	s0123 = _mm_load_si128((xmmi*)s + 0);
	s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
	s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
	s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
	s4567 = _mm_load_si128((xmmi*)s + 1);
	s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
	s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
	s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
	s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
	s89 = _mm_load_si128((xmmi*)s + 2);
	s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
	s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
	s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

	r0 = _mm_load_si128((xmmi*)r + 0);
	r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
	r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
	r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
	r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
	r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
	r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
	r4 = _mm_load_si128((xmmi*)r + 1);
	r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
	r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
	r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
	r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
	r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
	r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
	r8 = _mm_load_si128((xmmi*)r + 2);
	r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
	r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
	r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

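	/*
		Schoolbook multiply in two passes: first the products involving the
		odd r limbs (r1,r3,r5,r7,r9), whose broadcasts had their upper lane
		doubled above because odd*odd limb products need an extra factor of 2
		in the 26/25-bit radix; the accumulators are then shifted up one
		coefficient and the even r limb products are added in.
	*/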
	m01 = _mm_mul_epu32(r1,s01);
	m23 = _mm_mul_epu32(r1,s23);
	m45 = _mm_mul_epu32(r1,s45);
	m67 = _mm_mul_epu32(r1,s67);
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
	m89 = _mm_mul_epu32(r1,s89);
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

	/* shift the odd-r partial products up one coefficient so m01..m89 line up with output limbs 0..9 */
	m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
	m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
	m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
	m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
	m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

	r219 = _mm_mul_epu32(r2, packednineteen.v);
	r419 = _mm_mul_epu32(r4, packednineteen.v);
	r619 = _mm_mul_epu32(r6, packednineteen.v);
	r819 = _mm_mul_epu32(r8, packednineteen.v);
	r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
	r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
	r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
	r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
	r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

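	/* fold back the products that wrap past limb 9: rX19 holds 19*rX, since
	   2^255 = 19 mod p */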
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

	r0 = _mm_unpacklo_epi64(m01, m45);
	r1 = _mm_unpackhi_epi64(m01, m45);
	r2 = _mm_unpacklo_epi64(m23, m67);
	r3 = _mm_unpackhi_epi64(m23, m67);
	r4 = _mm_unpacklo_epi64(m89, m89);
	r5 = _mm_unpackhi_epi64(m89, m89);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	m0123 = _mm_unpacklo_epi32(r0, r1);
	m4567 = _mm_unpackhi_epi32(r0, r1);
	m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
	m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
	m89 = _mm_unpackhi_epi32(r4, r5);

	_mm_store_si128((xmmi*)out + 0, m0123);
	_mm_store_si128((xmmi*)out + 1, m4567);
	_mm_store_si128((xmmi*)out + 2, m89);
}

DONNA_NOINLINE static void
curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	curve25519_mul(out, r, s);
}

#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
	xmmi m01,m23,m45,m67,m89;
	xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r0a,r1a,r2a,r3a,r7a,r9a;
	xmmi r0123,r4567;
	xmmi r01,r23,r45,r67,r6x,r89,r8x;
	xmmi r12,r34,r56,r78,r9x;
	xmmi r5619;
	xmmi c1,c2,c3;

	r0123 = _mm_load_si128((xmmi*)in + 0);
	r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
	r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
	r4567 = _mm_load_si128((xmmi*)in + 1);
	r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
	r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
	r89 = _mm_load_si128((xmmi*)in + 2);
	r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

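	/*
		Repeated squaring: the intermediate result stays in registers in
		paired (r01,r23,r45,r67,r89) form across iterations; it is only
		rearranged back into the linear limb layout and stored after the
		loop.
	*/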
	do {
		r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
		r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
		r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
		r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
		r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
		r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
		r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
		r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
		r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
		r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
		r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
		r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
		r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
		r5619 = _mm_mul_epu32(r56, packednineteen.v);
		r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
		r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
		r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
		r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
		r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
		r7 = _mm_mul_epu32(r7, packed3819.v);
		r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
		r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
		r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
		r8 = _mm_mul_epu32(r8, packednineteen.v);
		r9  = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
		r9x  = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
		r9 = _mm_mul_epu32(r9, packed3819.v);
		r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

		m01 = _mm_mul_epu32(r01, r0);
		m23 = _mm_mul_epu32(r23, r0a);
		m45 = _mm_mul_epu32(r45, r0a);
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
		r23 = _mm_slli_epi32(r23, 1);
		m67 = _mm_mul_epu32(r67, r0a);
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
		m89 = _mm_mul_epu32(r89, r0a);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
		r67 = _mm_slli_epi32(r67, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
		r45 = _mm_slli_epi32(r45, 1);

		r1 = _mm_slli_epi32(r1, 1);
		r3 = _mm_slli_epi32(r3, 1);
		r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
		r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));

		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
		r34 = _mm_slli_epi32(r34, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
		r78 = _mm_slli_epi32(r78, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
		r56 = _mm_slli_epi32(r56, 1);

		m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

		r0 = _mm_unpacklo_epi64(m01, m45);
		r1 = _mm_unpackhi_epi64(m01, m45);
		r2 = _mm_unpacklo_epi64(m23, m67);
		r3 = _mm_unpackhi_epi64(m23, m67);
		r4 = _mm_unpacklo_epi64(m89, m89);
		r5 = _mm_unpackhi_epi64(m89, m89);

		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
		c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
		c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1);
		c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

		r01 = _mm_unpacklo_epi64(r0, r1);
		r45 = _mm_unpackhi_epi64(r0, r1);
		r23 = _mm_unpacklo_epi64(r2, r3);
		r67 = _mm_unpackhi_epi64(r2, r3);
		r89 = _mm_unpackhi_epi64(r4, r5);
	} while (--count);

	r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
	r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
	r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
	r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
	r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

	_mm_store_si128((xmmi*)r + 0, r0123);
	_mm_store_si128((xmmi*)r + 1, r4567);
	_mm_store_si128((xmmi*)r + 2, r89);
}

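/*
	The tangle/untangle helpers interleave the limbs of two field elements
	(typically an x/z pair) so one SSE2 instruction processes corresponding
	limbs of both at once: tangle32 puts x[i] and z[i] in adjacent 32-bit
	lanes, while tangle64 places them in the even 32-bit lanes that
	_mm_mul_epu32 reads.
*/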
DONNA_INLINE static void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));
	z0 = _mm_load_si128((xmmi *)(z + 0));
	z1 = _mm_load_si128((xmmi *)(z + 4));
	z2 = _mm_load_si128((xmmi *)(z + 8));

	out[0].v = _mm_unpacklo_epi32(x0, z0);
	out[1].v = _mm_unpackhi_epi32(x0, z0);
	out[2].v = _mm_unpacklo_epi32(x1, z1);
	out[3].v = _mm_unpackhi_epi32(x1, z1);
	out[4].v = _mm_unpacklo_epi32(x2, z2);
}

DONNA_INLINE static void
curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
	t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	_mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
	_mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
	_mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
	_mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
	_mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
	_mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
}

DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, s[0].v);
	r1 = _mm_add_epi32(r[1].v, s[1].v);
	r2 = _mm_add_epi32(r[2].v, s[2].v);
	r3 = _mm_add_epi32(r[3].v, s[3].v);
	r4 = _mm_add_epi32(r[4].v, s[4].v);

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);  /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);  /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	out[0].v = _mm_add_epi32(r[0].v, s[0].v);
	out[1].v = _mm_add_epi32(r[1].v, s[1].v);
	out[2].v = _mm_add_epi32(r[2].v, s[2].v);
	out[3].v = _mm_add_epi32(r[3].v, s[3].v);
	out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0,  _mm_slli_si128(c2, 8));

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = r4;
}

DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);  /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);  /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
	xmmi c0,c1,c2,c3,c4,c5,t;
	xmmi d0,d1,d2,d3,d4,d5;
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
	c0 = _mm_unpacklo_epi64(t0, t1);
	c3 = _mm_unpackhi_epi64(t0, t1);
	d0 = _mm_unpacklo_epi64(t2, t3);
	d3 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);

	t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
	c1 = _mm_unpacklo_epi64(t0, t1);
	c4 = _mm_unpackhi_epi64(t0, t1);
	d1 = _mm_unpacklo_epi64(t2, t3);
	d4 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);

	t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	c2 = _mm_unpacklo_epi64(t4, zero);
	c5 = _mm_unpackhi_epi64(t4, zero);
	t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
	d2 = _mm_unpacklo_epi64(t4, zero);
	d5 = _mm_unpackhi_epi64(t4, zero);
	t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2,t;

	x0 = _mm_load_si128((xmmi *)x + 0);
	x1 = _mm_load_si128((xmmi *)x + 1);
	x2 = _mm_load_si128((xmmi *)x + 2);
	z0 = _mm_load_si128((xmmi *)z + 0);
	z1 = _mm_load_si128((xmmi *)z + 1);
	z2 = _mm_load_si128((xmmi *)z + 2);

	t = _mm_unpacklo_epi64(x0, z0);	out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x0, z0);	out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x1, z1);	out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x1, z1);	out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x2, z2);	out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
	xmmi x0,x1,x2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));

	out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
	out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
	out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
	out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
	out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
	out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
	out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
	out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
	out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
	out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
}

DONNA_INLINE static void
curve25519_swap64(packedelem64 *out) {
	out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
	out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
	out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
	out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
	out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
	out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
	out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
	out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
	out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
	out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
}

DONNA_INLINE static void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
	_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v)                                                          );
	_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v)                                                          );
}

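/* multiply two tangled pairs: each 64-bit lane carries the corresponding
   limb of one element of the pair, so this computes two independent field
   multiplications at once */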
810 DONNA_INLINE static void
curve25519_mul_packed64(packedelem64 * out,const packedelem64 * r,const packedelem64 * s)811 curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
812 	xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
813 	xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
814 	xmmi c1,c2;
815 
816 	out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
817 	out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
818 	r1_2 = _mm_slli_epi32(r[1].v, 1);
819 	out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
820 	out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
821 	r3_2 = _mm_slli_epi32(r[3].v, 1);
822 	out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
823 	out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
824 	r5_2 = _mm_slli_epi32(r[5].v, 1);
825 	out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
826 	out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v  , s[0].v))))))));
827 	r7_2 = _mm_slli_epi32(r[7].v, 1);
828 	out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2  , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
829 	out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
830 
831 	r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
832 	r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
833 	r1_2 = _mm_slli_epi32(r1, 1);
834 	r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
835 	r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
836 	r3_2 = _mm_slli_epi32(r3, 1);
837 	r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
838 	r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
839 	r5_2 = _mm_slli_epi32(r5, 1);
840 	r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
841 	r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
842 	r7_2 = _mm_slli_epi32(r7, 1);
843 	r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
844 	r9_2 = _mm_slli_epi32(r9, 1);
845 
846 	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
847 	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3  , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
848 	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
849 	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
850 	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
851 	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
852 	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
853 	out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[8].v), _mm_mul_epu32(r8, s[9].v)));
854 	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
855 
856 	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
857 	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
858 	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
859 	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
860 	                                   c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
861 	                                   c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
862 	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
863 }
864 
865 DONNA_INLINE static void
curve25519_square_packed64(packedelem64 * out,const packedelem64 * r)866 curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
867 	xmmi r0,r1,r2,r3;
868 	xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
869 	xmmi d5,d6,d7,d8,d9;
870 	xmmi c1,c2;
871 
872 	r0 = r[0].v;
873 	r1 = r[1].v;
874 	r2 = r[2].v;
875 	r3 = r[3].v;
876 
877 	out[0].v = _mm_mul_epu32(r0, r0);
878 	r0 = _mm_slli_epi32(r0, 1);
879 	out[1].v = _mm_mul_epu32(r0, r1);
880 	r1_2 = _mm_slli_epi32(r1, 1);
881 	out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2    ), _mm_mul_epu32(r1, r1_2));
882 	r1 = r1_2;
883 	out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3    ), _mm_mul_epu32(r1, r2  ));
884 	r3_2 = _mm_slli_epi32(r3, 1);
885 	out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2  ), _mm_mul_epu32(r2, r2)));
886 	r2 = _mm_slli_epi32(r2, 1);
887 	out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
888 	r5_2 = _mm_slli_epi32(r[5].v, 1);
889 	out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2  ))));
890 	r3 = r3_2;
891 	out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
892 	r7_2 = _mm_slli_epi32(r[7].v, 1);
893 	out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2  ), _mm_mul_epu32(r[4].v, r[4].v)))));
894 	out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2  )))));
895 
896 	d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
897 	d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
898 	d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
899 	d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
900 	d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
901 
902 	r4_2 = _mm_slli_epi32(r[4].v, 1);
903 	r6_2 = _mm_slli_epi32(r[6].v, 1);
904 	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1                   ), _mm_add_epi64(_mm_mul_epu32(d8, r2  ), _mm_add_epi64(_mm_mul_epu32(d7, r3    ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
905 	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3  ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2  )))));
906 	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3                   ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2  ), _mm_mul_epu32(d6, r[6].v)))));
907 	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v               ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
908 	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2                 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
909 	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v               ), _mm_mul_epu32(d8, r7_2  )));
910 	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2                 ), _mm_mul_epu32(d8, r[8].v)));
911 	out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
912 	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
913 
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	                                   c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
	                                   c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}


/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
	uint32_t x0,x1,x2,x3,x4,x5,x6,x7;

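	/* these casts assume a little-endian target that tolerates unaligned
	   32-bit loads, which holds for the x86 family this SSE2 path targets */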
	x0 = *(uint32_t *)(in + 0);
	x1 = *(uint32_t *)(in + 4);
	x2 = *(uint32_t *)(in + 8);
	x3 = *(uint32_t *)(in + 12);
	x4 = *(uint32_t *)(in + 16);
	x5 = *(uint32_t *)(in + 20);
	x6 = *(uint32_t *)(in + 24);
	x7 = *(uint32_t *)(in + 28);

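	/* limbs alternate between 26 and 25 bits (radix 2^25.5): out[i] holds
	   the bits of the 255-bit value starting at bit ceil(25.5 * i), hence
	   the shift amounts and the alternating 0x3ffffff/0x1ffffff masks */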
	out[0] = (                        x0       ) & 0x3ffffff;
	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
	out[4] = ((                       x3) >>  6) & 0x3ffffff;
	out[5] = (                        x4       ) & 0x1ffffff;
	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
	out[9] = ((                       x7) >>  6) & 0x1ffffff;
	out[10] = 0;
	out[11] = 0;
}

/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
	bignum25519 ALIGN(16) f;
	curve25519_copy(f, in);

	#define carry_pass() \
		f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
		f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
		f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
		f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
		f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
		f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
		f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
		f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
		f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

	#define carry_pass_full() \
		carry_pass() \
		f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;
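	/* the carry out of f[9] wraps into f[0] times 19 because 2^255 = 19 mod p */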

	#define carry_pass_final() \
		carry_pass() \
		f[9] &= 0x1ffffff;

	carry_pass_full()
	carry_pass_full()

	/* now f is between 0 and 2^255-1, properly carried. */
	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
	f[0] += 19;
	carry_pass_full()

	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
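	/* the per-limb offsets below sum to exactly p = 2^255 - 19 (the 2^26
	   and 2^25 terms telescope to 2^255), so together with the +19 above
	   and the final truncation at bit 255 this computes f mod p without
	   a branch */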
	f[0] += (1 << 26) - 19;
	f[1] += (1 << 25) - 1;
	f[2] += (1 << 26) - 1;
	f[3] += (1 << 25) - 1;
	f[4] += (1 << 26) - 1;
	f[5] += (1 << 25) - 1;
	f[6] += (1 << 26) - 1;
	f[7] += (1 << 25) - 1;
	f[8] += (1 << 26) - 1;
	f[9] += (1 << 25) - 1;

	/* now between 2^255 and 2^256-20, and offset by 2^255. */
	carry_pass_final()

	#undef carry_pass
	#undef carry_pass_full
	#undef carry_pass_final

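	/* shift each limb so its low bit lands on the byte boundary it will be
	   stored at: limb i starts at bit ceil(25.5 * i), e.g. f[1] starts at
	   bit 26 and is stored at byte 3 (bit 24), hence the shift by 2 */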
	f[1] <<= 2;
	f[2] <<= 3;
	f[3] <<= 5;
	f[4] <<= 6;
	f[6] <<= 1;
	f[7] <<= 3;
	f[8] <<= 4;
	f[9] <<= 6;

	#define F(i, s) \
		out[s+0] |= (unsigned char)(f[i] & 0xff); \
		out[s+1] = (unsigned char)((f[i] >> 8) & 0xff); \
		out[s+2] = (unsigned char)((f[i] >> 16) & 0xff); \
		out[s+3] = (unsigned char)((f[i] >> 24) & 0xff);

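	/* adjacent limbs overlap within a byte, so F() ORs its first byte into
	   the previous limb's last byte; bytes 0 and 16 are cleared up front
	   because limbs 0 and 5 start exactly on those byte boundaries */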
	out[0] = 0;
	out[16] = 0;
	F(0,0);
	F(1,3);
	F(2,6);
	F(3,9);
	F(4,12);
	F(5,16);
	F(6,19);
	F(7,22);
	F(8,25);
	F(9,28);
	#undef F
}
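
/* a minimal round-trip sketch (hypothetical caller; the ALIGN(16) follows
 * this file's convention for bignum25519 locals):
 *
 *     bignum25519 ALIGN(16) t;
 *     unsigned char bytes[32];
 *     curve25519_expand(t, in);        expand 32 little-endian bytes
 *     curve25519_contract(bytes, t);   contract back to 32 bytes
 */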

/* if (iswap) swap(a, b) */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
	const uint32_t swap = (uint32_t)(-(int32_t)iswap);
	xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
	xmmi mask = _mm_cvtsi32_si128(swap);
	mask = _mm_shuffle_epi32(mask, 0);
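	/* mask is all ones when iswap is 1 and all zeros when it is 0; the
	   branch-free XOR swap below computes x = a ^ (mask & (a ^ b)), then
	   recovers the other operand as x ^ (a ^ b) */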
	a0 = _mm_load_si128((xmmi *)a + 0);
	a1 = _mm_load_si128((xmmi *)a + 1);
	b0 = _mm_load_si128((xmmi *)b + 0);
	b1 = _mm_load_si128((xmmi *)b + 1);
	b0 = _mm_xor_si128(a0, b0);
	b1 = _mm_xor_si128(a1, b1);
	x0 = _mm_and_si128(b0, mask);
	x1 = _mm_and_si128(b1, mask);
	x0 = _mm_xor_si128(x0, a0);
	x1 = _mm_xor_si128(x1, a1);
	a0 = _mm_xor_si128(x0, b0);
	a1 = _mm_xor_si128(x1, b1);
	_mm_store_si128((xmmi *)a + 0, x0);
	_mm_store_si128((xmmi *)a + 1, x1);
	_mm_store_si128((xmmi *)b + 0, a0);
	_mm_store_si128((xmmi *)b + 1, a1);

	a2 = _mm_load_si128((xmmi *)a + 2);
	b2 = _mm_load_si128((xmmi *)b + 2);
	b2 = _mm_xor_si128(a2, b2);
	x2 = _mm_and_si128(b2, mask);
	x2 = _mm_xor_si128(x2, a2);
	a2 = _mm_xor_si128(x2, b2);
	_mm_store_si128((xmmi *)b + 2, a2);
	_mm_store_si128((xmmi *)a + 2, x2);
}

/* out = (flag) ? in : out */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
	xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
	const uint32_t nb = flag - 1;
	xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
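	/* nb = flag - 1 is all ones when flag is 0 (keep out) and all zeros
	   when flag is 1 (take in); below, out = (in & ~masknb) | (out & masknb) */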
	a0 = _mm_load_si128((xmmi *)in + 0);
	a1 = _mm_load_si128((xmmi *)in + 1);
	a2 = _mm_load_si128((xmmi *)in + 2);
	b0 = _mm_load_si128((xmmi *)out + 0);
	b1 = _mm_load_si128((xmmi *)out + 1);
	b2 = _mm_load_si128((xmmi *)out + 2);
	a0 = _mm_andnot_si128(masknb, a0);
	a1 = _mm_andnot_si128(masknb, a1);
	a2 = _mm_andnot_si128(masknb, a2);
	b0 = _mm_and_si128(masknb, b0);
	b1 = _mm_and_si128(masknb, b1);
	b2 = _mm_and_si128(masknb, b2);
	a0 = _mm_or_si128(a0, b0);
	a1 = _mm_or_si128(a1, b1);
	a2 = _mm_or_si128(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);

	a3 = _mm_load_si128((xmmi *)in + 3);
	a4 = _mm_load_si128((xmmi *)in + 4);
	a5 = _mm_load_si128((xmmi *)in + 5);
	b3 = _mm_load_si128((xmmi *)out + 3);
	b4 = _mm_load_si128((xmmi *)out + 4);
	b5 = _mm_load_si128((xmmi *)out + 5);
	a3 = _mm_andnot_si128(masknb, a3);
	a4 = _mm_andnot_si128(masknb, a4);
	a5 = _mm_andnot_si128(masknb, a5);
	b3 = _mm_and_si128(masknb, b3);
	b4 = _mm_and_si128(masknb, b4);
	b5 = _mm_and_si128(masknb, b5);
	a3 = _mm_or_si128(a3, b3);
	a4 = _mm_or_si128(a4, b4);
	a5 = _mm_or_si128(a5, b5);
	_mm_store_si128((xmmi*)out + 3, a3);
	_mm_store_si128((xmmi*)out + 4, a4);
	_mm_store_si128((xmmi*)out + 5, a5);
}

#endif /* defined(ED25519_SSE2) */