1 /*
2 * Implementation lifted from Monocypher
3 * Removed some macro fsckery, more needed.
4 */
5 #include <inttypes.h>
6 #include <stddef.h>
7 #include <string.h>
8
9 #include "x25519.h"
10
11 ///////////////
12 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
13 ///////////////
14
15 ////////////////////////////////////
16 /// Arithmetic modulo 2^255 - 19 ///
17 ////////////////////////////////////
18 // Taken from SUPERCOP's ref10 implementation.
19 // A bit bigger than TweetNaCl, over 4 times faster.
20
21 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
22 #define FOR(i, start, end) FOR_T(size_t, i, start, end)
23
24 typedef int8_t i8;
25 typedef uint8_t u8;
26 typedef int16_t i16;
27 typedef uint32_t u32;
28 typedef int32_t i32;
29 typedef int64_t i64;
30 typedef uint64_t u64;
31
32 // field element
33 typedef int32_t fe[10];
fe_0(fe h)34 static void fe_0(fe h) { FOR(i, 0, 10) h[i] = 0; }
fe_1(fe h)35 static void fe_1(fe h) { h[0] = 1; FOR(i, 1, 10) h[i] = 0; }
36
fe_copy(fe h,const fe f)37 static void fe_copy(fe h,const fe f ){FOR(i,0,10) h[i] = f[i]; }
fe_neg(fe h,const fe f)38 static void fe_neg (fe h,const fe f ){FOR(i,0,10) h[i] = -f[i]; }
fe_add(fe h,const fe f,const fe g)39 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
fe_sub(fe h,const fe f,const fe g)40 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
41
load24_le(const u8 s[3])42 static u32 load24_le(const u8 s[3])
43 {
44 return (u32)s[0]
45 | ((u32)s[1] << 8)
46 | ((u32)s[2] << 16);
47 }
48
load32_le(const u8 s[4])49 static u32 load32_le(const u8 s[4])
50 {
51 return (u32)s[0]
52 | ((u32)s[1] << 8)
53 | ((u32)s[2] << 16)
54 | ((u32)s[3] << 24);
55 }
56
load64_le(const u8 s[8])57 static u64 load64_le(const u8 s[8])
58 {
59 return load32_le(s) | ((u64)load32_le(s+4) << 32);
60 }
61
store32_le(u8 out[4],u32 in)62 static void store32_le(u8 out[4], u32 in)
63 {
64 out[0] = in & 0xff;
65 out[1] = (in >> 8) & 0xff;
66 out[2] = (in >> 16) & 0xff;
67 out[3] = (in >> 24) & 0xff;
68 }
69
store64_le(u8 out[8],u64 in)70 static void store64_le(u8 out[8], u64 in)
71 {
72 store32_le(out , (u32)in );
73 store32_le(out + 4, in >> 32);
74 }
75
rotr64(u64 x,u64 n)76 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
rotl32(u32 x,u32 n)77 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
78
neq0(u64 diff)79 static int neq0(u64 diff)
80 { // constant time comparison to zero
81 // return diff != 0 ? -1 : 0
82 u64 half = (diff >> 32) | ((u32)diff);
83 return (1 & ((half - 1) >> 32)) - 1;
84 }
85
x16(const u8 a[16],const u8 b[16])86 static u64 x16(const u8 a[16], const u8 b[16])
87 {
88 return (load64_le(a + 0) ^ load64_le(b + 0))
89 | (load64_le(a + 8) ^ load64_le(b + 8));
90 }
91
fe_cswap(fe f,fe g,int b)92 static void fe_cswap(fe f, fe g, int b)
93 {
94 i32 mask = -b; // rely on 2's complement: -1 = 0xffffffff
95 FOR (i, 0, 10) {
96 i32 x = (f[i] ^ g[i]) & mask;
97 f[i] = f[i] ^ x;
98 g[i] = g[i] ^ x;
99 }
100 }
101
fe_ccopy(fe f,const fe g,int b)102 static void fe_ccopy(fe f, const fe g, int b)
103 {
104 i32 mask = -b; // rely on 2's complement: -1 = 0xffffffff
105 FOR (i, 0, 10) {
106 i32 x = (f[i] ^ g[i]) & mask;
107 f[i] = f[i] ^ x;
108 }
109 }
110
111 #define FE_CARRY \
112 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \
113 c9 = (t9 + (i64)(1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25); \
114 c1 = (t1 + (i64)(1<<24)) >> 25; t2 += c1; t1 -= c1 * (1 << 25); \
115 c3 = (t3 + (i64)(1<<24)) >> 25; t4 += c3; t3 -= c3 * (1 << 25); \
116 c5 = (t5 + (i64)(1<<24)) >> 25; t6 += c5; t5 -= c5 * (1 << 25); \
117 c7 = (t7 + (i64)(1<<24)) >> 25; t8 += c7; t7 -= c7 * (1 << 25); \
118 c0 = (t0 + (i64)(1<<25)) >> 26; t1 += c0; t0 -= c0 * (1 << 26); \
119 c2 = (t2 + (i64)(1<<25)) >> 26; t3 += c2; t2 -= c2 * (1 << 26); \
120 c4 = (t4 + (i64)(1<<25)) >> 26; t5 += c4; t4 -= c4 * (1 << 26); \
121 c6 = (t6 + (i64)(1<<25)) >> 26; t7 += c6; t6 -= c6 * (1 << 26); \
122 c8 = (t8 + (i64)(1<<25)) >> 26; t9 += c8; t8 -= c8 * (1 << 26); \
123 h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
124 h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
125
fe_frombytes(fe h,const uint8_t s[32])126 static void fe_frombytes(fe h, const uint8_t s[32])
127 {
128 i64 t0 = load32_le(s);
129 i64 t1 = load24_le(s + 4) << 6;
130 i64 t2 = load24_le(s + 7) << 5;
131 i64 t3 = load24_le(s + 10) << 3;
132 i64 t4 = load24_le(s + 13) << 2;
133 i64 t5 = load32_le(s + 16);
134 i64 t6 = load24_le(s + 20) << 7;
135 i64 t7 = load24_le(s + 23) << 5;
136 i64 t8 = load24_le(s + 26) << 4;
137 i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
138 FE_CARRY;
139 }
140
fe_mul_small(fe h,const fe f,i32 g)141 static void fe_mul_small(fe h, const fe f, i32 g)
142 {
143 i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g;
144 i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g;
145 i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g;
146 i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g;
147 i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g;
148 FE_CARRY;
149 }
fe_mul121666(fe h,const fe f)150 static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }
151
fe_mul(fe h,const fe f,const fe g)152 static void fe_mul(fe h, const fe f, const fe g)
153 {
154 // Everything is unrolled and put in temporary variables.
155 // We could roll the loop, but that would make curve25519 twice as slow.
156 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
157 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
158 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
159 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
160 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
161 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
162 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
163 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
164
165 i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
166 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
167 i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
168 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
169 i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
170 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
171 i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
172 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
173 i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
174 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
175 i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
176 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
177 i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
178 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
179 i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
180 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
181 i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
182 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
183 i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
184 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
185
186 #define CARRY \
187 i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \
188 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \
189 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \
190 c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 * (1 << 25); \
191 c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 * (1 << 25); \
192 c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 * (1 << 26); \
193 c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 * (1 << 26); \
194 c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 * (1 << 25); \
195 c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 * (1 << 25); \
196 c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 * (1 << 26); \
197 c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 * (1 << 26); \
198 c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \
199 c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 * (1 << 26); \
200 h[0]=(i32)h0; h[1]=(i32)h1; h[2]=(i32)h2; h[3]=(i32)h3; h[4]=(i32)h4; \
201 h[5]=(i32)h5; h[6]=(i32)h6; h[7]=(i32)h7; h[8]=(i32)h8; h[9]=(i32)h9
202
203 CARRY;
204 }
205
206 // we could use fe_mul() for this, but this is significantly faster
fe_sq(fe h,const fe f)207 static void fe_sq(fe h, const fe f)
208 {
209 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
210 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
211 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
212 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
213 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
214 i32 f8_19 = f8*19; i32 f9_38 = f9*38;
215
216 i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
217 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38;
218 i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19
219 + f4 *(i64)f7_38 + f5_2*(i64)f6_19;
220 i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38
221 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19;
222 i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38
223 + f5_2*(i64)f8_19 + f6 *(i64)f7_38;
224 i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2
225 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38;
226 i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3
227 + f6 *(i64)f9_38 + f7_2*(i64)f8_19;
228 i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4
229 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19;
230 i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5
231 + f3_2*(i64)f4 + f8 *(i64)f9_38;
232 i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6
233 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38;
234 i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7
235 + f3_2*(i64)f6 + f4 *(i64)f5_2;
236
237 CARRY;
238 }
239
fe_sq2(fe h,const fe f)240 static void fe_sq2(fe h, const fe f)
241 {
242 fe_sq(h, f);
243 fe_mul_small(h, h, 2);
244 }
245
246 // This could be simplified, but it would be slower
fe_invert(fe out,const fe z)247 static void fe_invert(fe out, const fe z)
248 {
249 fe t0, t1, t2, t3;
250 fe_sq(t0, z );
251 fe_sq(t1, t0);
252 fe_sq(t1, t1);
253 fe_mul(t1, z, t1);
254 fe_mul(t0, t0, t1);
255 fe_sq(t2, t0); fe_mul(t1 , t1, t2);
256 fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
257 fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
258 fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
259 fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
260 fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
261 fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
262 fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
263 fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0);
264 }
265
266 // This could be simplified, but it would be slower
fe_pow22523(fe out,const fe z)267 static void fe_pow22523(fe out, const fe z)
268 {
269 fe t0, t1, t2;
270 fe_sq(t0, z);
271 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1);
272 fe_mul(t0, t0, t1);
273 fe_sq(t0, t0); fe_mul(t0, t1, t0);
274 fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0);
275 fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0);
276 fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1);
277 fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0);
278 fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0);
279 fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1);
280 fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0);
281 fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z);
282 }
283
fe_tobytes(uint8_t s[32],const fe h)284 static void fe_tobytes(uint8_t s[32], const fe h)
285 {
286 i32 t[10];
287 FOR (i, 0, 10) {
288 t[i] = h[i];
289 }
290 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
291 FOR (i, 0, 5) {
292 q += t[2*i ]; q >>= 26;
293 q += t[2*i+1]; q >>= 25;
294 }
295 t[0] += 19 * q;
296
297 i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= c0 * (1 << 26);
298 i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= c1 * (1 << 25);
299 i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= c2 * (1 << 26);
300 i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= c3 * (1 << 25);
301 i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= c4 * (1 << 26);
302 i32 c5 = t[5] >> 25; t[6] += c5; t[5] -= c5 * (1 << 25);
303 i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= c6 * (1 << 26);
304 i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= c7 * (1 << 25);
305 i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= c8 * (1 << 26);
306 i32 c9 = t[9] >> 25; t[9] -= c9 * (1 << 25);
307
308 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26));
309 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19));
310 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
311 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6));
312 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25));
313 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19));
314 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
315 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6));
316 }
317
x32(const u8 a[32],const u8 b[32])318 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
319
crypto_verify32(const u8 a[32],const u8 b[32])320 static int crypto_verify32(const u8 a[32], const u8 b[32])
321 {
322 return neq0(x32(a, b));
323 }
324
zerocmp32(const u8 p[32])325 static int zerocmp32(const u8 p[32])
326 {
327 static const u8 zero[128] = {0};
328 return crypto_verify32(p, zero);
329 }
330
331 // Parity check. Returns 0 if even, 1 if odd
fe_isnegative(const fe f)332 static int fe_isnegative(const fe f)
333 {
334 uint8_t s[32];
335 fe_tobytes(s, f);
336 uint8_t isneg = s[0] & 1;
337 return isneg;
338 }
339
fe_isnonzero(const fe f)340 static int fe_isnonzero(const fe f)
341 {
342 uint8_t s[32];
343 fe_tobytes(s, f);
344 int isnonzero = zerocmp32(s);
345 return isnonzero;
346 }
347
trim_scalar(uint8_t s[32])348 static void trim_scalar(uint8_t s[32])
349 {
350 s[ 0] &= 248;
351 s[31] &= 127;
352 s[31] |= 64;
353 }
354
scalar_bit(const uint8_t s[32],int i)355 static int scalar_bit(const uint8_t s[32], int i) {
356 if (i < 0) { return 0; } // handle -1 for sliding windows
357 return (s[i>>3] >> (i&7)) & 1;
358 }
359
360 extern void arcan_random(uint8_t* dst, size_t nb);
x25519_private_key(uint8_t secret[static32])361 void x25519_private_key(uint8_t secret[static 32])
362 {
363 arcan_random(secret, 32);
364 secret[0] &= 248;
365 secret[31] &= 127;
366 secret[31] |= 64;
367 }
368
x25519_public_key(const uint8_t secret[static32],uint8_t public[static32])369 void x25519_public_key(
370 const uint8_t secret[static 32], uint8_t public[static 32])
371 {
372 const uint8_t basepoint[32] = {9};
373 x25519_shared_secret(public, secret, basepoint);
374 }
375
x25519_shared_secret(uint8_t secret_out[static32],const uint8_t secret[static32],const uint8_t ext_pub[static32])376 int x25519_shared_secret(
377 uint8_t secret_out[static 32],
378 const uint8_t secret [static 32], const uint8_t ext_pub [static 32])
379 {
380 // computes the scalar product
381 fe x1;
382 fe_frombytes(x1, ext_pub);
383
384 // restrict the possible scalar values
385 uint8_t e[32];
386 memcpy(e, secret, 32);
387 trim_scalar(e);
388
389 // computes the actual scalar product (the result is in x2 and z2)
390 fe x2, z2, x3, z3, t0, t1;
391 // Montgomery ladder
392 // In projective coordinates, to avoid divisions: x = X / Z
393 // We don't care about the y coordinate, it's only 1 bit of information
394 fe_1(x2); fe_0(z2); // "zero" point
395 fe_copy(x3, x1); fe_1(z3); // "one" point
396 int swap = 0;
397 for (int pos = 254; pos >= 0; --pos) {
398 // constant time conditional swap before ladder step
399 int b = scalar_bit(e, pos);
400 swap ^= b; // xor trick avoids swapping at the end of the loop
401 fe_cswap(x2, x3, swap);
402 fe_cswap(z2, z3, swap);
403 swap = b; // anticipates one last swap after the loop
404
405 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
406 // with differential addition
407 fe_sub(t0, x3, z3); fe_sub(t1, x2, z2); fe_add(x2, x2, z2);
408 fe_add(z2, x3, z3); fe_mul(z3, t0, x2); fe_mul(z2, z2, t1);
409 fe_sq (t0, t1 ); fe_sq (t1, x2 ); fe_add(x3, z3, z2);
410 fe_sub(z2, z3, z2); fe_mul(x2, t1, t0); fe_sub(t1, t1, t0);
411 fe_sq (z2, z2 ); fe_mul121666(z3, t1); fe_sq (x3, x3 );
412 fe_add(t0, t0, z3); fe_mul(z3, x1, z2); fe_mul(z2, t1, t0);
413 }
414 // last swap is necessary to compensate for the xor trick
415 // Note: after this swap, P3 == P2 + P1.
416 fe_cswap(x2, x3, swap);
417 fe_cswap(z2, z3, swap);
418
419 // normalises the coordinates: x == X / Z
420 fe_invert(z2, z2);
421 fe_mul(x2, x2, z2);
422 fe_tobytes(secret_out, x2);
423
424 // Returns -1 if the output is all zero
425 // (happens with some malicious public keys)
426 return -1 - zerocmp32(secret_out);
427 }
428