/*
 * Implementation lifted from Monocypher.
 * Removed some macro fsckery, more needed.
 */
#include <inttypes.h>
#include <stddef.h>
#include <string.h>

#include "x25519.h"

///////////////
/// X-25519 ///
///////////////

////////////////////////////////////
/// Arithmetic modulo 2^255 - 19 ///
////////////////////////////////////
//  Taken from SUPERCOP's ref10 implementation.
//  A bit bigger than TweetNaCl, over 4 times faster.

#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
#define FOR(i, start, end)         FOR_T(size_t, i, start, end)

typedef int8_t   i8;
typedef uint8_t  u8;
typedef int16_t  i16;
typedef uint32_t u32;
typedef int32_t  i32;
typedef int64_t  i64;
typedef uint64_t u64;

// field element
typedef int32_t fe[10];
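
// ref10 uses radix 2^25.5: limb i carries bit offset ceil(25.5 * i), so
// even limbs hold 26 bits and odd limbs 25.  A field element is
//     f[0] + f[1]*2^26 + f[2]*2^51 + ... + f[9]*2^230   (mod 2^255 - 19)
// Limbs are signed, which lets additions and subtractions defer carrying.
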
static void fe_0(fe h) {            FOR(i, 0, 10) h[i] = 0; }
static void fe_1(fe h) { h[0] = 1;  FOR(i, 1, 10) h[i] = 0; }

static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}

static u32 load24_le(const u8 s[3])
{
    return (u32)s[0]
        | ((u32)s[1] <<  8)
        | ((u32)s[2] << 16);
}

static u32 load32_le(const u8 s[4])
{
    return (u32)s[0]
        | ((u32)s[1] <<  8)
        | ((u32)s[2] << 16)
        | ((u32)s[3] << 24);
}

static u64 load64_le(const u8 s[8])
{
    return load32_le(s) | ((u64)load32_le(s+4) << 32);
}

static void store32_le(u8 out[4], u32 in)
{
    out[0] =  in        & 0xff;
    out[1] = (in >>  8) & 0xff;
    out[2] = (in >> 16) & 0xff;
    out[3] = (in >> 24) & 0xff;
}

static void store64_le(u8 out[8], u64 in)
{
    store32_le(out    , (u32)in );
    store32_le(out + 4, in >> 32);
}

static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }

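// Constant-time nonzero test: fold the 64-bit difference into 32 bits
// (half == 0 iff diff == 0), then (half - 1) >> 32 is all-ones exactly
// when half == 0, giving 0 for "equal" and -1 for "different" with no
// secret-dependent branch.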
static int neq0(u64 diff)
{   // constant time comparison to zero
    // return diff != 0 ? -1 : 0
    u64 half = (diff >> 32) | ((u32)diff);
    return (1 & ((half - 1) >> 32)) - 1;
}

static u64 x16(const u8 a[16], const u8 b[16])
{
    return (load64_le(a + 0) ^ load64_le(b + 0))
        |  (load64_le(a + 8) ^ load64_le(b + 8));
}

static void fe_cswap(fe f, fe g, int b)
{
    i32 mask = -b; // rely on 2's complement: -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
        g[i] = g[i] ^ x;
    }
}

static void fe_ccopy(fe f, const fe g, int b)
{
    i32 mask = -b; // rely on 2's complement: -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
    }
}

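// Carry propagation: squeeze each limb back down to 26/25 bits, pushing
// the excess into the next limb.  The top limb's excess wraps around to
// t0 multiplied by 19, because 2^255 = 19 (mod 2^255 - 19).  Adding
// 2^24 (resp. 2^25) before the arithmetic shift rounds the carry to
// nearest, keeping limbs roughly centred around zero.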
#define FE_CARRY                                                        \
    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                         \
    c9 = (t9 + (i64)(1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25); \
    c1 = (t1 + (i64)(1<<24)) >> 25; t2 += c1;      t1 -= c1 * (1 << 25); \
    c3 = (t3 + (i64)(1<<24)) >> 25; t4 += c3;      t3 -= c3 * (1 << 25); \
    c5 = (t5 + (i64)(1<<24)) >> 25; t6 += c5;      t5 -= c5 * (1 << 25); \
    c7 = (t7 + (i64)(1<<24)) >> 25; t8 += c7;      t7 -= c7 * (1 << 25); \
    c0 = (t0 + (i64)(1<<25)) >> 26; t1 += c0;      t0 -= c0 * (1 << 26); \
    c2 = (t2 + (i64)(1<<25)) >> 26; t3 += c2;      t2 -= c2 * (1 << 26); \
    c4 = (t4 + (i64)(1<<25)) >> 26; t5 += c4;      t4 -= c4 * (1 << 26); \
    c6 = (t6 + (i64)(1<<25)) >> 26; t7 += c6;      t6 -= c6 * (1 << 26); \
    c8 = (t8 + (i64)(1<<25)) >> 26; t9 += c8;      t8 -= c8 * (1 << 26); \
    h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
    h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9

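// Unpack 32 little-endian bytes into the 10 limbs above: each 24/32-bit
// load starts at the byte containing the limb's bit offset and is shifted
// into place (limb 1 sits at bit 26; byte 4 starts at bit 32, hence
// << 6).  The top bit of byte 31 is masked off, as X25519 requires.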
static void fe_frombytes(fe h, const uint8_t s[32])
{
    i64 t0 =  load32_le(s);
    i64 t1 =  load24_le(s +  4) << 6;
    i64 t2 =  load24_le(s +  7) << 5;
    i64 t3 =  load24_le(s + 10) << 3;
    i64 t4 =  load24_le(s + 13) << 2;
    i64 t5 =  load32_le(s + 16);
    i64 t6 =  load24_le(s + 20) << 7;
    i64 t7 =  load24_le(s + 23) << 5;
    i64 t8 =  load24_le(s + 26) << 4;
    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
    FE_CARRY;
}

static void fe_mul_small(fe h, const fe f, i32 g)
{
    i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
    i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
    i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
    i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
    i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
    FE_CARRY;
}
static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }

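// Schoolbook multiplication with the modular reduction folded in: cross
// terms whose combined weight reaches 2^255 wrap around with a factor of
// 19 (the G1..G9 precomputations), and odd limbs of f are pre-doubled
// (F1..F9) because offset(i) + offset(j) overshoots the target limb's
// offset by one bit when both i and j are odd.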
static void fe_mul(fe h, const fe f, const fe g)
{
    // Everything is unrolled and put in temporary variables.
    // We could roll the loop, but that would make curve25519 twice as slow.
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
    i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
    i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
    i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
    i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
    i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;

    i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
        +    F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
    i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
        +    f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
    i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
        +    F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
    i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
        +    f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
    i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
        +    F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
    i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
        +    f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
    i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
        +    F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
    i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
        +    f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
    i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
        +    F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
    i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
        +    f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;

#define CARRY                                                             \
    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                           \
    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 * (1 << 26); \
    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 * (1 << 26); \
    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= c1 * (1 << 25); \
    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= c5 * (1 << 25); \
    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= c2 * (1 << 26); \
    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= c6 * (1 << 26); \
    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= c3 * (1 << 25); \
    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= c7 * (1 << 25); \
    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 * (1 << 26); \
    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= c8 * (1 << 26); \
    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \
    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 * (1 << 26); \
    h[0]=(i32)h0;  h[1]=(i32)h1;  h[2]=(i32)h2;  h[3]=(i32)h3;  h[4]=(i32)h4; \
    h[5]=(i32)h5;  h[6]=(i32)h6;  h[7]=(i32)h7;  h[8]=(i32)h8;  h[9]=(i32)h9

    CARRY;
}

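// Same scheme as fe_mul with f == g: symmetric cross terms are merged,
// so some factors appear pre-doubled (f0_2, ...) and the wrap-around
// constants show up as 19 or 38 (= 2*19).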
// we could use fe_mul() for this, but this is significantly faster
static void fe_sq(fe h, const fe f)
{
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;   i32 f3_2 = f3*2;
    i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;   i32 f7_2 = f7*2;
    i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
    i32 f8_19 = f8*19;  i32 f9_38 = f9*38;

    i64 h0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
        +    f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
    i64 h1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
        +    f4  *(i64)f7_38 + f5_2*(i64)f6_19;
    i64 h2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
        +    f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
    i64 h3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
        +    f5_2*(i64)f8_19 + f6  *(i64)f7_38;
    i64 h4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
        +    f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
    i64 h5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
        +    f6  *(i64)f9_38 + f7_2*(i64)f8_19;
    i64 h6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
        +    f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
    i64 h7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
        +    f3_2*(i64)f4    + f8  *(i64)f9_38;
    i64 h8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
        +    f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
    i64 h9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
        +    f3_2*(i64)f6    + f4  *(i64)f5_2;

    CARRY;
}

static void fe_sq2(fe h, const fe f)
{
    fe_sq(h, f);
    fe_mul_small(h, h, 2);
}

// This could be simplified, but it would be slower
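// Fermat inversion: computes z^(p-2) = z^(2^255 - 21) with a fixed
// addition chain (254 squarings, 11 multiplications), so it runs in
// constant time; z = 0 maps to 0.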
static void fe_invert(fe out, const fe z)
{
    fe t0, t1, t2, t3;
    fe_sq(t0, z );
    fe_sq(t1, t0);
    fe_sq(t1, t1);
    fe_mul(t1,  z, t1);
    fe_mul(t0, t0, t1);
    fe_sq(t2, t0);                                fe_mul(t1 , t1, t2);
    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
}

// This could be simplified, but it would be slower
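// Computes z^((p-5)/8) = z^(2^252 - 3).  ref10 uses this for the square
// root in Ed25519 point decompression; the X25519 path below never
// calls it.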
static void fe_pow22523(fe out, const fe z)
{
    fe t0, t1, t2;
    fe_sq(t0, z);
    fe_sq(t1,t0);                   fe_sq(t1, t1);  fe_mul(t1, z, t1);
    fe_mul(t0, t0, t1);
    fe_sq(t0, t0);                                  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
}

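// Serialise to the unique canonical representative below p = 2^255 - 19:
// the first loop computes q, which is 1 exactly when the value is >= p,
// so adding 19*q and then discarding bit 255 subtracts q*p.  The reduced
// limbs are then packed into 32 little-endian bytes.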
static void fe_tobytes(uint8_t s[32], const fe h)
{
    i32 t[10];
    FOR (i, 0, 10) {
        t[i] = h[i];
    }
    i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
    FOR (i, 0, 5) {
        q += t[2*i  ]; q >>= 26;
        q += t[2*i+1]; q >>= 25;
    }
    t[0] += 19 * q;

    i32 c0 = t[0] >> 26; t[1] += c0; t[0] -= c0 * (1 << 26);
    i32 c1 = t[1] >> 25; t[2] += c1; t[1] -= c1 * (1 << 25);
    i32 c2 = t[2] >> 26; t[3] += c2; t[2] -= c2 * (1 << 26);
    i32 c3 = t[3] >> 25; t[4] += c3; t[3] -= c3 * (1 << 25);
    i32 c4 = t[4] >> 26; t[5] += c4; t[4] -= c4 * (1 << 26);
    i32 c5 = t[5] >> 25; t[6] += c5; t[5] -= c5 * (1 << 25);
    i32 c6 = t[6] >> 26; t[7] += c6; t[6] -= c6 * (1 << 26);
    i32 c7 = t[7] >> 25; t[8] += c7; t[7] -= c7 * (1 << 25);
    i32 c8 = t[8] >> 26; t[9] += c8; t[8] -= c8 * (1 << 26);
    i32 c9 = t[9] >> 25;             t[9] -= c9 * (1 << 25);

    store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
    store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
    store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
    store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
    store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
    store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
    store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
    store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));
}

static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b) | x16(a+16,b+16);}

static int crypto_verify32(const u8 a[32], const u8 b[32])
{
    return neq0(x32(a, b));
}

static int zerocmp32(const u8 p[32])
{
    static const u8 zero[32] = {0};
    return crypto_verify32(p, zero);
}

//  Parity check.  Returns 0 if even, 1 if odd
static int fe_isnegative(const fe f)
{
    uint8_t s[32];
    fe_tobytes(s, f);
    uint8_t isneg = s[0] & 1;
    return isneg;
}

static int fe_isnonzero(const fe f)
{
    uint8_t s[32];
    fe_tobytes(s, f);
    int isnonzero = zerocmp32(s);
    return isnonzero;
}

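// RFC 7748 scalar clamping: clear the low 3 bits (a multiple of the
// cofactor 8 kills small-subgroup components), clear bit 255 and set
// bit 254 so the ladder always starts from the same fixed top bit.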
static void trim_scalar(uint8_t s[32])
{
    s[ 0] &= 248;
    s[31] &= 127;
    s[31] |= 64;
}

static int scalar_bit(const uint8_t s[32], int i) {
    if (i < 0) { return 0; } // handle -1 for sliding windows
    return (s[i>>3] >> (i&7)) & 1;
}

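// arcan_random() is the host's CSPRNG; it is expected to fill dst with
// nb cryptographically secure random bytes.  Key generation is just
// clamping applied to 32 such bytes.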
extern void arcan_random(uint8_t* dst, size_t nb);
void x25519_private_key(uint8_t secret[static 32])
{
    arcan_random(secret, 32);
    trim_scalar(secret);
}

void x25519_public_key(
    const uint8_t secret[static 32], uint8_t public[static 32])
{
    const uint8_t basepoint[32] = {9};
    x25519_shared_secret(public, secret, basepoint);
}

int x25519_shared_secret(
    uint8_t secret_out[static 32],
    const uint8_t secret[static 32], const uint8_t ext_pub[static 32])
{
    // computes the scalar product
    fe x1;
    fe_frombytes(x1, ext_pub);

    // restrict the possible scalar values
    uint8_t e[32];
    memcpy(e, secret, 32);
    trim_scalar(e);

    // computes the actual scalar product (the result is in x2 and z2)
    fe x2, z2, x3, z3, t0, t1;
    // Montgomery ladder
    // In projective coordinates, to avoid divisions: x = X / Z
    // We don't care about the y coordinate, it's only 1 bit of information
    fe_1(x2);        fe_0(z2); // "zero" point
    fe_copy(x3, x1); fe_1(z3); // "one"  point
    int swap = 0;
    for (int pos = 254; pos >= 0; --pos) {
        // constant time conditional swap before ladder step
        int b = scalar_bit(e, pos);
        swap ^= b; // xor trick avoids swapping at the end of the loop
        fe_cswap(x2, x3, swap);
        fe_cswap(z2, z3, swap);
        swap = b;  // anticipates one last swap after the loop

        // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
        // with differential addition
        fe_sub(t0, x3, z3);  fe_sub(t1, x2, z2);    fe_add(x2, x2, z2);
        fe_add(z2, x3, z3);  fe_mul(z3, t0, x2);    fe_mul(z2, z2, t1);
        fe_sq (t0, t1    );  fe_sq (t1, x2    );    fe_add(x3, z3, z2);
        fe_sub(z2, z3, z2);  fe_mul(x2, t1, t0);    fe_sub(t1, t1, t0);
        fe_sq (z2, z2    );  fe_mul121666(z3, t1);  fe_sq (x3, x3    );
        fe_add(t0, t0, z3);  fe_mul(z3, x1, z2);    fe_mul(z2, t1, t0);
    }
    // last swap is necessary to compensate for the xor trick
    // Note: after this swap, P3 == P2 + P1.
    fe_cswap(x2, x3, swap);
    fe_cswap(z2, z3, swap);

    // normalises the coordinates: x == X / Z
    fe_invert(z2, z2);
    fe_mul(x2, x2, z2);
    fe_tobytes(secret_out, x2);

    // Returns -1 if the output is all zero
    // (happens with some malicious public keys)
    return -1 - zerocmp32(secret_out);
}
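
/*
 * Typical usage (hypothetical caller; everything except the x25519_*
 * functions is illustrative):
 *
 *     uint8_t my_sk[32], my_pk[32], shared[32];
 *     x25519_private_key(my_sk);        // random, pre-clamped scalar
 *     x25519_public_key(my_sk, my_pk);  // my_sk * base point (u = 9)
 *
 *     // ... exchange public keys, then:
 *     if (x25519_shared_secret(shared, my_sk, their_pk) != 0) {
 *         // all-zero output: reject the peer's public key
 *     }
 *     // Hash `shared` before use; the raw ladder output is not uniform.
 */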