1 /* Copyright 2008, Google Inc.
2  * All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * curve25519-donna: Curve25519 elliptic curve, public key function
31  *
32  * http://code.google.com/p/curve25519-donna/
33  *
34  * Adam Langley <agl@imperialviolet.org>
35  *
36  * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
37  *
38  * More information about curve25519 can be found here
39  *   http://cr.yp.to/ecdh.html
40  *
41  * djb's sample implementation of curve25519 is written in a special assembly
42  * language called qhasm and uses the floating point registers.
43  *
44  * This is, almost, a clean room reimplementation from the curve25519 paper. It
45  * uses many of the tricks described therein. Only the crecip function is taken
46  * from the sample implementation.
47  */
48 
49 #include <string.h>
50 #include <stdint.h>
51 
52 #ifdef _MSC_VER
53 #define inline __inline
54 #endif
55 
56 typedef uint8_t u8;
57 typedef int32_t s32;
58 typedef int64_t limb;
59 
60 /* Field element representation:
61  *
62  * Field elements are written as an array of signed, 64-bit limbs, least
63  * significant first. The value of the field element is:
64  *   x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
65  *
66  * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
67  */
68 
69 /* Sum two numbers: output += in */
fsum(limb * output,const limb * in)70 static void fsum(limb *output, const limb *in) {
71   unsigned i;
72   for (i = 0; i < 10; i += 2) {
73     output[0+i] = (output[0+i] + in[0+i]);
74     output[1+i] = (output[1+i] + in[1+i]);
75   }
76 }
77 
78 /* Find the difference of two numbers: output = in - output
79  * (note the order of the arguments!)
80  */
fdifference(limb * output,const limb * in)81 static void fdifference(limb *output, const limb *in) {
82   unsigned i;
83   for (i = 0; i < 10; ++i) {
84     output[i] = (in[i] - output[i]);
85   }
86 }
87 
88 /* Multiply a number by a scalar: output = in * scalar */
fscalar_product(limb * output,const limb * in,const limb scalar)89 static void fscalar_product(limb *output, const limb *in, const limb scalar) {
90   unsigned i;
91   for (i = 0; i < 10; ++i) {
92     output[i] = in[i] * scalar;
93   }
94 }
95 
96 /* Multiply two numbers: output = in2 * in
97  *
98  * output must be distinct to both inputs. The inputs are reduced coefficient
99  * form, the output is not.
100  */
fproduct(limb * output,const limb * in2,const limb * in)101 static void fproduct(limb *output, const limb *in2, const limb *in) {
102   output[0] =       ((limb) ((s32) in2[0])) * ((s32) in[0]);
103   output[1] =       ((limb) ((s32) in2[0])) * ((s32) in[1]) +
104                     ((limb) ((s32) in2[1])) * ((s32) in[0]);
105   output[2] =  2 *  ((limb) ((s32) in2[1])) * ((s32) in[1]) +
106                     ((limb) ((s32) in2[0])) * ((s32) in[2]) +
107                     ((limb) ((s32) in2[2])) * ((s32) in[0]);
108   output[3] =       ((limb) ((s32) in2[1])) * ((s32) in[2]) +
109                     ((limb) ((s32) in2[2])) * ((s32) in[1]) +
110                     ((limb) ((s32) in2[0])) * ((s32) in[3]) +
111                     ((limb) ((s32) in2[3])) * ((s32) in[0]);
112   output[4] =       ((limb) ((s32) in2[2])) * ((s32) in[2]) +
113                2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
114                     ((limb) ((s32) in2[3])) * ((s32) in[1])) +
115                     ((limb) ((s32) in2[0])) * ((s32) in[4]) +
116                     ((limb) ((s32) in2[4])) * ((s32) in[0]);
117   output[5] =       ((limb) ((s32) in2[2])) * ((s32) in[3]) +
118                     ((limb) ((s32) in2[3])) * ((s32) in[2]) +
119                     ((limb) ((s32) in2[1])) * ((s32) in[4]) +
120                     ((limb) ((s32) in2[4])) * ((s32) in[1]) +
121                     ((limb) ((s32) in2[0])) * ((s32) in[5]) +
122                     ((limb) ((s32) in2[5])) * ((s32) in[0]);
123   output[6] =  2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
124                     ((limb) ((s32) in2[1])) * ((s32) in[5]) +
125                     ((limb) ((s32) in2[5])) * ((s32) in[1])) +
126                     ((limb) ((s32) in2[2])) * ((s32) in[4]) +
127                     ((limb) ((s32) in2[4])) * ((s32) in[2]) +
128                     ((limb) ((s32) in2[0])) * ((s32) in[6]) +
129                     ((limb) ((s32) in2[6])) * ((s32) in[0]);
130   output[7] =       ((limb) ((s32) in2[3])) * ((s32) in[4]) +
131                     ((limb) ((s32) in2[4])) * ((s32) in[3]) +
132                     ((limb) ((s32) in2[2])) * ((s32) in[5]) +
133                     ((limb) ((s32) in2[5])) * ((s32) in[2]) +
134                     ((limb) ((s32) in2[1])) * ((s32) in[6]) +
135                     ((limb) ((s32) in2[6])) * ((s32) in[1]) +
136                     ((limb) ((s32) in2[0])) * ((s32) in[7]) +
137                     ((limb) ((s32) in2[7])) * ((s32) in[0]);
138   output[8] =       ((limb) ((s32) in2[4])) * ((s32) in[4]) +
139                2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
140                     ((limb) ((s32) in2[5])) * ((s32) in[3]) +
141                     ((limb) ((s32) in2[1])) * ((s32) in[7]) +
142                     ((limb) ((s32) in2[7])) * ((s32) in[1])) +
143                     ((limb) ((s32) in2[2])) * ((s32) in[6]) +
144                     ((limb) ((s32) in2[6])) * ((s32) in[2]) +
145                     ((limb) ((s32) in2[0])) * ((s32) in[8]) +
146                     ((limb) ((s32) in2[8])) * ((s32) in[0]);
147   output[9] =       ((limb) ((s32) in2[4])) * ((s32) in[5]) +
148                     ((limb) ((s32) in2[5])) * ((s32) in[4]) +
149                     ((limb) ((s32) in2[3])) * ((s32) in[6]) +
150                     ((limb) ((s32) in2[6])) * ((s32) in[3]) +
151                     ((limb) ((s32) in2[2])) * ((s32) in[7]) +
152                     ((limb) ((s32) in2[7])) * ((s32) in[2]) +
153                     ((limb) ((s32) in2[1])) * ((s32) in[8]) +
154                     ((limb) ((s32) in2[8])) * ((s32) in[1]) +
155                     ((limb) ((s32) in2[0])) * ((s32) in[9]) +
156                     ((limb) ((s32) in2[9])) * ((s32) in[0]);
157   output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
158                     ((limb) ((s32) in2[3])) * ((s32) in[7]) +
159                     ((limb) ((s32) in2[7])) * ((s32) in[3]) +
160                     ((limb) ((s32) in2[1])) * ((s32) in[9]) +
161                     ((limb) ((s32) in2[9])) * ((s32) in[1])) +
162                     ((limb) ((s32) in2[4])) * ((s32) in[6]) +
163                     ((limb) ((s32) in2[6])) * ((s32) in[4]) +
164                     ((limb) ((s32) in2[2])) * ((s32) in[8]) +
165                     ((limb) ((s32) in2[8])) * ((s32) in[2]);
166   output[11] =      ((limb) ((s32) in2[5])) * ((s32) in[6]) +
167                     ((limb) ((s32) in2[6])) * ((s32) in[5]) +
168                     ((limb) ((s32) in2[4])) * ((s32) in[7]) +
169                     ((limb) ((s32) in2[7])) * ((s32) in[4]) +
170                     ((limb) ((s32) in2[3])) * ((s32) in[8]) +
171                     ((limb) ((s32) in2[8])) * ((s32) in[3]) +
172                     ((limb) ((s32) in2[2])) * ((s32) in[9]) +
173                     ((limb) ((s32) in2[9])) * ((s32) in[2]);
174   output[12] =      ((limb) ((s32) in2[6])) * ((s32) in[6]) +
175                2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
176                     ((limb) ((s32) in2[7])) * ((s32) in[5]) +
177                     ((limb) ((s32) in2[3])) * ((s32) in[9]) +
178                     ((limb) ((s32) in2[9])) * ((s32) in[3])) +
179                     ((limb) ((s32) in2[4])) * ((s32) in[8]) +
180                     ((limb) ((s32) in2[8])) * ((s32) in[4]);
181   output[13] =      ((limb) ((s32) in2[6])) * ((s32) in[7]) +
182                     ((limb) ((s32) in2[7])) * ((s32) in[6]) +
183                     ((limb) ((s32) in2[5])) * ((s32) in[8]) +
184                     ((limb) ((s32) in2[8])) * ((s32) in[5]) +
185                     ((limb) ((s32) in2[4])) * ((s32) in[9]) +
186                     ((limb) ((s32) in2[9])) * ((s32) in[4]);
187   output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
188                     ((limb) ((s32) in2[5])) * ((s32) in[9]) +
189                     ((limb) ((s32) in2[9])) * ((s32) in[5])) +
190                     ((limb) ((s32) in2[6])) * ((s32) in[8]) +
191                     ((limb) ((s32) in2[8])) * ((s32) in[6]);
192   output[15] =      ((limb) ((s32) in2[7])) * ((s32) in[8]) +
193                     ((limb) ((s32) in2[8])) * ((s32) in[7]) +
194                     ((limb) ((s32) in2[6])) * ((s32) in[9]) +
195                     ((limb) ((s32) in2[9])) * ((s32) in[6]);
196   output[16] =      ((limb) ((s32) in2[8])) * ((s32) in[8]) +
197                2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
198                     ((limb) ((s32) in2[9])) * ((s32) in[7]));
199   output[17] =      ((limb) ((s32) in2[8])) * ((s32) in[9]) +
200                     ((limb) ((s32) in2[9])) * ((s32) in[8]);
201   output[18] = 2 *  ((limb) ((s32) in2[9])) * ((s32) in[9]);
202 }
203 
204 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
freduce_degree(limb * output)205 static void freduce_degree(limb *output) {
206   /* Each of these shifts and adds ends up multiplying the value by 19. */
207   output[8] += output[18] << 4;
208   output[8] += output[18] << 1;
209   output[8] += output[18];
210   output[7] += output[17] << 4;
211   output[7] += output[17] << 1;
212   output[7] += output[17];
213   output[6] += output[16] << 4;
214   output[6] += output[16] << 1;
215   output[6] += output[16];
216   output[5] += output[15] << 4;
217   output[5] += output[15] << 1;
218   output[5] += output[15];
219   output[4] += output[14] << 4;
220   output[4] += output[14] << 1;
221   output[4] += output[14];
222   output[3] += output[13] << 4;
223   output[3] += output[13] << 1;
224   output[3] += output[13];
225   output[2] += output[12] << 4;
226   output[2] += output[12] << 1;
227   output[2] += output[12];
228   output[1] += output[11] << 4;
229   output[1] += output[11] << 1;
230   output[1] += output[11];
231   output[0] += output[10] << 4;
232   output[0] += output[10] << 1;
233   output[0] += output[10];
234 }
235 
236 #if (-1 & 3) != 3
237 #error "This code only works on a two's complement system"
238 #endif
239 
240 /* return v / 2^26, using only shifts and adds. */
241 static inline limb
div_by_2_26(const limb v)242 div_by_2_26(const limb v)
243 {
244   /* High word of v; no shift needed*/
245   const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
246   /* Set to all 1s if v was negative; else set to 0s. */
247   const int32_t sign = ((int32_t) highword) >> 31;
248   /* Set to 0x3ffffff if v was negative; else set to 0. */
249   const int32_t roundoff = ((uint32_t) sign) >> 6;
250   /* Should return v / (1<<26) */
251   return (v + roundoff) >> 26;
252 }
253 
254 /* return v / (2^25), using only shifts and adds. */
255 static inline limb
div_by_2_25(const limb v)256 div_by_2_25(const limb v)
257 {
258   /* High word of v; no shift needed*/
259   const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
260   /* Set to all 1s if v was negative; else set to 0s. */
261   const int32_t sign = ((int32_t) highword) >> 31;
262   /* Set to 0x1ffffff if v was negative; else set to 0. */
263   const int32_t roundoff = ((uint32_t) sign) >> 7;
264   /* Should return v / (1<<25) */
265   return (v + roundoff) >> 25;
266 }
267 
268 static inline s32
div_s32_by_2_25(const s32 v)269 div_s32_by_2_25(const s32 v)
270 {
271    const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
272    return (v + roundoff) >> 25;
273 }
274 
275 /* Reduce all coefficients of the short form input so that |x| < 2^26.
276  *
277  * On entry: |output[i]| < 2^62
278  */
freduce_coefficients(limb * output)279 static void freduce_coefficients(limb *output) {
280   unsigned i;
281 
282   output[10] = 0;
283 
284   for (i = 0; i < 10; i += 2) {
285     limb over = div_by_2_26(output[i]);
286     output[i] -= over << 26;
287     output[i+1] += over;
288 
289     over = div_by_2_25(output[i+1]);
290     output[i+1] -= over << 25;
291     output[i+2] += over;
292   }
293   /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
294   output[0] += output[10] << 4;
295   output[0] += output[10] << 1;
296   output[0] += output[10];
297 
298   output[10] = 0;
299 
300   /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
301    * So |over| will be no more than 77825  */
302   {
303     limb over = div_by_2_26(output[0]);
304     output[0] -= over << 26;
305     output[1] += over;
306   }
307 
308   /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
309    * So |over| will be no more than 1. */
310   {
311     /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
312     s32 over32 = div_s32_by_2_25((s32) output[1]);
313     output[1] -= over32 << 25;
314     output[2] += over32;
315   }
316 
317   /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
318    * we have |output[2]| <= 2^26.  This is good enough for all of our math,
319    * but it will require an extra freduce_coefficients before fcontract. */
320 }
321 
322 /* A helpful wrapper around fproduct: output = in * in2.
323  *
324  * output must be distinct to both inputs. The output is reduced degree and
325  * reduced coefficient.
326  */
327 static void
fmul(limb * output,const limb * in,const limb * in2)328 fmul(limb *output, const limb *in, const limb *in2) {
329   limb t[19];
330   fproduct(t, in, in2);
331   freduce_degree(t);
332   freduce_coefficients(t);
333   memcpy(output, t, sizeof(limb) * 10);
334 }
335 
fsquare_inner(limb * output,const limb * in)336 static void fsquare_inner(limb *output, const limb *in) {
337   output[0] =       ((limb) ((s32) in[0])) * ((s32) in[0]);
338   output[1] =  2 *  ((limb) ((s32) in[0])) * ((s32) in[1]);
339   output[2] =  2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
340                     ((limb) ((s32) in[0])) * ((s32) in[2]));
341   output[3] =  2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
342                     ((limb) ((s32) in[0])) * ((s32) in[3]));
343   output[4] =       ((limb) ((s32) in[2])) * ((s32) in[2]) +
344                4 *  ((limb) ((s32) in[1])) * ((s32) in[3]) +
345                2 *  ((limb) ((s32) in[0])) * ((s32) in[4]);
346   output[5] =  2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
347                     ((limb) ((s32) in[1])) * ((s32) in[4]) +
348                     ((limb) ((s32) in[0])) * ((s32) in[5]));
349   output[6] =  2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
350                     ((limb) ((s32) in[2])) * ((s32) in[4]) +
351                     ((limb) ((s32) in[0])) * ((s32) in[6]) +
352                2 *  ((limb) ((s32) in[1])) * ((s32) in[5]));
353   output[7] =  2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
354                     ((limb) ((s32) in[2])) * ((s32) in[5]) +
355                     ((limb) ((s32) in[1])) * ((s32) in[6]) +
356                     ((limb) ((s32) in[0])) * ((s32) in[7]));
357   output[8] =       ((limb) ((s32) in[4])) * ((s32) in[4]) +
358                2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
359                     ((limb) ((s32) in[0])) * ((s32) in[8]) +
360                2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
361                     ((limb) ((s32) in[3])) * ((s32) in[5])));
362   output[9] =  2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
363                     ((limb) ((s32) in[3])) * ((s32) in[6]) +
364                     ((limb) ((s32) in[2])) * ((s32) in[7]) +
365                     ((limb) ((s32) in[1])) * ((s32) in[8]) +
366                     ((limb) ((s32) in[0])) * ((s32) in[9]));
367   output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
368                     ((limb) ((s32) in[4])) * ((s32) in[6]) +
369                     ((limb) ((s32) in[2])) * ((s32) in[8]) +
370                2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
371                     ((limb) ((s32) in[1])) * ((s32) in[9])));
372   output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
373                     ((limb) ((s32) in[4])) * ((s32) in[7]) +
374                     ((limb) ((s32) in[3])) * ((s32) in[8]) +
375                     ((limb) ((s32) in[2])) * ((s32) in[9]));
376   output[12] =      ((limb) ((s32) in[6])) * ((s32) in[6]) +
377                2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
378                2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
379                     ((limb) ((s32) in[3])) * ((s32) in[9])));
380   output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
381                     ((limb) ((s32) in[5])) * ((s32) in[8]) +
382                     ((limb) ((s32) in[4])) * ((s32) in[9]));
383   output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
384                     ((limb) ((s32) in[6])) * ((s32) in[8]) +
385                2 *  ((limb) ((s32) in[5])) * ((s32) in[9]));
386   output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
387                     ((limb) ((s32) in[6])) * ((s32) in[9]));
388   output[16] =      ((limb) ((s32) in[8])) * ((s32) in[8]) +
389                4 *  ((limb) ((s32) in[7])) * ((s32) in[9]);
390   output[17] = 2 *  ((limb) ((s32) in[8])) * ((s32) in[9]);
391   output[18] = 2 *  ((limb) ((s32) in[9])) * ((s32) in[9]);
392 }
393 
394 static void
fsquare(limb * output,const limb * in)395 fsquare(limb *output, const limb *in) {
396   limb t[19];
397   fsquare_inner(t, in);
398   freduce_degree(t);
399   freduce_coefficients(t);
400   memcpy(output, t, sizeof(limb) * 10);
401 }
402 
403 /* Take a little-endian, 32-byte number and expand it into polynomial form */
404 static void
fexpand(limb * output,const u8 * input)405 fexpand(limb *output, const u8 *input) {
406 #define F(n,start,shift,mask) \
407   output[n] = ((((limb) input[start + 0]) | \
408                 ((limb) input[start + 1]) << 8 | \
409                 ((limb) input[start + 2]) << 16 | \
410                 ((limb) input[start + 3]) << 24) >> shift) & mask;
411   F(0, 0, 0, 0x3ffffff);
412   F(1, 3, 2, 0x1ffffff);
413   F(2, 6, 3, 0x3ffffff);
414   F(3, 9, 5, 0x1ffffff);
415   F(4, 12, 6, 0x3ffffff);
416   F(5, 16, 0, 0x1ffffff);
417   F(6, 19, 1, 0x3ffffff);
418   F(7, 22, 3, 0x1ffffff);
419   F(8, 25, 4, 0x3ffffff);
420   F(9, 28, 6, 0x3ffffff);
421 #undef F
422 }
423 
424 #if (-32 >> 1) != -16
425 #error "This code only works when >> does sign-extension on negative numbers"
426 #endif
427 
428 /* Take a fully reduced polynomial form number and contract it into a
429  * little-endian, 32-byte array
430  */
431 static void
fcontract(u8 * output,limb * input)432 fcontract(u8 *output, limb *input) {
433   int i;
434   int j;
435 
436   for (j = 0; j < 2; ++j) {
437     for (i = 0; i < 9; ++i) {
438       if ((i & 1) == 1) {
439         /* This calculation is a time-invariant way to make input[i] positive
440            by borrowing from the next-larger limb.
441         */
442         const s32 mask = (s32)(input[i]) >> 31;
443         const s32 carry = -(((s32)(input[i]) & mask) >> 25);
444         input[i] = (s32)(input[i]) + (carry << 25);
445         input[i+1] = (s32)(input[i+1]) - carry;
446       } else {
447         const s32 mask = (s32)(input[i]) >> 31;
448         const s32 carry = -(((s32)(input[i]) & mask) >> 26);
449         input[i] = (s32)(input[i]) + (carry << 26);
450         input[i+1] = (s32)(input[i+1]) - carry;
451       }
452     }
453     {
454       const s32 mask = (s32)(input[9]) >> 31;
455       const s32 carry = -(((s32)(input[9]) & mask) >> 25);
456       input[9] = (s32)(input[9]) + (carry << 25);
457       input[0] = (s32)(input[0]) - (carry * 19);
458     }
459   }
460 
461   /* The first borrow-propagation pass above ended with every limb
462      except (possibly) input[0] non-negative.
463 
464      Since each input limb except input[0] is decreased by at most 1
465      by a borrow-propagation pass, the second borrow-propagation pass
466      could only have wrapped around to decrease input[0] again if the
467      first pass left input[0] negative *and* input[1] through input[9]
468      were all zero.  In that case, input[1] is now 2^25 - 1, and this
469      last borrow-propagation step will leave input[1] non-negative.
470   */
471   {
472     const s32 mask = (s32)(input[0]) >> 31;
473     const s32 carry = -(((s32)(input[0]) & mask) >> 26);
474     input[0] = (s32)(input[0]) + (carry << 26);
475     input[1] = (s32)(input[1]) - carry;
476   }
477 
478   /* Both passes through the above loop, plus the last 0-to-1 step, are
479      necessary: if input[9] is -1 and input[0] through input[8] are 0,
480      negative values will remain in the array until the end.
481    */
482 
483   input[1] <<= 2;
484   input[2] <<= 3;
485   input[3] <<= 5;
486   input[4] <<= 6;
487   input[6] <<= 1;
488   input[7] <<= 3;
489   input[8] <<= 4;
490   input[9] <<= 6;
491 #define F(i, s) \
492   output[s+0] |=  input[i] & 0xff; \
493   output[s+1]  = (input[i] >> 8) & 0xff; \
494   output[s+2]  = (input[i] >> 16) & 0xff; \
495   output[s+3]  = (input[i] >> 24) & 0xff;
496   output[0] = 0;
497   output[16] = 0;
498   F(0,0);
499   F(1,3);
500   F(2,6);
501   F(3,9);
502   F(4,12);
503   F(5,16);
504   F(6,19);
505   F(7,22);
506   F(8,25);
507   F(9,28);
508 #undef F
509 }
510 
511 /* Input: Q, Q', Q-Q'
512  * Output: 2Q, Q+Q'
513  *
514  *   x2 z3: long form
515  *   x3 z3: long form
516  *   x z: short form, destroyed
517  *   xprime zprime: short form, destroyed
518  *   qmqp: short form, preserved
519  */
fmonty(limb * x2,limb * z2,limb * x3,limb * z3,limb * x,limb * z,limb * xprime,limb * zprime,const limb * qmqp)520 static void fmonty(limb *x2, limb *z2,  /* output 2Q */
521                    limb *x3, limb *z3,  /* output Q + Q' */
522                    limb *x, limb *z,    /* input Q */
523                    limb *xprime, limb *zprime,  /* input Q' */
524                    const limb *qmqp /* input Q - Q' */) {
525   limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
526         zzprime[19], zzzprime[19], xxxprime[19];
527 
528   memcpy(origx, x, 10 * sizeof(limb));
529   fsum(x, z);
530   fdifference(z, origx);  // does x - z
531 
532   memcpy(origxprime, xprime, sizeof(limb) * 10);
533   fsum(xprime, zprime);
534   fdifference(zprime, origxprime);
535   fproduct(xxprime, xprime, z);
536   fproduct(zzprime, x, zprime);
537   freduce_degree(xxprime);
538   freduce_coefficients(xxprime);
539   freduce_degree(zzprime);
540   freduce_coefficients(zzprime);
541   memcpy(origxprime, xxprime, sizeof(limb) * 10);
542   fsum(xxprime, zzprime);
543   fdifference(zzprime, origxprime);
544   fsquare(xxxprime, xxprime);
545   fsquare(zzzprime, zzprime);
546   fproduct(zzprime, zzzprime, qmqp);
547   freduce_degree(zzprime);
548   freduce_coefficients(zzprime);
549   memcpy(x3, xxxprime, sizeof(limb) * 10);
550   memcpy(z3, zzprime, sizeof(limb) * 10);
551 
552   fsquare(xx, x);
553   fsquare(zz, z);
554   fproduct(x2, xx, zz);
555   freduce_degree(x2);
556   freduce_coefficients(x2);
557   fdifference(zz, xx);  // does zz = xx - zz
558   memset(zzz + 10, 0, sizeof(limb) * 9);
559   fscalar_product(zzz, zz, 121665);
560   /* No need to call freduce_degree here:
561      fscalar_product doesn't increase the degree of its input. */
562   freduce_coefficients(zzz);
563   fsum(zzz, xx);
564   fproduct(z2, zz, zzz);
565   freduce_degree(z2);
566   freduce_coefficients(z2);
567 }
568 
569 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
570  * them unchanged if 'iswap' is 0.  Runs in data-invariant time to avoid
571  * side-channel attacks.
572  *
573  * NOTE that this function requires that 'iswap' be 1 or 0; other values give
574  * wrong results.  Also, the two limb arrays must be in reduced-coefficient,
575  * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
576  * and all all values in a[0..9],b[0..9] must have magnitude less than
577  * INT32_MAX.
578  */
579 static void
swap_conditional(limb a[19],limb b[19],limb iswap)580 swap_conditional(limb a[19], limb b[19], limb iswap) {
581   unsigned i;
582   const s32 swap = (s32) -iswap;
583 
584   for (i = 0; i < 10; ++i) {
585     const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
586     a[i] = ((s32)a[i]) ^ x;
587     b[i] = ((s32)b[i]) ^ x;
588   }
589 }
590 
591 /* Calculates nQ where Q is the x-coordinate of a point on the curve
592  *
593  *   resultx/resultz: the x coordinate of the resulting curve point (short form)
594  *   n: a little endian, 32-byte number
595  *   q: a point of the curve (short form)
596  */
597 static void
cmult(limb * resultx,limb * resultz,const u8 * n,const limb * q)598 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
599   limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
600   limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
601   limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
602   limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
603 
604   unsigned i, j;
605 
606   memcpy(nqpqx, q, sizeof(limb) * 10);
607 
608   for (i = 0; i < 32; ++i) {
609     u8 byte = n[31 - i];
610     for (j = 0; j < 8; ++j) {
611       const limb bit = byte >> 7;
612 
613       swap_conditional(nqx, nqpqx, bit);
614       swap_conditional(nqz, nqpqz, bit);
615       fmonty(nqx2, nqz2,
616              nqpqx2, nqpqz2,
617              nqx, nqz,
618              nqpqx, nqpqz,
619              q);
620       swap_conditional(nqx2, nqpqx2, bit);
621       swap_conditional(nqz2, nqpqz2, bit);
622 
623       t = nqx;
624       nqx = nqx2;
625       nqx2 = t;
626       t = nqz;
627       nqz = nqz2;
628       nqz2 = t;
629       t = nqpqx;
630       nqpqx = nqpqx2;
631       nqpqx2 = t;
632       t = nqpqz;
633       nqpqz = nqpqz2;
634       nqpqz2 = t;
635 
636       byte <<= 1;
637     }
638   }
639 
640   memcpy(resultx, nqx, sizeof(limb) * 10);
641   memcpy(resultz, nqz, sizeof(limb) * 10);
642 }
643 
644 // -----------------------------------------------------------------------------
645 // Shamelessly copied from djb's code
646 // -----------------------------------------------------------------------------
647 static void
crecip(limb * out,const limb * z)648 crecip(limb *out, const limb *z) {
649   limb z2[10];
650   limb z9[10];
651   limb z11[10];
652   limb z2_5_0[10];
653   limb z2_10_0[10];
654   limb z2_20_0[10];
655   limb z2_50_0[10];
656   limb z2_100_0[10];
657   limb t0[10];
658   limb t1[10];
659   int i;
660 
661   /* 2 */ fsquare(z2,z);
662   /* 4 */ fsquare(t1,z2);
663   /* 8 */ fsquare(t0,t1);
664   /* 9 */ fmul(z9,t0,z);
665   /* 11 */ fmul(z11,z9,z2);
666   /* 22 */ fsquare(t0,z11);
667   /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
668 
669   /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
670   /* 2^7 - 2^2 */ fsquare(t1,t0);
671   /* 2^8 - 2^3 */ fsquare(t0,t1);
672   /* 2^9 - 2^4 */ fsquare(t1,t0);
673   /* 2^10 - 2^5 */ fsquare(t0,t1);
674   /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
675 
676   /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
677   /* 2^12 - 2^2 */ fsquare(t1,t0);
678   /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
679   /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
680 
681   /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
682   /* 2^22 - 2^2 */ fsquare(t1,t0);
683   /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
684   /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
685 
686   /* 2^41 - 2^1 */ fsquare(t1,t0);
687   /* 2^42 - 2^2 */ fsquare(t0,t1);
688   /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
689   /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
690 
691   /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
692   /* 2^52 - 2^2 */ fsquare(t1,t0);
693   /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
694   /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
695 
696   /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
697   /* 2^102 - 2^2 */ fsquare(t0,t1);
698   /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
699   /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
700 
701   /* 2^201 - 2^1 */ fsquare(t0,t1);
702   /* 2^202 - 2^2 */ fsquare(t1,t0);
703   /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
704   /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
705 
706   /* 2^251 - 2^1 */ fsquare(t1,t0);
707   /* 2^252 - 2^2 */ fsquare(t0,t1);
708   /* 2^253 - 2^3 */ fsquare(t1,t0);
709   /* 2^254 - 2^4 */ fsquare(t0,t1);
710   /* 2^255 - 2^5 */ fsquare(t1,t0);
711   /* 2^255 - 21 */ fmul(out,t1,z11);
712 }
713 
714 int curve25519_donna(u8 *, const u8 *, const u8 *);
715 
716 int
curve25519_donna(u8 * mypublic,const u8 * secret,const u8 * basepoint)717 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
718   limb bp[10], x[10], z[11], zmone[10];
719   uint8_t e[32];
720   int i;
721 
722   for (i = 0; i < 32; ++i) e[i] = secret[i];
723   e[0] &= 248;
724   e[31] &= 127;
725   e[31] |= 64;
726 
727   fexpand(bp, basepoint);
728   cmult(x, z, e, bp);
729   crecip(zmone, z);
730   fmul(z, x, zmone);
731   freduce_coefficients(z);
732   fcontract(mypublic, z);
733   return 0;
734 }
735