1 /*
2  * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the OpenSSL license (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2011 Google Inc.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
28  *
29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31  * work which got its smarts from Daniel J. Bernstein's work on the same.
32  */
33 
34 #include <openssl/opensslconf.h>
35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36 NON_EMPTY_TRANSLATION_UNIT
37 #else
38 
39 # include <stdint.h>
40 # include <string.h>
41 # include <openssl/err.h>
42 # include "ec_local.h"
43 
44 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
45   /* even with gcc, the typedef won't work for 32-bit platforms */
46 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
47                                  * platforms */
48 typedef __int128_t int128_t;
49 # else
50 #  error "Your compiler doesn't appear to support 128-bit integer types"
51 # endif
52 
53 typedef uint8_t u8;
54 typedef uint32_t u32;
55 typedef uint64_t u64;
56 
57 /*
58  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
59  * can serialise an element of this field into 32 bytes. We call this an
60  * felem_bytearray.
61  */
62 
63 typedef u8 felem_bytearray[32];
64 
65 /*
66  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
67  * values are big-endian.
68  */
69 static const felem_bytearray nistp256_curve_params[5] = {
70     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
71      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
72      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
74     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
75      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},
78     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
79      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
80      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
81      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
82     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
83      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
84      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
85      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
86     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
87      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
88      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
89      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
90 };
91 
92 /*-
93  * The representation of field elements.
94  * ------------------------------------
95  *
96  * We represent field elements with either four 128-bit values, eight 128-bit
97  * values, or four 64-bit values. The field element represented is:
98  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
99  * or:
100  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
101  *
 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
 * apart, but are 128 bits wide, the most significant bits of each limb overlap
 * with the least significant bits of the next.
 *
 * A field element with four limbs is an 'felem'. One with eight limbs is a
 * 'longfelem'.
 *
 * A field element with four 64-bit values is called a 'smallfelem'. Small
 * values are used as intermediate values before multiplication.
111  */
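/*-
 * A short worked illustration of the redundancy (editorial note): because
 * every limb carries the weight of a 64-bit position but is itself up to
 * 128 bits wide, the same value has many encodings. For example the value
 * 2^64 can be held entirely in the bottom limb or as a 1 in the next one:
 *   {((limb)1) << 64, 0, 0, 0}   and   {0, 1, 0, 0}
 * denote the same element. The additions and subtractions below exploit
 * this slack so that carries only have to be propagated when an felem is
 * shrunk or contracted.
 */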
112 
113 # define NLIMBS 4
114 
115 typedef uint128_t limb;
116 typedef limb felem[NLIMBS];
117 typedef limb longfelem[NLIMBS * 2];
118 typedef u64 smallfelem[NLIMBS];
119 
120 /* This is the value of the prime as four 64-bit words, little-endian. */
121 static const u64 kPrime[4] =
122     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
123 static const u64 bottom63bits = 0x7ffffffffffffffful;
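/*-
 * Editorial check of kPrime: reassembling the little-endian words gives
 *   kPrime[0] + kPrime[1]*2^64 + kPrime[2]*2^128 + kPrime[3]*2^192
 *     = (2^64 - 1) + (2^32 - 1)*2^64 + 0 + (2^64 - 2^32 + 1)*2^192
 *     = 2^256 - 2^224 + 2^192 + 2^96 - 1,
 * which is the P-256 prime quoted at the top of this file.
 */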
124 
125 /*
126  * bin32_to_felem takes a little-endian byte array and converts it into felem
127  * form. This assumes that the CPU is little-endian.
128  */
129 static void bin32_to_felem(felem out, const u8 in[32])
130 {
131     out[0] = *((u64 *)&in[0]);
132     out[1] = *((u64 *)&in[8]);
133     out[2] = *((u64 *)&in[16]);
134     out[3] = *((u64 *)&in[24]);
135 }
136 
137 /*
138  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
139  * endian, 32 byte array. This assumes that the CPU is little-endian.
140  */
141 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
142 {
143     *((u64 *)&out[0]) = in[0];
144     *((u64 *)&out[8]) = in[1];
145     *((u64 *)&out[16]) = in[2];
146     *((u64 *)&out[24]) = in[3];
147 }
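/*-
 * The two converters above cast byte pointers to u64 pointers, so besides
 * the documented little-endian assumption they also rely on the platform
 * tolerating unaligned, type-punned 64-bit accesses. As a purely editorial
 * sketch (bin32_to_felem_sketch is a made-up name, not part of this file),
 * an alignment- and aliasing-safe load side could go through memcpy
 * instead; it still assumes a little-endian CPU:
 *
 *   static void bin32_to_felem_sketch(felem out, const u8 in[32])
 *   {
 *       u64 w;
 *       unsigned i;
 *
 *       for (i = 0; i < NLIMBS; i++) {
 *           memcpy(&w, in + 8 * i, sizeof(w));
 *           out[i] = w;
 *       }
 *   }
 */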
148 
149 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
150 static int BN_to_felem(felem out, const BIGNUM *bn)
151 {
152     felem_bytearray b_out;
153     int num_bytes;
154 
155     if (BN_is_negative(bn)) {
156         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
157         return 0;
158     }
159     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
160     if (num_bytes < 0) {
161         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
162         return 0;
163     }
164     bin32_to_felem(out, b_out);
165     return 1;
166 }
167 
168 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
169 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
170 {
171     felem_bytearray b_out;
172     smallfelem_to_bin32(b_out, in);
173     return BN_lebin2bn(b_out, sizeof(b_out), out);
174 }
175 
176 /*-
177  * Field operations
178  * ----------------
179  */
180 
181 static void smallfelem_one(smallfelem out)
182 {
183     out[0] = 1;
184     out[1] = 0;
185     out[2] = 0;
186     out[3] = 0;
187 }
188 
189 static void smallfelem_assign(smallfelem out, const smallfelem in)
190 {
191     out[0] = in[0];
192     out[1] = in[1];
193     out[2] = in[2];
194     out[3] = in[3];
195 }
196 
197 static void felem_assign(felem out, const felem in)
198 {
199     out[0] = in[0];
200     out[1] = in[1];
201     out[2] = in[2];
202     out[3] = in[3];
203 }
204 
205 /* felem_sum sets out = out + in. */
206 static void felem_sum(felem out, const felem in)
207 {
208     out[0] += in[0];
209     out[1] += in[1];
210     out[2] += in[2];
211     out[3] += in[3];
212 }
213 
214 /* felem_small_sum sets out = out + in. */
215 static void felem_small_sum(felem out, const smallfelem in)
216 {
217     out[0] += in[0];
218     out[1] += in[1];
219     out[2] += in[2];
220     out[3] += in[3];
221 }
222 
223 /* felem_scalar sets out = out * scalar */
224 static void felem_scalar(felem out, const u64 scalar)
225 {
226     out[0] *= scalar;
227     out[1] *= scalar;
228     out[2] *= scalar;
229     out[3] *= scalar;
230 }
231 
232 /* longfelem_scalar sets out = out * scalar */
233 static void longfelem_scalar(longfelem out, const u64 scalar)
234 {
235     out[0] *= scalar;
236     out[1] *= scalar;
237     out[2] *= scalar;
238     out[3] *= scalar;
239     out[4] *= scalar;
240     out[5] *= scalar;
241     out[6] *= scalar;
242     out[7] *= scalar;
243 }
244 
245 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
246 # define two105 (((limb)1) << 105)
247 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
248 
249 /* zero105 is 0 mod p */
250 static const felem zero105 =
251     { two105m41m9, two105, two105m41p9, two105m41p9 };
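/*-
 * Editorial check that zero105 really is 0 mod p: reassembled from its limb
 * positions,
 *   zero105 = (2^105 - 2^41 - 2^9)
 *           + 2^105 * 2^64
 *           + (2^105 - 2^41 + 2^9) * 2^128
 *           + (2^105 - 2^41 + 2^9) * 2^192
 *           = 2^297 + 2^201 + 2^137 + 2^105 - 2^41 - 2^9
 *           = (2^41 + 2^9) * p,
 * so adding it to a field element, or subtracting a field element from it,
 * leaves the value unchanged mod p while keeping every limb positive. The
 * other zeroNNN constants below can be verified the same way.
 */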
252 
253 /*-
254  * smallfelem_neg sets |out| to |-small|
255  * On exit:
 *   out[i] <= 2^105
257  */
258 static void smallfelem_neg(felem out, const smallfelem small)
259 {
260     /* In order to prevent underflow, we subtract from 0 mod p. */
261     out[0] = zero105[0] - small[0];
262     out[1] = zero105[1] - small[1];
263     out[2] = zero105[2] - small[2];
264     out[3] = zero105[3] - small[3];
265 }
266 
267 /*-
268  * felem_diff subtracts |in| from |out|
269  * On entry:
270  *   in[i] < 2^104
271  * On exit:
272  *   out[i] < out[i] + 2^105
273  */
274 static void felem_diff(felem out, const felem in)
275 {
276     /*
277      * In order to prevent underflow, we add 0 mod p before subtracting.
278      */
279     out[0] += zero105[0];
280     out[1] += zero105[1];
281     out[2] += zero105[2];
282     out[3] += zero105[3];
283 
284     out[0] -= in[0];
285     out[1] -= in[1];
286     out[2] -= in[2];
287     out[3] -= in[3];
288 }
289 
290 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
291 # define two107 (((limb)1) << 107)
292 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
293 
294 /* zero107 is 0 mod p */
295 static const felem zero107 =
296     { two107m43m11, two107, two107m43p11, two107m43p11 };
297 
298 /*-
299  * An alternative felem_diff for larger inputs |in|
300  * felem_diff_zero107 subtracts |in| from |out|
301  * On entry:
302  *   in[i] < 2^106
303  * On exit:
304  *   out[i] < out[i] + 2^107
305  */
306 static void felem_diff_zero107(felem out, const felem in)
307 {
308     /*
309      * In order to prevent underflow, we add 0 mod p before subtracting.
310      */
311     out[0] += zero107[0];
312     out[1] += zero107[1];
313     out[2] += zero107[2];
314     out[3] += zero107[3];
315 
316     out[0] -= in[0];
317     out[1] -= in[1];
318     out[2] -= in[2];
319     out[3] -= in[3];
320 }
321 
322 /*-
323  * longfelem_diff subtracts |in| from |out|
324  * On entry:
325  *   in[i] < 7*2^67
326  * On exit:
327  *   out[i] < out[i] + 2^70 + 2^40
328  */
329 static void longfelem_diff(longfelem out, const longfelem in)
330 {
331     static const limb two70m8p6 =
332         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
333     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
334     static const limb two70 = (((limb) 1) << 70);
335     static const limb two70m40m38p6 =
336         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
337         (((limb) 1) << 6);
338     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
339 
340     /* add 0 mod p to avoid underflow */
341     out[0] += two70m8p6;
342     out[1] += two70p40;
343     out[2] += two70;
344     out[3] += two70m40m38p6;
345     out[4] += two70m6;
346     out[5] += two70m6;
347     out[6] += two70m6;
348     out[7] += two70m6;
349 
350     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
351     out[0] -= in[0];
352     out[1] -= in[1];
353     out[2] -= in[2];
354     out[3] -= in[3];
355     out[4] -= in[4];
356     out[5] -= in[5];
357     out[6] -= in[6];
358     out[7] -= in[7];
359 }
360 
361 # define two64m0 (((limb)1) << 64) - 1
362 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
363 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
364 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
365 
366 /* zero110 is 0 mod p */
367 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
368 
369 /*-
370  * felem_shrink converts an felem into a smallfelem. The result isn't quite
371  * minimal as the value may be greater than p.
372  *
373  * On entry:
374  *   in[i] < 2^109
375  * On exit:
376  *   out[i] < 2^64
377  */
378 static void felem_shrink(smallfelem out, const felem in)
379 {
380     felem tmp;
381     u64 a, b, mask;
382     u64 high, low;
383     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
384 
385     /* Carry 2->3 */
386     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
387     /* tmp[3] < 2^110 */
388 
389     tmp[2] = zero110[2] + (u64)in[2];
390     tmp[0] = zero110[0] + in[0];
391     tmp[1] = zero110[1] + in[1];
    /* tmp[0] < 2^110, tmp[1] < 2^111, tmp[2] < 2^65 */
393 
394     /*
395      * We perform two partial reductions where we eliminate the high-word of
396      * tmp[3]. We don't update the other words till the end.
397      */
398     a = tmp[3] >> 64;           /* a < 2^46 */
399     tmp[3] = (u64)tmp[3];
400     tmp[3] -= a;
401     tmp[3] += ((limb) a) << 32;
402     /* tmp[3] < 2^79 */
403 
404     b = a;
405     a = tmp[3] >> 64;           /* a < 2^15 */
406     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
407     tmp[3] = (u64)tmp[3];
408     tmp[3] -= a;
409     tmp[3] += ((limb) a) << 32;
410     /* tmp[3] < 2^64 + 2^47 */
411 
412     /*
413      * This adjusts the other two words to complete the two partial
414      * reductions.
415      */
416     tmp[0] += b;
417     tmp[1] -= (((limb) b) << 32);
418 
419     /*
420      * In order to make space in tmp[3] for the carry from 2 -> 3, we
421      * conditionally subtract kPrime if tmp[3] is large enough.
422      */
423     high = (u64)(tmp[3] >> 64);
424     /* As tmp[3] < 2^65, high is either 1 or 0 */
425     high = 0 - high;
426     /*-
427      * high is:
428      *   all ones   if the high word of tmp[3] is 1
 *   all zeros  if the high word of tmp[3] is 0
430      */
431     low = (u64)tmp[3];
432     mask = 0 - (low >> 63);
433     /*-
434      * mask is:
435      *   all ones   if the MSB of low is 1
 *   all zeros  if the MSB of low is 0
437      */
438     low &= bottom63bits;
439     low -= kPrime3Test;
440     /* if low was greater than kPrime3Test then the MSB is zero */
441     low = ~low;
442     low = 0 - (low >> 63);
443     /*-
444      * low is:
445      *   all ones   if low was > kPrime3Test
446      *   all zeros  if low was <= kPrime3Test
447      */
448     mask = (mask & low) | high;
449     tmp[0] -= mask & kPrime[0];
450     tmp[1] -= mask & kPrime[1];
451     /* kPrime[2] is zero, so omitted */
452     tmp[3] -= mask & kPrime[3];
    /* tmp[3] < 2^64 - 2^32 + 1 */
454 
455     tmp[1] += ((u64)(tmp[0] >> 64));
456     tmp[0] = (u64)tmp[0];
457     tmp[2] += ((u64)(tmp[1] >> 64));
458     tmp[1] = (u64)tmp[1];
459     tmp[3] += ((u64)(tmp[2] >> 64));
460     tmp[2] = (u64)tmp[2];
461     /* tmp[i] < 2^64 */
462 
463     out[0] = tmp[0];
464     out[1] = tmp[1];
465     out[2] = tmp[2];
466     out[3] = tmp[3];
467 }
468 
469 /* smallfelem_expand converts a smallfelem to an felem */
470 static void smallfelem_expand(felem out, const smallfelem in)
471 {
472     out[0] = in[0];
473     out[1] = in[1];
474     out[2] = in[2];
475     out[3] = in[3];
476 }
477 
478 /*-
479  * smallfelem_square sets |out| = |small|^2
480  * On entry:
481  *   small[i] < 2^64
482  * On exit:
483  *   out[i] < 7 * 2^64 < 2^67
484  */
485 static void smallfelem_square(longfelem out, const smallfelem small)
486 {
487     limb a;
488     u64 high, low;
489 
490     a = ((uint128_t) small[0]) * small[0];
491     low = a;
492     high = a >> 64;
493     out[0] = low;
494     out[1] = high;
495 
496     a = ((uint128_t) small[0]) * small[1];
497     low = a;
498     high = a >> 64;
499     out[1] += low;
500     out[1] += low;
501     out[2] = high;
502 
503     a = ((uint128_t) small[0]) * small[2];
504     low = a;
505     high = a >> 64;
506     out[2] += low;
507     out[2] *= 2;
508     out[3] = high;
509 
510     a = ((uint128_t) small[0]) * small[3];
511     low = a;
512     high = a >> 64;
513     out[3] += low;
514     out[4] = high;
515 
516     a = ((uint128_t) small[1]) * small[2];
517     low = a;
518     high = a >> 64;
519     out[3] += low;
520     out[3] *= 2;
521     out[4] += high;
522 
523     a = ((uint128_t) small[1]) * small[1];
524     low = a;
525     high = a >> 64;
526     out[2] += low;
527     out[3] += high;
528 
529     a = ((uint128_t) small[1]) * small[3];
530     low = a;
531     high = a >> 64;
532     out[4] += low;
533     out[4] *= 2;
534     out[5] = high;
535 
536     a = ((uint128_t) small[2]) * small[3];
537     low = a;
538     high = a >> 64;
539     out[5] += low;
540     out[5] *= 2;
541     out[6] = high;
542     out[6] += high;
543 
544     a = ((uint128_t) small[2]) * small[2];
545     low = a;
546     high = a >> 64;
547     out[4] += low;
548     out[5] += high;
549 
550     a = ((uint128_t) small[3]) * small[3];
551     low = a;
552     high = a >> 64;
553     out[6] += low;
554     out[7] = high;
555 }
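/*-
 * Editorial note on the accumulation above: out[k] collects the low 64-bit
 * halves of the products small[i]*small[j] with i + j == k and the high
 * halves of those with i + j == k - 1, with every off-diagonal product
 * counted twice (either by adding it twice or via the "*= 2" doublings).
 * At most seven such 64-bit words land in any one limb, which is where the
 * "out[i] < 7 * 2^64" bound in the header comment comes from; the per-limb
 * counts (1, 3, 5, 7, 7, 5, 3, 1) also match the entry bounds documented
 * for felem_reduce below.
 */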
556 
557 /*-
558  * felem_square sets |out| = |in|^2
559  * On entry:
560  *   in[i] < 2^109
561  * On exit:
562  *   out[i] < 7 * 2^64 < 2^67
563  */
564 static void felem_square(longfelem out, const felem in)
565 {
566     u64 small[4];
567     felem_shrink(small, in);
568     smallfelem_square(out, small);
569 }
570 
571 /*-
572  * smallfelem_mul sets |out| = |small1| * |small2|
573  * On entry:
574  *   small1[i] < 2^64
575  *   small2[i] < 2^64
576  * On exit:
577  *   out[i] < 7 * 2^64 < 2^67
578  */
579 static void smallfelem_mul(longfelem out, const smallfelem small1,
580                            const smallfelem small2)
581 {
582     limb a;
583     u64 high, low;
584 
585     a = ((uint128_t) small1[0]) * small2[0];
586     low = a;
587     high = a >> 64;
588     out[0] = low;
589     out[1] = high;
590 
591     a = ((uint128_t) small1[0]) * small2[1];
592     low = a;
593     high = a >> 64;
594     out[1] += low;
595     out[2] = high;
596 
597     a = ((uint128_t) small1[1]) * small2[0];
598     low = a;
599     high = a >> 64;
600     out[1] += low;
601     out[2] += high;
602 
603     a = ((uint128_t) small1[0]) * small2[2];
604     low = a;
605     high = a >> 64;
606     out[2] += low;
607     out[3] = high;
608 
609     a = ((uint128_t) small1[1]) * small2[1];
610     low = a;
611     high = a >> 64;
612     out[2] += low;
613     out[3] += high;
614 
615     a = ((uint128_t) small1[2]) * small2[0];
616     low = a;
617     high = a >> 64;
618     out[2] += low;
619     out[3] += high;
620 
621     a = ((uint128_t) small1[0]) * small2[3];
622     low = a;
623     high = a >> 64;
624     out[3] += low;
625     out[4] = high;
626 
627     a = ((uint128_t) small1[1]) * small2[2];
628     low = a;
629     high = a >> 64;
630     out[3] += low;
631     out[4] += high;
632 
633     a = ((uint128_t) small1[2]) * small2[1];
634     low = a;
635     high = a >> 64;
636     out[3] += low;
637     out[4] += high;
638 
639     a = ((uint128_t) small1[3]) * small2[0];
640     low = a;
641     high = a >> 64;
642     out[3] += low;
643     out[4] += high;
644 
645     a = ((uint128_t) small1[1]) * small2[3];
646     low = a;
647     high = a >> 64;
648     out[4] += low;
649     out[5] = high;
650 
651     a = ((uint128_t) small1[2]) * small2[2];
652     low = a;
653     high = a >> 64;
654     out[4] += low;
655     out[5] += high;
656 
657     a = ((uint128_t) small1[3]) * small2[1];
658     low = a;
659     high = a >> 64;
660     out[4] += low;
661     out[5] += high;
662 
663     a = ((uint128_t) small1[2]) * small2[3];
664     low = a;
665     high = a >> 64;
666     out[5] += low;
667     out[6] = high;
668 
669     a = ((uint128_t) small1[3]) * small2[2];
670     low = a;
671     high = a >> 64;
672     out[5] += low;
673     out[6] += high;
674 
675     a = ((uint128_t) small1[3]) * small2[3];
676     low = a;
677     high = a >> 64;
678     out[6] += low;
679     out[7] = high;
680 }
681 
682 /*-
683  * felem_mul sets |out| = |in1| * |in2|
684  * On entry:
685  *   in1[i] < 2^109
686  *   in2[i] < 2^109
687  * On exit:
688  *   out[i] < 7 * 2^64 < 2^67
689  */
690 static void felem_mul(longfelem out, const felem in1, const felem in2)
691 {
692     smallfelem small1, small2;
693     felem_shrink(small1, in1);
694     felem_shrink(small2, in2);
695     smallfelem_mul(out, small1, small2);
696 }
697 
698 /*-
699  * felem_small_mul sets |out| = |small1| * |in2|
700  * On entry:
701  *   small1[i] < 2^64
702  *   in2[i] < 2^109
703  * On exit:
704  *   out[i] < 7 * 2^64 < 2^67
705  */
706 static void felem_small_mul(longfelem out, const smallfelem small1,
707                             const felem in2)
708 {
709     smallfelem small2;
710     felem_shrink(small2, in2);
711     smallfelem_mul(out, small1, small2);
712 }
713 
714 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
715 # define two100 (((limb)1) << 100)
716 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
717 /* zero100 is 0 mod p */
718 static const felem zero100 =
719     { two100m36m4, two100, two100m36p4, two100m36p4 };
720 
721 /*-
722  * Internal function for the different flavours of felem_reduce.
723  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
724  * On entry:
725  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
726  *   out[1] >= in[7] + 2^32*in[4]
727  *   out[2] >= in[5] + 2^32*in[5]
728  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
729  * On exit:
730  *   out[0] <= out[0] + in[4] + 2^32*in[5]
731  *   out[1] <= out[1] + in[5] + 2^33*in[6]
732  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
733  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
734  */
735 static void felem_reduce_(felem out, const longfelem in)
736 {
737     int128_t c;
738     /* combine common terms from below */
739     c = in[4] + (in[5] << 32);
740     out[0] += c;
741     out[3] -= c;
742 
743     c = in[5] - in[7];
744     out[1] += c;
745     out[2] -= c;
746 
747     /* the remaining terms */
748     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
749     out[1] -= (in[4] << 32);
750     out[3] += (in[4] << 32);
751 
752     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
753     out[2] -= (in[5] << 32);
754 
755     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
756     out[0] -= in[6];
757     out[0] -= (in[6] << 32);
758     out[1] += (in[6] << 33);
759     out[2] += (in[6] * 2);
760     out[3] -= (in[6] << 32);
761 
762     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
763     out[0] -= in[7];
764     out[0] -= (in[7] << 32);
765     out[2] += (in[7] << 33);
766     out[3] += (in[7] * 3);
767 }
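/*-
 * Editorial derivation of the bracketed rows above: every one of them is a
 * consequence of the single congruence
 *   2^256 == 2^224 - 2^192 - 2^96 + 1  (mod p),
 * which is just p = 2^256 - 2^224 + 2^192 + 2^96 - 1 rearranged. The "256"
 * row [(0,1),(96,-1),(192,-1),(224,1)] is exactly this identity, and the
 * rows for 2^320, 2^384 and 2^448 follow by repeatedly multiplying it by
 * 2^64 and re-reducing any term that climbs back to 2^256 or above.
 */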
768 
769 /*-
770  * felem_reduce converts a longfelem into an felem.
771  * To be called directly after felem_square or felem_mul.
772  * On entry:
773  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2^64
775  * On exit:
776  *   out[i] < 2^101
777  */
778 static void felem_reduce(felem out, const longfelem in)
779 {
780     out[0] = zero100[0] + in[0];
781     out[1] = zero100[1] + in[1];
782     out[2] = zero100[2] + in[2];
783     out[3] = zero100[3] + in[3];
784 
785     felem_reduce_(out, in);
786 
787     /*-
788      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
789      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
790      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
791      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
792      *
793      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
794      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
795      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
796      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
797      */
798 }
799 
800 /*-
801  * felem_reduce_zero105 converts a larger longfelem into an felem.
802  * On entry:
 *   in[i] < 2^71
804  * On exit:
805  *   out[i] < 2^106
806  */
807 static void felem_reduce_zero105(felem out, const longfelem in)
808 {
809     out[0] = zero105[0] + in[0];
810     out[1] = zero105[1] + in[1];
811     out[2] = zero105[2] + in[2];
812     out[3] = zero105[3] + in[3];
813 
814     felem_reduce_(out, in);
815 
816     /*-
817      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
818      * out[1] > 2^105 - 2^71 - 2^103 > 0
819      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
820      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
821      *
822      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
823      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
824      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
825      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
826      */
827 }
828 
829 /*
830  * subtract_u64 sets *result = *result - v and *carry to one if the
831  * subtraction underflowed.
832  */
833 static void subtract_u64(u64 *result, u64 *carry, u64 v)
834 {
835     uint128_t r = *result;
836     r -= v;
837     *carry = (r >> 64) & 1;
838     *result = (u64)r;
839 }
840 
841 /*
842  * felem_contract converts |in| to its unique, minimal representation. On
843  * entry: in[i] < 2^109
844  */
845 static void felem_contract(smallfelem out, const felem in)
846 {
847     unsigned i;
848     u64 all_equal_so_far = 0, result = 0, carry;
849 
850     felem_shrink(out, in);
851     /* small is minimal except that the value might be > p */
852 
853     all_equal_so_far--;
854     /*
     * We are doing a constant-time test of whether out >= kPrime. We need to
     * compare each u64, from most significant to least significant. For each
     * one, if all words so far have been equal (all_equal_so_far is all ones)
     * then a non-equal result is the answer. Otherwise we continue.
859      */
860     for (i = 3; i < 4; i--) {
861         u64 equal;
862         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
863         /*
864          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
865          * will all be set.
866          */
867         result |= all_equal_so_far & ((u64)(a >> 64));
868 
869         /*
870          * if kPrime[i] == out[i] then |equal| will be all zeros and the
871          * decrement will make it all ones.
872          */
873         equal = kPrime[i] ^ out[i];
874         equal--;
875         equal &= equal << 32;
876         equal &= equal << 16;
877         equal &= equal << 8;
878         equal &= equal << 4;
879         equal &= equal << 2;
880         equal &= equal << 1;
881         equal = 0 - (equal >> 63);
882 
883         all_equal_so_far &= equal;
884     }
885 
886     /*
887      * if all_equal_so_far is still all ones then the two values are equal
888      * and so out >= kPrime is true.
889      */
890     result |= all_equal_so_far;
891 
892     /* if out >= kPrime then we subtract kPrime. */
893     subtract_u64(&out[0], &carry, result & kPrime[0]);
894     subtract_u64(&out[1], &carry, carry);
895     subtract_u64(&out[2], &carry, carry);
896     subtract_u64(&out[3], &carry, carry);
897 
898     subtract_u64(&out[1], &carry, result & kPrime[1]);
899     subtract_u64(&out[2], &carry, carry);
900     subtract_u64(&out[3], &carry, carry);
901 
902     subtract_u64(&out[2], &carry, result & kPrime[2]);
903     subtract_u64(&out[3], &carry, carry);
904 
905     subtract_u64(&out[3], &carry, result & kPrime[3]);
906 }
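/*-
 * felem_contract above (and smallfelem_is_zero below) use a branch-free
 * idiom for turning a 64-bit equality test into an all-ones/all-zeros mask:
 * XOR the operands, decrement, AND the result with shifted copies of itself
 * so that bit 63 becomes the AND of all 64 bits, then sign-spread that bit.
 * A self-contained sketch of the idiom (u64_eq_mask is a made-up name used
 * only for this editorial illustration):
 *
 *   static u64 u64_eq_mask(u64 a, u64 b)
 *   {
 *       u64 t = a ^ b;
 *
 *       t--;
 *       t &= t << 32;
 *       t &= t << 16;
 *       t &= t << 8;
 *       t &= t << 4;
 *       t &= t << 2;
 *       t &= t << 1;
 *       return 0 - (t >> 63);
 *   }
 *
 * If a == b the XOR is zero and the decrement wraps to all ones, so the
 * result is all ones; otherwise at least one bit of t is clear after the
 * decrement and bit 63 ends up clear, giving zero.
 */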
907 
908 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
909 {
910     longfelem longtmp;
911     felem tmp;
912 
913     smallfelem_square(longtmp, in);
914     felem_reduce(tmp, longtmp);
915     felem_contract(out, tmp);
916 }
917 
918 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
919                                     const smallfelem in2)
920 {
921     longfelem longtmp;
922     felem tmp;
923 
924     smallfelem_mul(longtmp, in1, in2);
925     felem_reduce(tmp, longtmp);
926     felem_contract(out, tmp);
927 }
928 
929 /*-
 * smallfelem_is_zero returns a limb with all bits set if |in| == 0 (mod p)
 * and 0 otherwise.
932  * On entry:
933  *   small[i] < 2^64
934  */
935 static limb smallfelem_is_zero(const smallfelem small)
936 {
937     limb result;
938     u64 is_p;
939 
940     u64 is_zero = small[0] | small[1] | small[2] | small[3];
941     is_zero--;
942     is_zero &= is_zero << 32;
943     is_zero &= is_zero << 16;
944     is_zero &= is_zero << 8;
945     is_zero &= is_zero << 4;
946     is_zero &= is_zero << 2;
947     is_zero &= is_zero << 1;
948     is_zero = 0 - (is_zero >> 63);
949 
950     is_p = (small[0] ^ kPrime[0]) |
951         (small[1] ^ kPrime[1]) |
952         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
953     is_p--;
954     is_p &= is_p << 32;
955     is_p &= is_p << 16;
956     is_p &= is_p << 8;
957     is_p &= is_p << 4;
958     is_p &= is_p << 2;
959     is_p &= is_p << 1;
960     is_p = 0 - (is_p >> 63);
961 
962     is_zero |= is_p;
963 
964     result = is_zero;
965     result |= ((limb) is_zero) << 64;
966     return result;
967 }
968 
969 static int smallfelem_is_zero_int(const void *small)
970 {
971     return (int)(smallfelem_is_zero(small) & ((limb) 1));
972 }
973 
974 /*-
975  * felem_inv calculates |out| = |in|^{-1}
976  *
977  * Based on Fermat's Little Theorem:
978  *   a^p = a (mod p)
979  *   a^{p-1} = 1 (mod p)
980  *   a^{p-2} = a^{-1} (mod p)
981  */
982 static void felem_inv(felem out, const felem in)
983 {
984     felem ftmp, ftmp2;
985     /* each e_I will hold |in|^{2^I - 1} */
986     felem e2, e4, e8, e16, e32, e64;
987     longfelem tmp;
988     unsigned i;
989 
990     felem_square(tmp, in);
991     felem_reduce(ftmp, tmp);    /* 2^1 */
992     felem_mul(tmp, in, ftmp);
993     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
994     felem_assign(e2, ftmp);
995     felem_square(tmp, ftmp);
996     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
997     felem_square(tmp, ftmp);
998     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
999     felem_mul(tmp, ftmp, e2);
1000     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1001     felem_assign(e4, ftmp);
1002     felem_square(tmp, ftmp);
1003     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1004     felem_square(tmp, ftmp);
1005     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1006     felem_square(tmp, ftmp);
1007     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1010     felem_mul(tmp, ftmp, e4);
1011     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1012     felem_assign(e8, ftmp);
1013     for (i = 0; i < 8; i++) {
1014         felem_square(tmp, ftmp);
1015         felem_reduce(ftmp, tmp);
1016     }                           /* 2^16 - 2^8 */
1017     felem_mul(tmp, ftmp, e8);
1018     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1019     felem_assign(e16, ftmp);
1020     for (i = 0; i < 16; i++) {
1021         felem_square(tmp, ftmp);
1022         felem_reduce(ftmp, tmp);
1023     }                           /* 2^32 - 2^16 */
1024     felem_mul(tmp, ftmp, e16);
1025     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1026     felem_assign(e32, ftmp);
1027     for (i = 0; i < 32; i++) {
1028         felem_square(tmp, ftmp);
1029         felem_reduce(ftmp, tmp);
1030     }                           /* 2^64 - 2^32 */
1031     felem_assign(e64, ftmp);
1032     felem_mul(tmp, ftmp, in);
1033     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1034     for (i = 0; i < 192; i++) {
1035         felem_square(tmp, ftmp);
1036         felem_reduce(ftmp, tmp);
1037     }                           /* 2^256 - 2^224 + 2^192 */
1038 
1039     felem_mul(tmp, e64, e32);
1040     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1041     for (i = 0; i < 16; i++) {
1042         felem_square(tmp, ftmp2);
1043         felem_reduce(ftmp2, tmp);
1044     }                           /* 2^80 - 2^16 */
1045     felem_mul(tmp, ftmp2, e16);
1046     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1047     for (i = 0; i < 8; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^88 - 2^8 */
1051     felem_mul(tmp, ftmp2, e8);
1052     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1053     for (i = 0; i < 4; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^92 - 2^4 */
1057     felem_mul(tmp, ftmp2, e4);
1058     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1059     felem_square(tmp, ftmp2);
1060     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1061     felem_square(tmp, ftmp2);
1062     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1063     felem_mul(tmp, ftmp2, e2);
1064     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1069     felem_mul(tmp, ftmp2, in);
1070     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1071 
1072     felem_mul(tmp, ftmp2, ftmp);
1073     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1074 }
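/*-
 * Editorial check of the addition chain above: the first chain leaves ftmp
 * holding |in| raised to (2^64 - 2^32 + 1) * 2^192 = 2^256 - 2^224 + 2^192
 * and the second leaves ftmp2 holding |in| raised to 2^96 - 3, so the final
 * product is |in| raised to
 *   2^256 - 2^224 + 2^192 + 2^96 - 3 = p - 2,
 * which by Fermat's little theorem is the inverse of |in| mod p.
 */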
1075 
1076 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1077 {
1078     felem tmp;
1079 
1080     smallfelem_expand(tmp, in);
1081     felem_inv(tmp, tmp);
1082     felem_contract(out, tmp);
1083 }
1084 
1085 /*-
1086  * Group operations
1087  * ----------------
1088  *
1089  * Building on top of the field operations we have the operations on the
1090  * elliptic curve group itself. Points on the curve are represented in Jacobian
1091  * coordinates
1092  */
1093 
1094 /*-
1095  * point_double calculates 2*(x_in, y_in, z_in)
1096  *
1097  * The method is taken from:
1098  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1099  *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (maybe this works, but it's not tested).
1102  */
1103 static void
1104 point_double(felem x_out, felem y_out, felem z_out,
1105              const felem x_in, const felem y_in, const felem z_in)
1106 {
1107     longfelem tmp, tmp2;
1108     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1109     smallfelem small1, small2;
1110 
1111     felem_assign(ftmp, x_in);
1112     /* ftmp[i] < 2^106 */
1113     felem_assign(ftmp2, x_in);
1114     /* ftmp2[i] < 2^106 */
1115 
1116     /* delta = z^2 */
1117     felem_square(tmp, z_in);
1118     felem_reduce(delta, tmp);
1119     /* delta[i] < 2^101 */
1120 
1121     /* gamma = y^2 */
1122     felem_square(tmp, y_in);
1123     felem_reduce(gamma, tmp);
1124     /* gamma[i] < 2^101 */
1125     felem_shrink(small1, gamma);
1126 
1127     /* beta = x*gamma */
1128     felem_small_mul(tmp, small1, x_in);
1129     felem_reduce(beta, tmp);
1130     /* beta[i] < 2^101 */
1131 
1132     /* alpha = 3*(x-delta)*(x+delta) */
1133     felem_diff(ftmp, delta);
1134     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1135     felem_sum(ftmp2, delta);
1136     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1137     felem_scalar(ftmp2, 3);
1138     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1139     felem_mul(tmp, ftmp, ftmp2);
1140     felem_reduce(alpha, tmp);
1141     /* alpha[i] < 2^101 */
1142     felem_shrink(small2, alpha);
1143 
1144     /* x' = alpha^2 - 8*beta */
1145     smallfelem_square(tmp, small2);
1146     felem_reduce(x_out, tmp);
1147     felem_assign(ftmp, beta);
1148     felem_scalar(ftmp, 8);
1149     /* ftmp[i] < 8 * 2^101 = 2^104 */
1150     felem_diff(x_out, ftmp);
1151     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1152 
1153     /* z' = (y + z)^2 - gamma - delta */
1154     felem_sum(delta, gamma);
1155     /* delta[i] < 2^101 + 2^101 = 2^102 */
1156     felem_assign(ftmp, y_in);
1157     felem_sum(ftmp, z_in);
1158     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1159     felem_square(tmp, ftmp);
1160     felem_reduce(z_out, tmp);
1161     felem_diff(z_out, delta);
1162     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1163 
1164     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1165     felem_scalar(beta, 4);
1166     /* beta[i] < 4 * 2^101 = 2^103 */
1167     felem_diff_zero107(beta, x_out);
1168     /* beta[i] < 2^107 + 2^103 < 2^108 */
1169     felem_small_mul(tmp, small2, beta);
1170     /* tmp[i] < 7 * 2^64 < 2^67 */
1171     smallfelem_square(tmp2, small1);
1172     /* tmp2[i] < 7 * 2^64 */
1173     longfelem_scalar(tmp2, 8);
1174     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1175     longfelem_diff(tmp, tmp2);
1176     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1177     felem_reduce_zero105(y_out, tmp);
1178     /* y_out[i] < 2^106 */
1179 }
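/*
 * Editorial note: counting the calls above, the doubling costs three field
 * multiplications and five squarings (plus additions and small scalar
 * multiples), which is the 3M + 5S operation count of the dbl-2001-b
 * formula cited in the header comment.
 */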
1180 
1181 /*
1182  * point_double_small is the same as point_double, except that it operates on
1183  * smallfelems
1184  */
1185 static void
1186 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1187                    const smallfelem x_in, const smallfelem y_in,
1188                    const smallfelem z_in)
1189 {
1190     felem felem_x_out, felem_y_out, felem_z_out;
1191     felem felem_x_in, felem_y_in, felem_z_in;
1192 
1193     smallfelem_expand(felem_x_in, x_in);
1194     smallfelem_expand(felem_y_in, y_in);
1195     smallfelem_expand(felem_z_in, z_in);
1196     point_double(felem_x_out, felem_y_out, felem_z_out,
1197                  felem_x_in, felem_y_in, felem_z_in);
1198     felem_shrink(x_out, felem_x_out);
1199     felem_shrink(y_out, felem_y_out);
1200     felem_shrink(z_out, felem_z_out);
1201 }
1202 
1203 /* copy_conditional copies in to out iff mask is all ones. */
1204 static void copy_conditional(felem out, const felem in, limb mask)
1205 {
1206     unsigned i;
1207     for (i = 0; i < NLIMBS; ++i) {
1208         const limb tmp = mask & (in[i] ^ out[i]);
1209         out[i] ^= tmp;
1210     }
1211 }
1212 
1213 /* copy_small_conditional copies in to out iff mask is all ones. */
1214 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1215 {
1216     unsigned i;
1217     const u64 mask64 = mask;
1218     for (i = 0; i < NLIMBS; ++i) {
1219         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1220     }
1221 }
1222 
1223 /*-
1224  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1225  *
1226  * The method is taken from:
1227  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1228  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1229  *
1230  * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). This case never
1232  * happens during single point multiplication, so there is no timing leak for
1233  * ECDH or ECDSA signing.
1234  */
1235 static void point_add(felem x3, felem y3, felem z3,
1236                       const felem x1, const felem y1, const felem z1,
1237                       const int mixed, const smallfelem x2,
1238                       const smallfelem y2, const smallfelem z2)
1239 {
1240     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1241     longfelem tmp, tmp2;
1242     smallfelem small1, small2, small3, small4, small5;
1243     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1244     limb points_equal;
1245 
1246     felem_shrink(small3, z1);
1247 
1248     z1_is_zero = smallfelem_is_zero(small3);
1249     z2_is_zero = smallfelem_is_zero(z2);
1250 
1251     /* ftmp = z1z1 = z1**2 */
1252     smallfelem_square(tmp, small3);
1253     felem_reduce(ftmp, tmp);
1254     /* ftmp[i] < 2^101 */
1255     felem_shrink(small1, ftmp);
1256 
1257     if (!mixed) {
1258         /* ftmp2 = z2z2 = z2**2 */
1259         smallfelem_square(tmp, z2);
1260         felem_reduce(ftmp2, tmp);
1261         /* ftmp2[i] < 2^101 */
1262         felem_shrink(small2, ftmp2);
1263 
1264         felem_shrink(small5, x1);
1265 
1266         /* u1 = ftmp3 = x1*z2z2 */
1267         smallfelem_mul(tmp, small5, small2);
1268         felem_reduce(ftmp3, tmp);
1269         /* ftmp3[i] < 2^101 */
1270 
1271         /* ftmp5 = z1 + z2 */
1272         felem_assign(ftmp5, z1);
1273         felem_small_sum(ftmp5, z2);
1274         /* ftmp5[i] < 2^107 */
1275 
1276         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1277         felem_square(tmp, ftmp5);
1278         felem_reduce(ftmp5, tmp);
1279         /* ftmp2 = z2z2 + z1z1 */
1280         felem_sum(ftmp2, ftmp);
1281         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1282         felem_diff(ftmp5, ftmp2);
1283         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1284 
1285         /* ftmp2 = z2 * z2z2 */
1286         smallfelem_mul(tmp, small2, z2);
1287         felem_reduce(ftmp2, tmp);
1288 
1289         /* s1 = ftmp2 = y1 * z2**3 */
1290         felem_mul(tmp, y1, ftmp2);
1291         felem_reduce(ftmp6, tmp);
1292         /* ftmp6[i] < 2^101 */
1293     } else {
1294         /*
1295          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1296          */
1297 
1298         /* u1 = ftmp3 = x1*z2z2 */
1299         felem_assign(ftmp3, x1);
1300         /* ftmp3[i] < 2^106 */
1301 
1302         /* ftmp5 = 2z1z2 */
1303         felem_assign(ftmp5, z1);
1304         felem_scalar(ftmp5, 2);
1305         /* ftmp5[i] < 2*2^106 = 2^107 */
1306 
1307         /* s1 = ftmp2 = y1 * z2**3 */
1308         felem_assign(ftmp6, y1);
1309         /* ftmp6[i] < 2^106 */
1310     }
1311 
1312     /* u2 = x2*z1z1 */
1313     smallfelem_mul(tmp, x2, small1);
1314     felem_reduce(ftmp4, tmp);
1315 
1316     /* h = ftmp4 = u2 - u1 */
1317     felem_diff_zero107(ftmp4, ftmp3);
1318     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1319     felem_shrink(small4, ftmp4);
1320 
1321     x_equal = smallfelem_is_zero(small4);
1322 
1323     /* z_out = ftmp5 * h */
1324     felem_small_mul(tmp, small4, ftmp5);
1325     felem_reduce(z_out, tmp);
1326     /* z_out[i] < 2^101 */
1327 
1328     /* ftmp = z1 * z1z1 */
1329     smallfelem_mul(tmp, small1, small3);
1330     felem_reduce(ftmp, tmp);
1331 
1332     /* s2 = tmp = y2 * z1**3 */
1333     felem_small_mul(tmp, y2, ftmp);
1334     felem_reduce(ftmp5, tmp);
1335 
1336     /* r = ftmp5 = (s2 - s1)*2 */
1337     felem_diff_zero107(ftmp5, ftmp6);
1338     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1339     felem_scalar(ftmp5, 2);
1340     /* ftmp5[i] < 2^109 */
1341     felem_shrink(small1, ftmp5);
1342     y_equal = smallfelem_is_zero(small1);
1343 
1344     /*
1345      * The formulae are incorrect if the points are equal, in affine coordinates
1346      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1347      * happens.
1348      *
1349      * We use bitwise operations to avoid potential side-channels introduced by
1350      * the short-circuiting behaviour of boolean operators.
1351      *
1352      * The special case of either point being the point at infinity (z1 and/or
1353      * z2 are zero), is handled separately later on in this function, so we
1354      * avoid jumping to point_double here in those special cases.
1355      */
1356     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1357 
1358     if (points_equal) {
1359         /*
1360          * This is obviously not constant-time but, as mentioned before, this
1361          * case never happens during single point multiplication, so there is no
1362          * timing leak for ECDH or ECDSA signing.
1363          */
1364         point_double(x3, y3, z3, x1, y1, z1);
1365         return;
1366     }
1367 
1368     /* I = ftmp = (2h)**2 */
1369     felem_assign(ftmp, ftmp4);
1370     felem_scalar(ftmp, 2);
1371     /* ftmp[i] < 2*2^108 = 2^109 */
1372     felem_square(tmp, ftmp);
1373     felem_reduce(ftmp, tmp);
1374 
1375     /* J = ftmp2 = h * I */
1376     felem_mul(tmp, ftmp4, ftmp);
1377     felem_reduce(ftmp2, tmp);
1378 
1379     /* V = ftmp4 = U1 * I */
1380     felem_mul(tmp, ftmp3, ftmp);
1381     felem_reduce(ftmp4, tmp);
1382 
1383     /* x_out = r**2 - J - 2V */
1384     smallfelem_square(tmp, small1);
1385     felem_reduce(x_out, tmp);
1386     felem_assign(ftmp3, ftmp4);
1387     felem_scalar(ftmp4, 2);
1388     felem_sum(ftmp4, ftmp2);
1389     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1390     felem_diff(x_out, ftmp4);
1391     /* x_out[i] < 2^105 + 2^101 */
1392 
1393     /* y_out = r(V-x_out) - 2 * s1 * J */
1394     felem_diff_zero107(ftmp3, x_out);
1395     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1396     felem_small_mul(tmp, small1, ftmp3);
1397     felem_mul(tmp2, ftmp6, ftmp2);
1398     longfelem_scalar(tmp2, 2);
1399     /* tmp2[i] < 2*2^67 = 2^68 */
1400     longfelem_diff(tmp, tmp2);
1401     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1402     felem_reduce_zero105(y_out, tmp);
1403     /* y_out[i] < 2^106 */
1404 
1405     copy_small_conditional(x_out, x2, z1_is_zero);
1406     copy_conditional(x_out, x1, z2_is_zero);
1407     copy_small_conditional(y_out, y2, z1_is_zero);
1408     copy_conditional(y_out, y1, z2_is_zero);
1409     copy_small_conditional(z_out, z2, z1_is_zero);
1410     copy_conditional(z_out, z1, z2_is_zero);
1411     felem_assign(x3, x_out);
1412     felem_assign(y3, y_out);
1413     felem_assign(z3, z_out);
1414 }
1415 
1416 /*
1417  * point_add_small is the same as point_add, except that it operates on
1418  * smallfelems
1419  */
1420 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1421                             smallfelem x1, smallfelem y1, smallfelem z1,
1422                             smallfelem x2, smallfelem y2, smallfelem z2)
1423 {
1424     felem felem_x3, felem_y3, felem_z3;
1425     felem felem_x1, felem_y1, felem_z1;
1426     smallfelem_expand(felem_x1, x1);
1427     smallfelem_expand(felem_y1, y1);
1428     smallfelem_expand(felem_z1, z1);
1429     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1430               x2, y2, z2);
1431     felem_shrink(x3, felem_x3);
1432     felem_shrink(y3, felem_y3);
1433     felem_shrink(z3, felem_z3);
1434 }
1435 
1436 /*-
1437  * Base point pre computation
1438  * --------------------------
1439  *
1440  * Two different sorts of precomputed tables are used in the following code.
1441  * Each contain various points on the curve, where each point is three field
1442  * elements (x, y, z).
1443  *
1444  * For the base point table, z is usually 1 (0 for the point at infinity).
1445  * This table has 2 * 16 elements, starting with the following:
1446  * index | bits    | point
1447  * ------+---------+------------------------------
1448  *     0 | 0 0 0 0 | 0G
1449  *     1 | 0 0 0 1 | 1G
1450  *     2 | 0 0 1 0 | 2^64G
1451  *     3 | 0 0 1 1 | (2^64 + 1)G
1452  *     4 | 0 1 0 0 | 2^128G
1453  *     5 | 0 1 0 1 | (2^128 + 1)G
1454  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1455  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1456  *     8 | 1 0 0 0 | 2^192G
1457  *     9 | 1 0 0 1 | (2^192 + 1)G
1458  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1459  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1460  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1461  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1462  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1463  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1464  * followed by a copy of this with each element multiplied by 2^32.
1465  *
1466  * The reason for this is so that we can clock bits into four different
1467  * locations when doing simple scalar multiplies against the base point,
1468  * and then another four locations using the second 16 elements.
1469  *
1470  * Tables for other points have table[i] = iG for i in 0 .. 16. */
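/*-
 * A worked example of how the two sub-tables are used (editorial note; see
 * batch_mul below): in round i of the final 32 rounds, the generator scalar
 * bits at positions i+192, i+128, i+64 and i form a 4-bit index into the
 * first sub-table, selecting
 *   (b_{i+192}*2^192 + b_{i+128}*2^128 + b_{i+64}*2^64 + b_i) * G,
 * while the bits at positions i+224, i+160, i+96 and i+32 index the second
 * sub-table, whose entries are the same points multiplied by 2^32. The i
 * doublings still to come scale both contributions by 2^i, so each of the
 * 256 scalar bits is accounted for exactly once.
 */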
1471 
1472 /* gmul is the table of precomputed base points */
1473 static const smallfelem gmul[2][16][3] = {
1474     {{{0, 0, 0, 0},
1475       {0, 0, 0, 0},
1476       {0, 0, 0, 0}},
1477      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1478        0x6b17d1f2e12c4247},
1479       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1480        0x4fe342e2fe1a7f9b},
1481       {1, 0, 0, 0}},
1482      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1483        0x0fa822bc2811aaa5},
1484       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1485        0xbff44ae8f5dba80d},
1486       {1, 0, 0, 0}},
1487      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1488        0x300a4bbc89d6726f},
1489       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1490        0x72aac7e0d09b4644},
1491       {1, 0, 0, 0}},
1492      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1493        0x447d739beedb5e67},
1494       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1495        0x2d4825ab834131ee},
1496       {1, 0, 0, 0}},
1497      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1498        0xef9519328a9c72ff},
1499       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1500        0x611e9fc37dbb2c9b},
1501       {1, 0, 0, 0}},
1502      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1503        0x550663797b51f5d8},
1504       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1505        0x157164848aecb851},
1506       {1, 0, 0, 0}},
1507      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1508        0xeb5d7745b21141ea},
1509       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1510        0xeafd72ebdbecc17b},
1511       {1, 0, 0, 0}},
1512      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1513        0xa6d39677a7849276},
1514       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1515        0x674f84749b0b8816},
1516       {1, 0, 0, 0}},
1517      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1518        0x4e769e7672c9ddad},
1519       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1520        0x42b99082de830663},
1521       {1, 0, 0, 0}},
1522      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1523        0x78878ef61c6ce04d},
1524       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1525        0xb6cb3f5d7b72c321},
1526       {1, 0, 0, 0}},
1527      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1528        0x0c88bc4d716b1287},
1529       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1530        0xdd5ddea3f3901dc6},
1531       {1, 0, 0, 0}},
1532      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1533        0x68f344af6b317466},
1534       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1535        0x31b9c405f8540a20},
1536       {1, 0, 0, 0}},
1537      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1538        0x4052bf4b6f461db9},
1539       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1540        0xfecf4d5190b0fc61},
1541       {1, 0, 0, 0}},
1542      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1543        0x1eddbae2c802e41a},
1544       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1545        0x43104d86560ebcfc},
1546       {1, 0, 0, 0}},
1547      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1548        0xb48e26b484f7a21c},
1549       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1550        0xfac015404d4d3dab},
1551       {1, 0, 0, 0}}},
1552     {{{0, 0, 0, 0},
1553       {0, 0, 0, 0},
1554       {0, 0, 0, 0}},
1555      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1556        0x7fe36b40af22af89},
1557       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1558        0xe697d45825b63624},
1559       {1, 0, 0, 0}},
1560      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1561        0x4a5b506612a677a6},
1562       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1563        0xeb13461ceac089f1},
1564       {1, 0, 0, 0}},
1565      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1566        0x0781b8291c6a220a},
1567       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1568        0x690cde8df0151593},
1569       {1, 0, 0, 0}},
1570      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1571        0x8a535f566ec73617},
1572       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1573        0x0455c08468b08bd7},
1574       {1, 0, 0, 0}},
1575      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1576        0x06bada7ab77f8276},
1577       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1578        0x5b476dfd0e6cb18a},
1579       {1, 0, 0, 0}},
1580      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1581        0x3e29864e8a2ec908},
1582       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1583        0x239b90ea3dc31e7e},
1584       {1, 0, 0, 0}},
1585      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1586        0x820f4dd949f72ff7},
1587       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1588        0x140406ec783a05ec},
1589       {1, 0, 0, 0}},
1590      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1591        0x68f6b8542783dfee},
1592       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1593        0xcbe1feba92e40ce6},
1594       {1, 0, 0, 0}},
1595      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1596        0xd0b2f94d2f420109},
1597       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1598        0x971459828b0719e5},
1599       {1, 0, 0, 0}},
1600      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1601        0x961610004a866aba},
1602       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1603        0x7acb9fadcee75e44},
1604       {1, 0, 0, 0}},
1605      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1606        0x24eb9acca333bf5b},
1607       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1608        0x69f891c5acd079cc},
1609       {1, 0, 0, 0}},
1610      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1611        0xe51f547c5972a107},
1612       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1613        0x1c309a2b25bb1387},
1614       {1, 0, 0, 0}},
1615      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1616        0x20b87b8aa2c4e503},
1617       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1618        0xf5c6fa49919776be},
1619       {1, 0, 0, 0}},
1620      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1621        0x1ed7d1b9332010b9},
1622       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1623        0x3a2b03f03217257a},
1624       {1, 0, 0, 0}},
1625      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1626        0x15fee545c78dd9f6},
1627       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1628        0x4ab5b6b2b8753f81},
1629       {1, 0, 0, 0}}}
1630 };
1631 
1632 /*
1633  * select_point selects the |idx|th point from a precomputation table and
1634  * copies it to out.
1635  */
1636 static void select_point(const u64 idx, unsigned int size,
1637                          const smallfelem pre_comp[16][3], smallfelem out[3])
1638 {
1639     unsigned i, j;
1640     u64 *outlimbs = &out[0][0];
1641 
1642     memset(out, 0, sizeof(*out) * 3);
1643 
1644     for (i = 0; i < size; i++) {
1645         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1646         u64 mask = i ^ idx;
1647         mask |= mask >> 4;
1648         mask |= mask >> 2;
1649         mask |= mask >> 1;
1650         mask &= 1;
1651         mask--;
1652         for (j = 0; j < NLIMBS * 3; j++)
1653             outlimbs[j] |= inlimbs[j] & mask;
1654     }
1655 }
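/*-
 * select_point reads every entry of the table, so the access pattern does
 * not depend on |idx|; the mask is built by OR-folding the low bits of
 * i ^ idx into bit 0 and decrementing, which yields all ones only for the
 * matching index. A standalone sketch of just the mask step (index_eq_mask
 * is a made-up name for this editorial illustration; it assumes both
 * arguments are below 32, as they are here):
 *
 *   static u64 index_eq_mask(u64 i, u64 idx)
 *   {
 *       u64 mask = i ^ idx;
 *
 *       mask |= mask >> 4;
 *       mask |= mask >> 2;
 *       mask |= mask >> 1;
 *       mask &= 1;
 *       return mask - 1;
 *   }
 */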
1656 
1657 /* get_bit returns the |i|th bit in |in| */
1658 static char get_bit(const felem_bytearray in, int i)
1659 {
1660     if ((i < 0) || (i >= 256))
1661         return 0;
1662     return (in[i >> 3] >> (i & 7)) & 1;
1663 }
1664 
1665 /*
1666  * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1668  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1669  * generator, using certain (large) precomputed multiples in g_pre_comp.
1670  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1671  */
1672 static void batch_mul(felem x_out, felem y_out, felem z_out,
1673                       const felem_bytearray scalars[],
1674                       const unsigned num_points, const u8 *g_scalar,
1675                       const int mixed, const smallfelem pre_comp[][17][3],
1676                       const smallfelem g_pre_comp[2][16][3])
1677 {
1678     int i, skip;
1679     unsigned num, gen_mul = (g_scalar != NULL);
1680     felem nq[3], ftmp;
1681     smallfelem tmp[3];
1682     u64 bits;
1683     u8 sign, digit;
1684 
1685     /* set nq to the point at infinity */
1686     memset(nq, 0, sizeof(nq));
1687 
    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (two in each of the last 32 rounds) and additions of
     * other points' multiples (every 5th round).
     */
1693     skip = 1;                   /* save two point operations in the first
1694                                  * round */
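    /*
     * When only the generator scalar is present, 32 iterations suffice,
     * since the comb below consumes eight bits of g_scalar per iteration;
     * otherwise every bit position of the other scalars must be visited.
     */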
1695     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1696         /* double */
1697         if (!skip)
1698             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1699 
1700         /* add multiples of the generator */
1701         if (gen_mul && (i <= 31)) {
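            /*
             * The generator scalar is consumed as a comb: bits i+32, i+96,
             * i+160 and i+224 form the index into g_pre_comp[1], and bits
             * i, i+64, i+128 and i+192 the index into g_pre_comp[0], so
             * each of these rounds adds two precomputed multiples.
             */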
1702             /* first, look 32 bits upwards */
1703             bits = get_bit(g_scalar, i + 224) << 3;
1704             bits |= get_bit(g_scalar, i + 160) << 2;
1705             bits |= get_bit(g_scalar, i + 96) << 1;
1706             bits |= get_bit(g_scalar, i + 32);
1707             /* select the point to add, in constant time */
1708             select_point(bits, 16, g_pre_comp[1], tmp);
1709 
1710             if (!skip) {
                /* the 1 below is the "mixed" flag: tmp is affine */
1712                 point_add(nq[0], nq[1], nq[2],
1713                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1714             } else {
1715                 smallfelem_expand(nq[0], tmp[0]);
1716                 smallfelem_expand(nq[1], tmp[1]);
1717                 smallfelem_expand(nq[2], tmp[2]);
1718                 skip = 0;
1719             }
1720 
1721             /* second, look at the current position */
1722             bits = get_bit(g_scalar, i + 192) << 3;
1723             bits |= get_bit(g_scalar, i + 128) << 2;
1724             bits |= get_bit(g_scalar, i + 64) << 1;
1725             bits |= get_bit(g_scalar, i);
1726             /* select the point to add, in constant time */
1727             select_point(bits, 16, g_pre_comp[0], tmp);
            /* the 1 below is the "mixed" flag: tmp is affine */
1729             point_add(nq[0], nq[1], nq[2],
1730                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1731         }
1732 
1733         /* do other additions every 5 doublings */
1734         if (num_points && (i % 5 == 0)) {
1735             /* loop over all scalars */
1736             for (num = 0; num < num_points; ++num) {
1737                 bits = get_bit(scalars[num], i + 4) << 5;
1738                 bits |= get_bit(scalars[num], i + 3) << 4;
1739                 bits |= get_bit(scalars[num], i + 2) << 3;
1740                 bits |= get_bit(scalars[num], i + 1) << 2;
1741                 bits |= get_bit(scalars[num], i) << 1;
1742                 bits |= get_bit(scalars[num], i - 1);
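                /*
                 * Recode the six bits at positions i-1 .. i+4 into a sign
                 * flag and a digit in [0, 16]; a set sign means the
                 * selected multiple is subtracted, implemented below by
                 * conditionally negating its Y coordinate.
                 */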
1743                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1744 
1745                 /*
1746                  * select the point to add or subtract, in constant time
1747                  */
1748                 select_point(digit, 17, pre_comp[num], tmp);
1749                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1750                                                * point */
1751                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1752                 felem_contract(tmp[1], ftmp);
1753 
1754                 if (!skip) {
1755                     point_add(nq[0], nq[1], nq[2],
1756                               nq[0], nq[1], nq[2],
1757                               mixed, tmp[0], tmp[1], tmp[2]);
1758                 } else {
1759                     smallfelem_expand(nq[0], tmp[0]);
1760                     smallfelem_expand(nq[1], tmp[1]);
1761                     smallfelem_expand(nq[2], tmp[2]);
1762                     skip = 0;
1763                 }
1764             }
1765         }
1766     }
1767     felem_assign(x_out, nq[0]);
1768     felem_assign(y_out, nq[1]);
1769     felem_assign(z_out, nq[2]);
1770 }
1771 
1772 /* Precomputation for the group generator. */
1773 struct nistp256_pre_comp_st {
1774     smallfelem g_pre_comp[2][16][3];
1775     CRYPTO_REF_COUNT references;
1776     CRYPTO_RWLOCK *lock;
1777 };
1778 
1779 const EC_METHOD *EC_GFp_nistp256_method(void)
1780 {
1781     static const EC_METHOD ret = {
1782         EC_FLAGS_DEFAULT_OCT,
1783         NID_X9_62_prime_field,
1784         ec_GFp_nistp256_group_init,
1785         ec_GFp_simple_group_finish,
1786         ec_GFp_simple_group_clear_finish,
1787         ec_GFp_nist_group_copy,
1788         ec_GFp_nistp256_group_set_curve,
1789         ec_GFp_simple_group_get_curve,
1790         ec_GFp_simple_group_get_degree,
1791         ec_group_simple_order_bits,
1792         ec_GFp_simple_group_check_discriminant,
1793         ec_GFp_simple_point_init,
1794         ec_GFp_simple_point_finish,
1795         ec_GFp_simple_point_clear_finish,
1796         ec_GFp_simple_point_copy,
1797         ec_GFp_simple_point_set_to_infinity,
1798         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1799         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1800         ec_GFp_simple_point_set_affine_coordinates,
1801         ec_GFp_nistp256_point_get_affine_coordinates,
1802         0 /* point_set_compressed_coordinates */ ,
1803         0 /* point2oct */ ,
1804         0 /* oct2point */ ,
1805         ec_GFp_simple_add,
1806         ec_GFp_simple_dbl,
1807         ec_GFp_simple_invert,
1808         ec_GFp_simple_is_at_infinity,
1809         ec_GFp_simple_is_on_curve,
1810         ec_GFp_simple_cmp,
1811         ec_GFp_simple_make_affine,
1812         ec_GFp_simple_points_make_affine,
1813         ec_GFp_nistp256_points_mul,
1814         ec_GFp_nistp256_precompute_mult,
1815         ec_GFp_nistp256_have_precompute_mult,
1816         ec_GFp_nist_field_mul,
1817         ec_GFp_nist_field_sqr,
1818         0 /* field_div */ ,
1819         ec_GFp_simple_field_inv,
1820         0 /* field_encode */ ,
1821         0 /* field_decode */ ,
1822         0,                      /* field_set_to_one */
1823         ec_key_simple_priv2oct,
1824         ec_key_simple_oct2priv,
1825         0, /* set private */
1826         ec_key_simple_generate_key,
1827         ec_key_simple_check_key,
1828         ec_key_simple_generate_public_key,
1829         0, /* keycopy */
1830         0, /* keyfinish */
1831         ecdh_simple_compute_key,
1832         0, /* field_inverse_mod_ord */
1833         0, /* blind_coordinates */
1834         0, /* ladder_pre */
1835         0, /* ladder_step */
1836         0  /* ladder_post */
1837     };
1838 
1839     return &ret;
1840 }
1841 
1842 /******************************************************************************/
1843 /*
1844  * FUNCTIONS TO MANAGE PRECOMPUTATION
1845  */
1846 
1847 static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1848 {
1849     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1850 
1851     if (ret == NULL) {
1852         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1853         return ret;
1854     }
1855 
1856     ret->references = 1;
1857 
1858     ret->lock = CRYPTO_THREAD_lock_new();
1859     if (ret->lock == NULL) {
1860         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1861         OPENSSL_free(ret);
1862         return NULL;
1863     }
1864     return ret;
1865 }
1866 
1867 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1868 {
1869     int i;
1870     if (p != NULL)
1871         CRYPTO_UP_REF(&p->references, &i, p->lock);
1872     return p;
1873 }
1874 
1875 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1876 {
1877     int i;
1878 
1879     if (pre == NULL)
1880         return;
1881 
1882     CRYPTO_DOWN_REF(&pre->references, &i, pre->lock);
    REF_PRINT_COUNT("EC_nistp256", pre);
1884     if (i > 0)
1885         return;
1886     REF_ASSERT_ISNT(i < 0);
1887 
1888     CRYPTO_THREAD_lock_free(pre->lock);
1889     OPENSSL_free(pre);
1890 }
1891 
1892 /******************************************************************************/
1893 /*
1894  * OPENSSL EC_METHOD FUNCTIONS
1895  */
1896 
1897 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1898 {
1899     int ret;
1900     ret = ec_GFp_simple_group_init(group);
1901     group->a_is_minus3 = 1;
1902     return ret;
1903 }
1904 
1905 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1906                                     const BIGNUM *a, const BIGNUM *b,
1907                                     BN_CTX *ctx)
1908 {
1909     int ret = 0;
1910     BN_CTX *new_ctx = NULL;
1911     BIGNUM *curve_p, *curve_a, *curve_b;
1912 
1913     if (ctx == NULL)
1914         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1915             return 0;
1916     BN_CTX_start(ctx);
1917     curve_p = BN_CTX_get(ctx);
1918     curve_a = BN_CTX_get(ctx);
1919     curve_b = BN_CTX_get(ctx);
1920     if (curve_b == NULL)
1921         goto err;
1922     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1923     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1924     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1925     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1926         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1927               EC_R_WRONG_CURVE_PARAMETERS);
1928         goto err;
1929     }
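    /* use the dedicated NIST P-256 reduction for arithmetic mod p */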
1930     group->field_mod_func = BN_nist_mod_256;
1931     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1932  err:
1933     BN_CTX_end(ctx);
1934     BN_CTX_free(new_ctx);
1935     return ret;
1936 }
1937 
1938 /*
1939  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1940  * (X/Z^2, Y/Z^3)
1941  */
1942 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1943                                                  const EC_POINT *point,
1944                                                  BIGNUM *x, BIGNUM *y,
1945                                                  BN_CTX *ctx)
1946 {
1947     felem z1, z2, x_in, y_in;
1948     smallfelem x_out, y_out;
1949     longfelem tmp;
1950 
1951     if (EC_POINT_is_at_infinity(group, point)) {
1952         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1953               EC_R_POINT_AT_INFINITY);
1954         return 0;
1955     }
1956     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1957         (!BN_to_felem(z1, point->Z)))
1958         return 0;
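    /*
     * Invert Z once, then derive Z^-2 (for X' = X/Z^2) and later Z^-3 (for
     * Y' = Y/Z^3) from that single inversion by squaring and multiplying.
     */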
1959     felem_inv(z2, z1);
1960     felem_square(tmp, z2);
1961     felem_reduce(z1, tmp);
1962     felem_mul(tmp, x_in, z1);
1963     felem_reduce(x_in, tmp);
1964     felem_contract(x_out, x_in);
1965     if (x != NULL) {
1966         if (!smallfelem_to_BN(x, x_out)) {
1967             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1968                   ERR_R_BN_LIB);
1969             return 0;
1970         }
1971     }
1972     felem_mul(tmp, z1, z2);
1973     felem_reduce(z1, tmp);
1974     felem_mul(tmp, y_in, z1);
1975     felem_reduce(y_in, tmp);
1976     felem_contract(y_out, y_in);
1977     if (y != NULL) {
1978         if (!smallfelem_to_BN(y, y_out)) {
1979             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1980                   ERR_R_BN_LIB);
1981             return 0;
1982         }
1983     }
1984     return 1;
1985 }
1986 
/*
 * |points| below is an array of size |num|, and |tmp_smallfelems| is of
 * size |num+1|
 */
1988 static void make_points_affine(size_t num, smallfelem points[][3],
1989                                smallfelem tmp_smallfelems[])
1990 {
1991     /*
1992      * Runs in constant time, unless an input is the point at infinity (which
1993      * normally shouldn't happen).
1994      */
1995     ec_GFp_nistp_points_make_affine_internal(num,
1996                                              points,
1997                                              sizeof(smallfelem),
1998                                              tmp_smallfelems,
1999                                              (void (*)(void *))smallfelem_one,
2000                                              smallfelem_is_zero_int,
2001                                              (void (*)(void *, const void *))
2002                                              smallfelem_assign,
2003                                              (void (*)(void *, const void *))
2004                                              smallfelem_square_contract,
2005                                              (void (*)
2006                                               (void *, const void *,
2007                                                const void *))
2008                                              smallfelem_mul_contract,
2009                                              (void (*)(void *, const void *))
2010                                              smallfelem_inv_contract,
2011                                              /* nothing to contract */
2012                                              (void (*)(void *, const void *))
2013                                              smallfelem_assign);
2014 }
2015 
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values. The result is stored in r (r can equal one of the inputs).
 */
2020 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2021                                const BIGNUM *scalar, size_t num,
2022                                const EC_POINT *points[],
2023                                const BIGNUM *scalars[], BN_CTX *ctx)
2024 {
2025     int ret = 0;
2026     int j;
2027     int mixed = 0;
2028     BIGNUM *x, *y, *z, *tmp_scalar;
2029     felem_bytearray g_secret;
2030     felem_bytearray *secrets = NULL;
2031     smallfelem (*pre_comp)[17][3] = NULL;
2032     smallfelem *tmp_smallfelems = NULL;
2033     unsigned i;
2034     int num_bytes;
2035     int have_pre_comp = 0;
2036     size_t num_points = num;
2037     smallfelem x_in, y_in, z_in;
2038     felem x_out, y_out, z_out;
2039     NISTP256_PRE_COMP *pre = NULL;
2040     const smallfelem(*g_pre_comp)[16][3] = NULL;
2041     EC_POINT *generator = NULL;
2042     const EC_POINT *p = NULL;
2043     const BIGNUM *p_scalar = NULL;
2044 
2045     BN_CTX_start(ctx);
2046     x = BN_CTX_get(ctx);
2047     y = BN_CTX_get(ctx);
2048     z = BN_CTX_get(ctx);
2049     tmp_scalar = BN_CTX_get(ctx);
2050     if (tmp_scalar == NULL)
2051         goto err;
2052 
2053     if (scalar != NULL) {
2054         pre = group->pre_comp.nistp256;
2055         if (pre)
2056             /* we have precomputation, try to use it */
2057             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2058         else
2059             /* try to use the standard precomputation */
2060             g_pre_comp = &gmul[0];
2061         generator = EC_POINT_new(group);
2062         if (generator == NULL)
2063             goto err;
2064         /* get the generator from precomputation */
2065         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2066             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2067             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2068             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2069             goto err;
2070         }
2071         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2072                                                       generator, x, y, z,
2073                                                       ctx))
2074             goto err;
2075         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2076             /* precomputation matches generator */
2077             have_pre_comp = 1;
2078         else
2079             /*
2080              * we don't have valid precomputation: treat the generator as a
2081              * random point
2082              */
2083             num_points++;
2084     }
2085     if (num_points > 0) {
2086         if (num_points >= 3) {
2087             /*
2088              * unless we precompute multiples for just one or two points,
2089              * converting those into affine form is time well spent
2090              */
2091             mixed = 1;
2092         }
2093         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2094         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2095         if (mixed)
2096             tmp_smallfelems =
2097               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2098         if ((secrets == NULL) || (pre_comp == NULL)
2099             || (mixed && (tmp_smallfelems == NULL))) {
2100             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2101             goto err;
2102         }
2103 
2104         /*
2105          * we treat NULL scalars as 0, and NULL points as points at infinity,
2106          * i.e., they contribute nothing to the linear combination
2107          */
2108         memset(secrets, 0, sizeof(*secrets) * num_points);
2109         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2110         for (i = 0; i < num_points; ++i) {
2111             if (i == num) {
2112                 /*
2113                  * we didn't have a valid precomputation, so we pick the
2114                  * generator
2115                  */
2116                 p = EC_GROUP_get0_generator(group);
2117                 p_scalar = scalar;
2118             } else {
2119                 /* the i^th point */
2120                 p = points[i];
2121                 p_scalar = scalars[i];
2122             }
2123             if ((p_scalar != NULL) && (p != NULL)) {
2124                 /* reduce scalar to 0 <= scalar < 2^256 */
2125                 if ((BN_num_bits(p_scalar) > 256)
2126                     || (BN_is_negative(p_scalar))) {
2127                     /*
2128                      * this is an unusual input, and we don't guarantee
2129                      * constant-timeness
2130                      */
2131                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2132                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2133                         goto err;
2134                     }
2135                     num_bytes = BN_bn2lebinpad(tmp_scalar,
2136                                                secrets[i], sizeof(secrets[i]));
2137                 } else {
2138                     num_bytes = BN_bn2lebinpad(p_scalar,
2139                                                secrets[i], sizeof(secrets[i]));
2140                 }
2141                 if (num_bytes < 0) {
2142                     ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2143                     goto err;
2144                 }
2145                 /* precompute multiples */
2146                 if ((!BN_to_felem(x_out, p->X)) ||
2147                     (!BN_to_felem(y_out, p->Y)) ||
2148                     (!BN_to_felem(z_out, p->Z)))
2149                     goto err;
2150                 felem_shrink(pre_comp[i][1][0], x_out);
2151                 felem_shrink(pre_comp[i][1][1], y_out);
2152                 felem_shrink(pre_comp[i][1][2], z_out);
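                /*
                 * Fill in the multiples 2*P .. 16*P: an even entry 2k*P is
                 * obtained by doubling k*P, an odd entry (2k+1)*P by adding
                 * 1*P to 2k*P.
                 */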
2153                 for (j = 2; j <= 16; ++j) {
2154                     if (j & 1) {
2155                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2156                                         pre_comp[i][j][2], pre_comp[i][1][0],
2157                                         pre_comp[i][1][1], pre_comp[i][1][2],
2158                                         pre_comp[i][j - 1][0],
2159                                         pre_comp[i][j - 1][1],
2160                                         pre_comp[i][j - 1][2]);
2161                     } else {
2162                         point_double_small(pre_comp[i][j][0],
2163                                            pre_comp[i][j][1],
2164                                            pre_comp[i][j][2],
2165                                            pre_comp[i][j / 2][0],
2166                                            pre_comp[i][j / 2][1],
2167                                            pre_comp[i][j / 2][2]);
2168                     }
2169                 }
2170             }
2171         }
2172         if (mixed)
2173             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2174     }
2175 
2176     /* the scalar for the generator */
2177     if ((scalar != NULL) && (have_pre_comp)) {
2178         memset(g_secret, 0, sizeof(g_secret));
2179         /* reduce scalar to 0 <= scalar < 2^256 */
2180         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2181             /*
2182              * this is an unusual input, and we don't guarantee
2183              * constant-timeness
2184              */
2185             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2186                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2187                 goto err;
2188             }
2189             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2190         } else {
2191             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2192         }
2193         /* do the multiplication with generator precomputation */
2194         batch_mul(x_out, y_out, z_out,
2195                   (const felem_bytearray(*))secrets, num_points,
2196                   g_secret,
2197                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2198     } else {
2199         /* do the multiplication without generator precomputation */
2200         batch_mul(x_out, y_out, z_out,
2201                   (const felem_bytearray(*))secrets, num_points,
2202                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2203     }
2204     /* reduce the output to its unique minimal representation */
2205     felem_contract(x_in, x_out);
2206     felem_contract(y_in, y_out);
2207     felem_contract(z_in, z_out);
2208     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2209         (!smallfelem_to_BN(z, z_in))) {
2210         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2211         goto err;
2212     }
2213     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2214 
2215  err:
2216     BN_CTX_end(ctx);
2217     EC_POINT_free(generator);
2218     OPENSSL_free(secrets);
2219     OPENSSL_free(pre_comp);
2220     OPENSSL_free(tmp_smallfelems);
2221     return ret;
2222 }
2223 
2224 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2225 {
2226     int ret = 0;
2227     NISTP256_PRE_COMP *pre = NULL;
2228     int i, j;
2229     BN_CTX *new_ctx = NULL;
2230     BIGNUM *x, *y;
2231     EC_POINT *generator = NULL;
2232     smallfelem tmp_smallfelems[32];
2233     felem x_tmp, y_tmp, z_tmp;
2234 
2235     /* throw away old precomputation */
2236     EC_pre_comp_free(group);
2237     if (ctx == NULL)
2238         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2239             return 0;
2240     BN_CTX_start(ctx);
2241     x = BN_CTX_get(ctx);
2242     y = BN_CTX_get(ctx);
2243     if (y == NULL)
2244         goto err;
2245     /* get the generator */
2246     if (group->generator == NULL)
2247         goto err;
2248     generator = EC_POINT_new(group);
2249     if (generator == NULL)
2250         goto err;
2251     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2252     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2253     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2254         goto err;
2255     if ((pre = nistp256_pre_comp_new()) == NULL)
2256         goto err;
2257     /*
2258      * if the generator is the standard one, use built-in precomputation
2259      */
2260     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2261         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2262         goto done;
2263     }
2264     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2265         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2266         (!BN_to_felem(z_tmp, group->generator->Z)))
2267         goto err;
2268     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2269     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2270     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
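    /*
     * Table layout: bit t of index k selects the multiple 2^(64*t)*G in
     * g_pre_comp[0][k] and 2^(32+64*t)*G in g_pre_comp[1][k]; entry 0 of
     * each table is the point at infinity.
     */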
2271     /*
2272      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2273      * 2^160*G, 2^224*G for the second one
2274      */
2275     for (i = 1; i <= 8; i <<= 1) {
2276         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2277                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2278                            pre->g_pre_comp[0][i][1],
2279                            pre->g_pre_comp[0][i][2]);
2280         for (j = 0; j < 31; ++j) {
2281             point_double_small(pre->g_pre_comp[1][i][0],
2282                                pre->g_pre_comp[1][i][1],
2283                                pre->g_pre_comp[1][i][2],
2284                                pre->g_pre_comp[1][i][0],
2285                                pre->g_pre_comp[1][i][1],
2286                                pre->g_pre_comp[1][i][2]);
2287         }
2288         if (i == 8)
2289             break;
2290         point_double_small(pre->g_pre_comp[0][2 * i][0],
2291                            pre->g_pre_comp[0][2 * i][1],
2292                            pre->g_pre_comp[0][2 * i][2],
2293                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2294                            pre->g_pre_comp[1][i][2]);
2295         for (j = 0; j < 31; ++j) {
2296             point_double_small(pre->g_pre_comp[0][2 * i][0],
2297                                pre->g_pre_comp[0][2 * i][1],
2298                                pre->g_pre_comp[0][2 * i][2],
2299                                pre->g_pre_comp[0][2 * i][0],
2300                                pre->g_pre_comp[0][2 * i][1],
2301                                pre->g_pre_comp[0][2 * i][2]);
2302         }
2303     }
2304     for (i = 0; i < 2; i++) {
2305         /* g_pre_comp[i][0] is the point at infinity */
2306         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2307         /* the remaining multiples */
2308         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2309         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2310                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2311                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2312                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2313                         pre->g_pre_comp[i][2][2]);
2314         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2315         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2316                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2317                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2318                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2319                         pre->g_pre_comp[i][2][2]);
2320         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2321         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2322                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2323                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2324                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2325                         pre->g_pre_comp[i][4][2]);
2326         /*
2327          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2328          */
2329         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2330                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2331                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2332                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2333                         pre->g_pre_comp[i][2][2]);
2334         for (j = 1; j < 8; ++j) {
2335             /* odd multiples: add G resp. 2^32*G */
2336             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2337                             pre->g_pre_comp[i][2 * j + 1][1],
2338                             pre->g_pre_comp[i][2 * j + 1][2],
2339                             pre->g_pre_comp[i][2 * j][0],
2340                             pre->g_pre_comp[i][2 * j][1],
2341                             pre->g_pre_comp[i][2 * j][2],
2342                             pre->g_pre_comp[i][1][0],
2343                             pre->g_pre_comp[i][1][1],
2344                             pre->g_pre_comp[i][1][2]);
2345         }
2346     }
2347     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2348 
2349  done:
2350     SETPRECOMP(group, nistp256, pre);
2351     pre = NULL;
2352     ret = 1;
2353 
2354  err:
2355     BN_CTX_end(ctx);
2356     EC_POINT_free(generator);
2357     BN_CTX_free(new_ctx);
2358     EC_nistp256_pre_comp_free(pre);
2359     return ret;
2360 }
2361 
2362 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2363 {
2364     return HAVEPRECOMP(group, nistp256);
2365 }
2366 #endif
2367