1 /*
2  * Copyright 2013 The Android Open Source Project
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *     * Redistributions of source code must retain the above copyright
7  *       notice, this list of conditions and the following disclaimer.
8  *     * Redistributions in binary form must reproduce the above copyright
9  *       notice, this list of conditions and the following disclaimer in the
10  *       documentation and/or other materials provided with the distribution.
11  *     * Neither the name of Google Inc. nor the names of its contributors may
12  *       be used to endorse or promote products derived from this software
13  *       without specific prior written permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
17  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
18  * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
24  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 // This is an implementation of the P256 finite field. It's written to be
28 // portable and still constant-time.
29 //
30 // WARNING: Implementing these functions in a constant-time manner is far from
31 //          obvious. Be careful when touching this code.
32 //
33 // See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
34 
35 #include <stdint.h>
36 #include <stdio.h>
37 
38 #include <string.h>
39 #include <stdlib.h>
40 
41 #include "p256/p256.h"
42 
43 typedef uint8_t u8;
44 typedef uint32_t u32;
45 typedef int32_t s32;
46 typedef uint64_t u64;
47 
48 /* Our field elements are represented as nine 32-bit limbs.
49  *
50  * The value of an felem (field element) is:
51  *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
52  *
53  * That is, each limb is alternately 29 or 28-bits wide in little-endian
54  * order.
55  *
56  * This means that an felem hits 2**257, rather than 2**256 as we would like. A
57  * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
58  * when multiplying as terms end up one bit short of a limb which would require
59  * much bit-shifting to correct.
60  *
61  * Finally, the values stored in an felem are in Montgomery form. So the value
62  * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
63  */
64 typedef u32 limb;
65 #define NLIMBS 9
66 typedef limb felem[NLIMBS];
67 
68 static const limb kBottom28Bits = 0xfffffff;
69 static const limb kBottom29Bits = 0x1fffffff;
70 
71 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
72  * 28-bit words. */
73 static const felem kOne = {
74     2, 0, 0, 0xffff800,
75     0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
76     0
77 };
78 static const felem kZero = {0};
79 static const felem kP = {
80     0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
81     0, 0, 0x200000, 0xf000000,
82     0xfffffff
83 };
84 static const felem k2P = {
85     0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
86     0, 0, 0x400000, 0xe000000,
87     0x1fffffff
88 };
89 /* kPrecomputed contains precomputed values to aid the calculation of scalar
90  * multiples of the base point, G. It's actually two, equal length, tables
91  * concatenated.
92  *
93  * The first table contains (x,y) felem pairs for 16 multiples of the base
94  * point, G.
95  *
96  *   Index  |  Index (binary) | Value
97  *       0  |           0000  | 0G (all zeros, omitted)
98  *       1  |           0001  | G
99  *       2  |           0010  | 2**64G
100  *       3  |           0011  | 2**64G + G
101  *       4  |           0100  | 2**128G
102  *       5  |           0101  | 2**128G + G
103  *       6  |           0110  | 2**128G + 2**64G
104  *       7  |           0111  | 2**128G + 2**64G + G
105  *       8  |           1000  | 2**192G
106  *       9  |           1001  | 2**192G + G
107  *      10  |           1010  | 2**192G + 2**64G
108  *      11  |           1011  | 2**192G + 2**64G + G
109  *      12  |           1100  | 2**192G + 2**128G
110  *      13  |           1101  | 2**192G + 2**128G + G
111  *      14  |           1110  | 2**192G + 2**128G + 2**64G
112  *      15  |           1111  | 2**192G + 2**128G + 2**64G + G
113  *
114  * The second table follows the same style, but the terms are 2**32G,
115  * 2**96G, 2**160G, 2**224G.
116  *
117  * This is ~2KB of data. */
118 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
119     0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
120     0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
121     0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
122     0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
123     0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
124     0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
125     0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
126     0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
127     0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
128     0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
129     0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
130     0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
131     0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
132     0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
133     0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
134     0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
135     0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
136     0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
137     0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
138     0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
139     0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
140     0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
141     0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
142     0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
143     0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
144     0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
145     0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
146     0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
147     0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
148     0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
149     0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
150     0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
151     0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
152     0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
153     0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
154     0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
155     0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
156     0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
157     0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
158     0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
159     0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
160     0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
161     0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
162     0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
163     0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
164     0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
165     0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
166     0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
167     0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
168     0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
169     0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
170     0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
171     0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
172     0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
173     0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
174     0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
175     0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
176     0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
177     0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
178     0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
179 };
180 
181 
182 /* Field element operations: */
183 
184 /* NON_ZERO_TO_ALL_ONES returns:
185  *   0xffffffff for 0 < x <= 2**31
186  *   0 for x == 0 or x > 2**31.
187  *
188  * x must be a u32 or an equivalent type such as limb. */
189 #define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
190 
191 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
192  * which is a term at 2**257.
193  *
194  * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
195  * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
felem_reduce_carry(felem inout,limb carry)196 static void felem_reduce_carry(felem inout, limb carry) {
197   const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
198 
199   inout[0] += carry << 1;
200   inout[3] += 0x10000000 & carry_mask;
201   /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
202    * previous line therefore this doesn't underflow. */
203   inout[3] -= carry << 11;
204   inout[4] += (0x20000000 - 1) & carry_mask;
205   inout[5] += (0x10000000 - 1) & carry_mask;
206   inout[6] += (0x20000000 - 1) & carry_mask;
207   inout[6] -= carry << 22;
208   /* This may underflow if carry is non-zero but, if so, we'll fix it in the
209    * next line. */
210   inout[7] -= 1 & carry_mask;
211   inout[7] += carry << 25;
212 }
213 
214 /* felem_sum sets out = in+in2.
215  *
216  * On entry, in[i]+in2[i] must not overflow a 32-bit word.
217  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
felem_sum(felem out,const felem in,const felem in2)218 static void felem_sum(felem out, const felem in, const felem in2) {
219   limb carry = 0;
220   unsigned i;
221 
222   for (i = 0;; i++) {
223     out[i] = in[i] + in2[i];
224     out[i] += carry;
225     carry = out[i] >> 29;
226     out[i] &= kBottom29Bits;
227 
228     i++;
229     if (i == NLIMBS)
230       break;
231 
232     out[i] = in[i] + in2[i];
233     out[i] += carry;
234     carry = out[i] >> 28;
235     out[i] &= kBottom28Bits;
236   }
237 
238   felem_reduce_carry(out, carry);
239 }
240 
241 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
242 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
243 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
244 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
245 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
246 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
247 
248 /* zero31 is 0 mod p. */
249 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
250 
251 /* felem_diff sets out = in-in2.
252  *
253  * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
254  *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
255  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_diff(felem out,const felem in,const felem in2)256 static void felem_diff(felem out, const felem in, const felem in2) {
257   limb carry = 0;
258   unsigned i;
259 
260    for (i = 0;; i++) {
261     out[i] = in[i] - in2[i];
262     out[i] += zero31[i];
263     out[i] += carry;
264     carry = out[i] >> 29;
265     out[i] &= kBottom29Bits;
266 
267     i++;
268     if (i == NLIMBS)
269       break;
270 
271     out[i] = in[i] - in2[i];
272     out[i] += zero31[i];
273     out[i] += carry;
274     carry = out[i] >> 28;
275     out[i] &= kBottom28Bits;
276   }
277 
278   felem_reduce_carry(out, carry);
279 }
280 
281 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
282  * with the same 29,28,... bit positions as an felem.
283  *
284  * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
285  * Since we just multiplied two Montgomery values together, the result is
286  * x*y*R*R mod p. We wish to divide by R in order for the result also to be
287  * in Montgomery form.
288  *
289  * On entry: tmp[i] < 2**64
290  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
felem_reduce_degree(felem out,u64 tmp[17])291 static void felem_reduce_degree(felem out, u64 tmp[17]) {
292    /* The following table may be helpful when reading this code:
293     *
294     * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
295     * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
296     * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
297     *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
298   limb tmp2[18], carry, x, xMask;
299   unsigned i;
300 
301   /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
302    * felem. So the top of an element of tmp might overlap with another
303    * element two positions down. The following loop eliminates this
304    * overlap. */
305   tmp2[0] = (limb)(tmp[0] & kBottom29Bits);
306 
307   /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
308    * and hint to the compiler that it can do a single-word shift by selecting
309    * the right register rather than doing a double-word shift and truncating
310    * afterwards. */
311   tmp2[1] = ((limb) tmp[0]) >> 29;
312   tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
313   tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
314   carry = tmp2[1] >> 28;
315   tmp2[1] &= kBottom28Bits;
316 
317   for (i = 2; i < 17; i++) {
318     tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
319     tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
320     tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
321     tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
322     tmp2[i] += carry;
323     carry = tmp2[i] >> 29;
324     tmp2[i] &= kBottom29Bits;
325 
326     i++;
327     if (i == 17)
328       break;
329     tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
330     tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
331     tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
332     tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
333     tmp2[i] += carry;
334     carry = tmp2[i] >> 28;
335     tmp2[i] &= kBottom28Bits;
336   }
337 
338   tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
339   tmp2[17] += ((limb)(tmp[16])) >> 29;
340   tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
341   tmp2[17] += carry;
342 
343   /* Montgomery elimination of terms.
344    *
345    * Since R is 2**257, we can divide by R with a bitwise shift if we can
346    * ensure that the right-most 257 bits are all zero. We can make that true by
347    * adding multiplies of p without affecting the value.
348    *
349    * So we eliminate limbs from right to left. Since the bottom 29 bits of p
350    * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
351    * We can do that for 8 further limbs and then right shift to eliminate the
352    * extra factor of R. */
353   for (i = 0;; i += 2) {
354     tmp2[i + 1] += tmp2[i] >> 29;
355     x = tmp2[i] & kBottom29Bits;
356     xMask = NON_ZERO_TO_ALL_ONES(x);
357     tmp2[i] = 0;
358 
359     /* The bounds calculations for this loop are tricky. Each iteration of
360      * the loop eliminates two words by adding values to words to their
361      * right.
362      *
363      * The following table contains the amounts added to each word (as an
364      * offset from the value of i at the top of the loop). The amounts are
365      * accounted for from the first and second half of the loop separately
366      * and are written as, for example, 28 to mean a value <2**28.
367      *
368      * Word:                   3   4   5   6   7   8   9   10
369      * Added in top half:     28  11      29  21  29  28
370      *                                        28  29
371      *                                            29
372      * Added in bottom half:      29  10      28  21  28   28
373      *                                            29
374      *
375      * The value that is currently offset 7 will be offset 5 for the next
376      * iteration and then offset 3 for the iteration after that. Therefore
377      * the total value added will be the values added at 7, 5 and 3.
378      *
379      * The following table accumulates these values. The sums at the bottom
380      * are written as, for example, 29+28, to mean a value < 2**29+2**28.
381      *
382      * Word:                   3   4   5   6   7   8   9  10  11  12  13
383      *                        28  11  10  29  21  29  28  28  28  28  28
384      *                            29  28  11  28  29  28  29  28  29  28
385      *                                    29  28  21  21  29  21  29  21
386      *                                        10  29  28  21  28  21  28
387      *                                        28  29  28  29  28  29  28
388      *                                            11  10  29  10  29  10
389      *                                            29  28  11  28  11
390      *                                                    29      29
391      *                        --------------------------------------------
392      *                                                30+ 31+ 30+ 31+ 30+
393      *                                                28+ 29+ 28+ 29+ 21+
394      *                                                21+ 28+ 21+ 28+ 10
395      *                                                10  21+ 10  21+
396      *                                                    11      11
397      *
398      * So the greatest amount is added to tmp2[10] and tmp2[12]. If
399      * tmp2[10/12] has an initial value of <2**29, then the maximum value
400      * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
401      * as required. */
402     tmp2[i + 3] += (x << 10) & kBottom28Bits;
403     tmp2[i + 4] += (x >> 18);
404 
405     tmp2[i + 6] += (x << 21) & kBottom29Bits;
406     tmp2[i + 7] += x >> 8;
407 
408     /* At position 200, which is the starting bit position for word 7, we
409      * have a factor of 0xf000000 = 2**28 - 2**24. */
410     tmp2[i + 7] += 0x10000000 & xMask;
411     /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
412     tmp2[i + 8] += (x - 1) & xMask;
413     tmp2[i + 7] -= (x << 24) & kBottom28Bits;
414     tmp2[i + 8] -= x >> 4;
415 
416     tmp2[i + 8] += 0x20000000 & xMask;
417     tmp2[i + 8] -= x;
418     tmp2[i + 8] += (x << 28) & kBottom29Bits;
419     tmp2[i + 9] += ((x >> 1) - 1) & xMask;
420 
421     if (i+1 == NLIMBS)
422       break;
423     tmp2[i + 2] += tmp2[i + 1] >> 28;
424     x = tmp2[i + 1] & kBottom28Bits;
425     xMask = NON_ZERO_TO_ALL_ONES(x);
426     tmp2[i + 1] = 0;
427 
428     tmp2[i + 4] += (x << 11) & kBottom29Bits;
429     tmp2[i + 5] += (x >> 18);
430 
431     tmp2[i + 7] += (x << 21) & kBottom28Bits;
432     tmp2[i + 8] += x >> 7;
433 
434     /* At position 199, which is the starting bit of the 8th word when
435      * dealing with a context starting on an odd word, we have a factor of
436      * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
437      * word from i+1 is i+8. */
438     tmp2[i + 8] += 0x20000000 & xMask;
439     tmp2[i + 9] += (x - 1) & xMask;
440     tmp2[i + 8] -= (x << 25) & kBottom29Bits;
441     tmp2[i + 9] -= x >> 4;
442 
443     tmp2[i + 9] += 0x10000000 & xMask;
444     tmp2[i + 9] -= x;
445     tmp2[i + 10] += (x - 1) & xMask;
446   }
447 
448   /* We merge the right shift with a carry chain. The words above 2**257 have
449    * widths of 28,29,... which we need to correct when copying them down.  */
450   carry = 0;
451   for (i = 0; i < 8; i++) {
452     /* The maximum value of tmp2[i + 9] occurs on the first iteration and
453      * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
454      * therefore safe. */
455     out[i] = tmp2[i + 9];
456     out[i] += carry;
457     out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
458     carry = out[i] >> 29;
459     out[i] &= kBottom29Bits;
460 
461     i++;
462     out[i] = tmp2[i + 9] >> 1;
463     out[i] += carry;
464     carry = out[i] >> 28;
465     out[i] &= kBottom28Bits;
466   }
467 
468   out[8] = tmp2[17];
469   out[8] += carry;
470   carry = out[8] >> 29;
471   out[8] &= kBottom29Bits;
472 
473   felem_reduce_carry(out, carry);
474 }
475 
476 /* felem_square sets out=in*in.
477  *
478  * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
479  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_square(felem out,const felem in)480 static void felem_square(felem out, const felem in) {
481   u64 tmp[17];
482 
483   tmp[0] = ((u64) in[0]) * in[0];
484   tmp[1] = ((u64) in[0]) * (in[1] << 1);
485   tmp[2] = ((u64) in[0]) * (in[2] << 1) +
486            ((u64) in[1]) * (in[1] << 1);
487   tmp[3] = ((u64) in[0]) * (in[3] << 1) +
488            ((u64) in[1]) * (in[2] << 1);
489   tmp[4] = ((u64) in[0]) * (in[4] << 1) +
490            ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
491   tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
492            (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
493   tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
494            (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
495            ((u64) in[3]) * (in[3] << 1);
496   tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
497            (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
498            ((u64) in[3]) * (in[4] << 1);
499   /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
500    * which is < 2**64 as required. */
501   tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
502            (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
503            ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
504   tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
505            (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
506            ((u64) in[4]) * (in[5] << 1);
507   tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
508             (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
509             ((u64) in[5]) * (in[5] << 1);
510   tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
511             (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
512   tmp[12] = ((u64) in[4]) * (in[8] << 1) +
513             ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
514   tmp[13] = ((u64) in[5]) * (in[8] << 1) +
515             ((u64) in[6]) * (in[7] << 1);
516   tmp[14] = ((u64) in[6]) * (in[8] << 1) +
517             ((u64) in[7]) * (in[7] << 1);
518   tmp[15] = ((u64) in[7]) * (in[8] << 1);
519   tmp[16] = ((u64) in[8]) * in[8];
520 
521   felem_reduce_degree(out, tmp);
522 }
523 
524 /* felem_mul sets out=in*in2.
525  *
526  * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
527  *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
528  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_mul(felem out,const felem in,const felem in2)529 static void felem_mul(felem out, const felem in, const felem in2) {
530   u64 tmp[17];
531 
532   tmp[0] = ((u64) in[0]) * in2[0];
533   tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
534            ((u64) in[1]) * (in2[0] << 0);
535   tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
536            (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
537   tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
538            (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
539            ((u64) in[3]) * (in2[0] << 0);
540   tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
541            (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
542            ((u64) in[3]) * (in2[1] << 1) +
543            ((u64) in[4]) * (in2[0] << 0);
544   tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
545            (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
546            ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
547            (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
548   tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
549            (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
550            ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
551            (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
552            ((u64) in[6]) * (in2[0] << 0);
553   tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
554            (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
555            ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
556            (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
557            ((u64) in[6]) * (in2[1] << 0) +
558            ((u64) in[7]) * (in2[0] << 0);
559   /* tmp[8] has the greatest value but doesn't overflow. See logic in
560    * felem_square. */
561   tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
562            (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
563            ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
564            (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
565            ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
566            (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
567   tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
568            (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
569            ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
570            (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
571            ((u64) in[7]) * (in2[2] << 0) +
572            ((u64) in[8]) * (in2[1] << 0);
573   tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
574             (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
575             ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
576             (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
577             ((u64) in[8]) * (in2[2] << 0);
578   tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
579             (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
580             ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
581             (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
582   tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
583             (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
584             ((u64) in[7]) * (in2[5] << 1) +
585             ((u64) in[8]) * (in2[4] << 0);
586   tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
587             (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
588             ((u64) in[8]) * (in2[5] << 0);
589   tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
590             (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
591   tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
592             ((u64) in[8]) * (in2[7] << 0);
593   tmp[16] = ((u64) in[8]) * (in2[8] << 0);
594 
595   felem_reduce_degree(out, tmp);
596 }
597 
felem_assign(felem out,const felem in)598 static void felem_assign(felem out, const felem in) {
599   memcpy(out, in, sizeof(felem));
600 }
601 
602 /* felem_scalar_3 sets out=3*out.
603  *
604  * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
605  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_3(felem out)606 static void felem_scalar_3(felem out) {
607   limb carry = 0;
608   unsigned i;
609 
610   for (i = 0;; i++) {
611     out[i] *= 3;
612     out[i] += carry;
613     carry = out[i] >> 29;
614     out[i] &= kBottom29Bits;
615 
616     i++;
617     if (i == NLIMBS)
618       break;
619 
620     out[i] *= 3;
621     out[i] += carry;
622     carry = out[i] >> 28;
623     out[i] &= kBottom28Bits;
624   }
625 
626   felem_reduce_carry(out, carry);
627 }
628 
629 /* felem_scalar_4 sets out=4*out.
630  *
631  * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
632  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_4(felem out)633 static void felem_scalar_4(felem out) {
634   limb carry = 0, next_carry;
635   unsigned i;
636 
637   for (i = 0;; i++) {
638     next_carry = out[i] >> 27;
639     out[i] <<= 2;
640     out[i] &= kBottom29Bits;
641     out[i] += carry;
642     carry = next_carry + (out[i] >> 29);
643     out[i] &= kBottom29Bits;
644 
645     i++;
646     if (i == NLIMBS)
647       break;
648 
649     next_carry = out[i] >> 26;
650     out[i] <<= 2;
651     out[i] &= kBottom28Bits;
652     out[i] += carry;
653     carry = next_carry + (out[i] >> 28);
654     out[i] &= kBottom28Bits;
655   }
656 
657   felem_reduce_carry(out, carry);
658 }
659 
660 /* felem_scalar_8 sets out=8*out.
661  *
662  * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
663  * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_8(felem out)664 static void felem_scalar_8(felem out) {
665   limb carry = 0, next_carry;
666   unsigned i;
667 
668   for (i = 0;; i++) {
669     next_carry = out[i] >> 26;
670     out[i] <<= 3;
671     out[i] &= kBottom29Bits;
672     out[i] += carry;
673     carry = next_carry + (out[i] >> 29);
674     out[i] &= kBottom29Bits;
675 
676     i++;
677     if (i == NLIMBS)
678       break;
679 
680     next_carry = out[i] >> 25;
681     out[i] <<= 3;
682     out[i] &= kBottom28Bits;
683     out[i] += carry;
684     carry = next_carry + (out[i] >> 28);
685     out[i] &= kBottom28Bits;
686   }
687 
688   felem_reduce_carry(out, carry);
689 }
690 
691 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
692  * time depending on the value of |in|. */
felem_is_zero_vartime(const felem in)693 static char felem_is_zero_vartime(const felem in) {
694   limb carry;
695   int i;
696   limb tmp[NLIMBS];
697 
698   felem_assign(tmp, in);
699 
700   /* First, reduce tmp to a minimal form. */
701   do {
702     carry = 0;
703     for (i = 0;; i++) {
704       tmp[i] += carry;
705       carry = tmp[i] >> 29;
706       tmp[i] &= kBottom29Bits;
707 
708       i++;
709       if (i == NLIMBS)
710         break;
711 
712       tmp[i] += carry;
713       carry = tmp[i] >> 28;
714       tmp[i] &= kBottom28Bits;
715     }
716 
717     felem_reduce_carry(tmp, carry);
718   } while (carry);
719 
720   /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
721   return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
722          memcmp(tmp, kP, sizeof(tmp)) == 0 ||
723          memcmp(tmp, k2P, sizeof(tmp)) == 0;
724 }
725 
726 
727 /* Montgomery operations: */
728 
729 #define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p
730 
731 #define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff}  // 1 / 2^257 mod p256.p
732 
733 static const cryptonite_p256_int kR = { kRDigits };
734 static const cryptonite_p256_int kRInv = { kRInvDigits };
735 
736 /* to_montgomery sets out = R*in. */
to_montgomery(felem out,const cryptonite_p256_int * in)737 static void to_montgomery(felem out, const cryptonite_p256_int* in) {
738   cryptonite_p256_int in_shifted;
739   int i;
740 
741   cryptonite_p256_init(&in_shifted);
742   cryptonite_p256_modmul(&cryptonite_SECP256r1_p, in, 0, &kR, &in_shifted);
743 
744   for (i = 0; i < NLIMBS; i++) {
745     if ((i & 1) == 0) {
746       out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
747       cryptonite_p256_shr(&in_shifted, 29, &in_shifted);
748     } else {
749       out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
750       cryptonite_p256_shr(&in_shifted, 28, &in_shifted);
751     }
752   }
753 
754   cryptonite_p256_clear(&in_shifted);
755 }
756 
757 /* from_montgomery sets out=in/R. */
from_montgomery(cryptonite_p256_int * out,const felem in)758 static void from_montgomery(cryptonite_p256_int* out, const felem in) {
759   cryptonite_p256_int result, tmp;
760   int i, top;
761 
762   cryptonite_p256_init(&result);
763   cryptonite_p256_init(&tmp);
764 
765   cryptonite_p256_add_d(&tmp, in[NLIMBS - 1], &result);
766   for (i = NLIMBS - 2; i >= 0; i--) {
767     if ((i & 1) == 0) {
768       top = cryptonite_p256_shl(&result, 29, &tmp);
769     } else {
770       top = cryptonite_p256_shl(&result, 28, &tmp);
771     }
772     top |= cryptonite_p256_add_d(&tmp, in[i], &result);
773   }
774 
775   cryptonite_p256_modmul(&cryptonite_SECP256r1_p, &kRInv, top, &result, out);
776 
777   cryptonite_p256_clear(&result);
778   cryptonite_p256_clear(&tmp);
779 }
780