1 /*
2 * Copyright 2013 The Android Open Source Project
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * * Neither the name of Google Inc. nor the names of its contributors may
12 * be used to endorse or promote products derived from this software
13 * without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
17 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
18 * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
24 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 // This is an implementation of the P256 finite field. It's written to be
28 // portable and still constant-time.
29 //
30 // WARNING: Implementing these functions in a constant-time manner is far from
31 // obvious. Be careful when touching this code.
32 //
33 // See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
34
35 #include <stdint.h>
36 #include <stdio.h>
37
38 #include <string.h>
39 #include <stdlib.h>
40
41 #include "p256/p256.h"
42
43 typedef uint8_t u8;
44 typedef uint32_t u32;
45 typedef int32_t s32;
46 typedef uint64_t u64;
47
48 /* Our field elements are represented as nine 32-bit limbs.
49 *
50 * The value of an felem (field element) is:
51 * x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
52 *
53 * That is, each limb is alternately 29 or 28-bits wide in little-endian
54 * order.
55 *
56 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
57 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
58 * when multiplying as terms end up one bit short of a limb which would require
59 * much bit-shifting to correct.
60 *
61 * Finally, the values stored in an felem are in Montgomery form. So the value
62 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
63 */
64 typedef u32 limb;
65 #define NLIMBS 9
66 typedef limb felem[NLIMBS];
67
68 static const limb kBottom28Bits = 0xfffffff;
69 static const limb kBottom29Bits = 0x1fffffff;
70
71 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
72 * 28-bit words. */
73 static const felem kOne = {
74 2, 0, 0, 0xffff800,
75 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
76 0
77 };
78 static const felem kZero = {0};
79 static const felem kP = {
80 0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
81 0, 0, 0x200000, 0xf000000,
82 0xfffffff
83 };
84 static const felem k2P = {
85 0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
86 0, 0, 0x400000, 0xe000000,
87 0x1fffffff
88 };
89 /* kPrecomputed contains precomputed values to aid the calculation of scalar
90 * multiples of the base point, G. It's actually two, equal length, tables
91 * concatenated.
92 *
93 * The first table contains (x,y) felem pairs for 16 multiples of the base
94 * point, G.
95 *
96 * Index | Index (binary) | Value
97 * 0 | 0000 | 0G (all zeros, omitted)
98 * 1 | 0001 | G
99 * 2 | 0010 | 2**64G
100 * 3 | 0011 | 2**64G + G
101 * 4 | 0100 | 2**128G
102 * 5 | 0101 | 2**128G + G
103 * 6 | 0110 | 2**128G + 2**64G
104 * 7 | 0111 | 2**128G + 2**64G + G
105 * 8 | 1000 | 2**192G
106 * 9 | 1001 | 2**192G + G
107 * 10 | 1010 | 2**192G + 2**64G
108 * 11 | 1011 | 2**192G + 2**64G + G
109 * 12 | 1100 | 2**192G + 2**128G
110 * 13 | 1101 | 2**192G + 2**128G + G
111 * 14 | 1110 | 2**192G + 2**128G + 2**64G
112 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
113 *
114 * The second table follows the same style, but the terms are 2**32G,
115 * 2**96G, 2**160G, 2**224G.
116 *
117 * This is ~2KB of data. */
118 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
119 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
120 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
121 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
122 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
123 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
124 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
125 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
126 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
127 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
128 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
129 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
130 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
131 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
132 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
133 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
134 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
135 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
136 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
137 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
138 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
139 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
140 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
141 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
142 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
143 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
144 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
145 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
146 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
147 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
148 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
149 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
150 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
151 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
152 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
153 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
154 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
155 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
156 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
157 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
158 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
159 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
160 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
161 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
162 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
163 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
164 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
165 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
166 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
167 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
168 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
169 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
170 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
171 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
172 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
173 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
174 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
175 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
176 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
177 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
178 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
179 };
180
181
182 /* Field element operations: */
183
184 /* NON_ZERO_TO_ALL_ONES returns:
185 * 0xffffffff for 0 < x <= 2**31
186 * 0 for x == 0 or x > 2**31.
187 *
188 * x must be a u32 or an equivalent type such as limb. */
189 #define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
190
191 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
192 * which is a term at 2**257.
193 *
194 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
195 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
felem_reduce_carry(felem inout,limb carry)196 static void felem_reduce_carry(felem inout, limb carry) {
197 const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
198
199 inout[0] += carry << 1;
200 inout[3] += 0x10000000 & carry_mask;
201 /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
202 * previous line therefore this doesn't underflow. */
203 inout[3] -= carry << 11;
204 inout[4] += (0x20000000 - 1) & carry_mask;
205 inout[5] += (0x10000000 - 1) & carry_mask;
206 inout[6] += (0x20000000 - 1) & carry_mask;
207 inout[6] -= carry << 22;
208 /* This may underflow if carry is non-zero but, if so, we'll fix it in the
209 * next line. */
210 inout[7] -= 1 & carry_mask;
211 inout[7] += carry << 25;
212 }
213
214 /* felem_sum sets out = in+in2.
215 *
216 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
217 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
felem_sum(felem out,const felem in,const felem in2)218 static void felem_sum(felem out, const felem in, const felem in2) {
219 limb carry = 0;
220 unsigned i;
221
222 for (i = 0;; i++) {
223 out[i] = in[i] + in2[i];
224 out[i] += carry;
225 carry = out[i] >> 29;
226 out[i] &= kBottom29Bits;
227
228 i++;
229 if (i == NLIMBS)
230 break;
231
232 out[i] = in[i] + in2[i];
233 out[i] += carry;
234 carry = out[i] >> 28;
235 out[i] &= kBottom28Bits;
236 }
237
238 felem_reduce_carry(out, carry);
239 }
240
241 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
242 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
243 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
244 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
245 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
246 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
247
248 /* zero31 is 0 mod p. */
249 static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
250
251 /* felem_diff sets out = in-in2.
252 *
253 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
254 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
255 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_diff(felem out,const felem in,const felem in2)256 static void felem_diff(felem out, const felem in, const felem in2) {
257 limb carry = 0;
258 unsigned i;
259
260 for (i = 0;; i++) {
261 out[i] = in[i] - in2[i];
262 out[i] += zero31[i];
263 out[i] += carry;
264 carry = out[i] >> 29;
265 out[i] &= kBottom29Bits;
266
267 i++;
268 if (i == NLIMBS)
269 break;
270
271 out[i] = in[i] - in2[i];
272 out[i] += zero31[i];
273 out[i] += carry;
274 carry = out[i] >> 28;
275 out[i] &= kBottom28Bits;
276 }
277
278 felem_reduce_carry(out, carry);
279 }
280
281 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
282 * with the same 29,28,... bit positions as an felem.
283 *
284 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
285 * Since we just multiplied two Montgomery values together, the result is
286 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
287 * in Montgomery form.
288 *
289 * On entry: tmp[i] < 2**64
290 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
felem_reduce_degree(felem out,u64 tmp[17])291 static void felem_reduce_degree(felem out, u64 tmp[17]) {
292 /* The following table may be helpful when reading this code:
293 *
294 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
295 * Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
296 * Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285
297 * (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
298 limb tmp2[18], carry, x, xMask;
299 unsigned i;
300
301 /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
302 * felem. So the top of an element of tmp might overlap with another
303 * element two positions down. The following loop eliminates this
304 * overlap. */
305 tmp2[0] = (limb)(tmp[0] & kBottom29Bits);
306
307 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
308 * and hint to the compiler that it can do a single-word shift by selecting
309 * the right register rather than doing a double-word shift and truncating
310 * afterwards. */
311 tmp2[1] = ((limb) tmp[0]) >> 29;
312 tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
313 tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
314 carry = tmp2[1] >> 28;
315 tmp2[1] &= kBottom28Bits;
316
317 for (i = 2; i < 17; i++) {
318 tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
319 tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
320 tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
321 tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
322 tmp2[i] += carry;
323 carry = tmp2[i] >> 29;
324 tmp2[i] &= kBottom29Bits;
325
326 i++;
327 if (i == 17)
328 break;
329 tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
330 tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
331 tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
332 tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
333 tmp2[i] += carry;
334 carry = tmp2[i] >> 28;
335 tmp2[i] &= kBottom28Bits;
336 }
337
338 tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
339 tmp2[17] += ((limb)(tmp[16])) >> 29;
340 tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
341 tmp2[17] += carry;
342
343 /* Montgomery elimination of terms.
344 *
345 * Since R is 2**257, we can divide by R with a bitwise shift if we can
346 * ensure that the right-most 257 bits are all zero. We can make that true by
347 * adding multiplies of p without affecting the value.
348 *
349 * So we eliminate limbs from right to left. Since the bottom 29 bits of p
350 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
351 * We can do that for 8 further limbs and then right shift to eliminate the
352 * extra factor of R. */
353 for (i = 0;; i += 2) {
354 tmp2[i + 1] += tmp2[i] >> 29;
355 x = tmp2[i] & kBottom29Bits;
356 xMask = NON_ZERO_TO_ALL_ONES(x);
357 tmp2[i] = 0;
358
359 /* The bounds calculations for this loop are tricky. Each iteration of
360 * the loop eliminates two words by adding values to words to their
361 * right.
362 *
363 * The following table contains the amounts added to each word (as an
364 * offset from the value of i at the top of the loop). The amounts are
365 * accounted for from the first and second half of the loop separately
366 * and are written as, for example, 28 to mean a value <2**28.
367 *
368 * Word: 3 4 5 6 7 8 9 10
369 * Added in top half: 28 11 29 21 29 28
370 * 28 29
371 * 29
372 * Added in bottom half: 29 10 28 21 28 28
373 * 29
374 *
375 * The value that is currently offset 7 will be offset 5 for the next
376 * iteration and then offset 3 for the iteration after that. Therefore
377 * the total value added will be the values added at 7, 5 and 3.
378 *
379 * The following table accumulates these values. The sums at the bottom
380 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
381 *
382 * Word: 3 4 5 6 7 8 9 10 11 12 13
383 * 28 11 10 29 21 29 28 28 28 28 28
384 * 29 28 11 28 29 28 29 28 29 28
385 * 29 28 21 21 29 21 29 21
386 * 10 29 28 21 28 21 28
387 * 28 29 28 29 28 29 28
388 * 11 10 29 10 29 10
389 * 29 28 11 28 11
390 * 29 29
391 * --------------------------------------------
392 * 30+ 31+ 30+ 31+ 30+
393 * 28+ 29+ 28+ 29+ 21+
394 * 21+ 28+ 21+ 28+ 10
395 * 10 21+ 10 21+
396 * 11 11
397 *
398 * So the greatest amount is added to tmp2[10] and tmp2[12]. If
399 * tmp2[10/12] has an initial value of <2**29, then the maximum value
400 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
401 * as required. */
402 tmp2[i + 3] += (x << 10) & kBottom28Bits;
403 tmp2[i + 4] += (x >> 18);
404
405 tmp2[i + 6] += (x << 21) & kBottom29Bits;
406 tmp2[i + 7] += x >> 8;
407
408 /* At position 200, which is the starting bit position for word 7, we
409 * have a factor of 0xf000000 = 2**28 - 2**24. */
410 tmp2[i + 7] += 0x10000000 & xMask;
411 /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
412 tmp2[i + 8] += (x - 1) & xMask;
413 tmp2[i + 7] -= (x << 24) & kBottom28Bits;
414 tmp2[i + 8] -= x >> 4;
415
416 tmp2[i + 8] += 0x20000000 & xMask;
417 tmp2[i + 8] -= x;
418 tmp2[i + 8] += (x << 28) & kBottom29Bits;
419 tmp2[i + 9] += ((x >> 1) - 1) & xMask;
420
421 if (i+1 == NLIMBS)
422 break;
423 tmp2[i + 2] += tmp2[i + 1] >> 28;
424 x = tmp2[i + 1] & kBottom28Bits;
425 xMask = NON_ZERO_TO_ALL_ONES(x);
426 tmp2[i + 1] = 0;
427
428 tmp2[i + 4] += (x << 11) & kBottom29Bits;
429 tmp2[i + 5] += (x >> 18);
430
431 tmp2[i + 7] += (x << 21) & kBottom28Bits;
432 tmp2[i + 8] += x >> 7;
433
434 /* At position 199, which is the starting bit of the 8th word when
435 * dealing with a context starting on an odd word, we have a factor of
436 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
437 * word from i+1 is i+8. */
438 tmp2[i + 8] += 0x20000000 & xMask;
439 tmp2[i + 9] += (x - 1) & xMask;
440 tmp2[i + 8] -= (x << 25) & kBottom29Bits;
441 tmp2[i + 9] -= x >> 4;
442
443 tmp2[i + 9] += 0x10000000 & xMask;
444 tmp2[i + 9] -= x;
445 tmp2[i + 10] += (x - 1) & xMask;
446 }
447
448 /* We merge the right shift with a carry chain. The words above 2**257 have
449 * widths of 28,29,... which we need to correct when copying them down. */
450 carry = 0;
451 for (i = 0; i < 8; i++) {
452 /* The maximum value of tmp2[i + 9] occurs on the first iteration and
453 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
454 * therefore safe. */
455 out[i] = tmp2[i + 9];
456 out[i] += carry;
457 out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
458 carry = out[i] >> 29;
459 out[i] &= kBottom29Bits;
460
461 i++;
462 out[i] = tmp2[i + 9] >> 1;
463 out[i] += carry;
464 carry = out[i] >> 28;
465 out[i] &= kBottom28Bits;
466 }
467
468 out[8] = tmp2[17];
469 out[8] += carry;
470 carry = out[8] >> 29;
471 out[8] &= kBottom29Bits;
472
473 felem_reduce_carry(out, carry);
474 }
475
476 /* felem_square sets out=in*in.
477 *
478 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
479 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_square(felem out,const felem in)480 static void felem_square(felem out, const felem in) {
481 u64 tmp[17];
482
483 tmp[0] = ((u64) in[0]) * in[0];
484 tmp[1] = ((u64) in[0]) * (in[1] << 1);
485 tmp[2] = ((u64) in[0]) * (in[2] << 1) +
486 ((u64) in[1]) * (in[1] << 1);
487 tmp[3] = ((u64) in[0]) * (in[3] << 1) +
488 ((u64) in[1]) * (in[2] << 1);
489 tmp[4] = ((u64) in[0]) * (in[4] << 1) +
490 ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
491 tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
492 (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
493 tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
494 (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
495 ((u64) in[3]) * (in[3] << 1);
496 tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
497 (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
498 ((u64) in[3]) * (in[4] << 1);
499 /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
500 * which is < 2**64 as required. */
501 tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
502 (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
503 ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
504 tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
505 (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
506 ((u64) in[4]) * (in[5] << 1);
507 tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
508 (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
509 ((u64) in[5]) * (in[5] << 1);
510 tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
511 (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
512 tmp[12] = ((u64) in[4]) * (in[8] << 1) +
513 ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
514 tmp[13] = ((u64) in[5]) * (in[8] << 1) +
515 ((u64) in[6]) * (in[7] << 1);
516 tmp[14] = ((u64) in[6]) * (in[8] << 1) +
517 ((u64) in[7]) * (in[7] << 1);
518 tmp[15] = ((u64) in[7]) * (in[8] << 1);
519 tmp[16] = ((u64) in[8]) * in[8];
520
521 felem_reduce_degree(out, tmp);
522 }
523
524 /* felem_mul sets out=in*in2.
525 *
526 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
527 * in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
528 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_mul(felem out,const felem in,const felem in2)529 static void felem_mul(felem out, const felem in, const felem in2) {
530 u64 tmp[17];
531
532 tmp[0] = ((u64) in[0]) * in2[0];
533 tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
534 ((u64) in[1]) * (in2[0] << 0);
535 tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
536 (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
537 tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
538 (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
539 ((u64) in[3]) * (in2[0] << 0);
540 tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
541 (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
542 ((u64) in[3]) * (in2[1] << 1) +
543 ((u64) in[4]) * (in2[0] << 0);
544 tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
545 (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
546 ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
547 (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
548 tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
549 (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
550 ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
551 (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
552 ((u64) in[6]) * (in2[0] << 0);
553 tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
554 (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
555 ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
556 (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
557 ((u64) in[6]) * (in2[1] << 0) +
558 ((u64) in[7]) * (in2[0] << 0);
559 /* tmp[8] has the greatest value but doesn't overflow. See logic in
560 * felem_square. */
561 tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
562 (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
563 ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
564 (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
565 ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
566 (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
567 tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
568 (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
569 ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
570 (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
571 ((u64) in[7]) * (in2[2] << 0) +
572 ((u64) in[8]) * (in2[1] << 0);
573 tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
574 (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
575 ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
576 (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
577 ((u64) in[8]) * (in2[2] << 0);
578 tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
579 (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
580 ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
581 (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
582 tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
583 (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
584 ((u64) in[7]) * (in2[5] << 1) +
585 ((u64) in[8]) * (in2[4] << 0);
586 tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
587 (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
588 ((u64) in[8]) * (in2[5] << 0);
589 tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
590 (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
591 tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
592 ((u64) in[8]) * (in2[7] << 0);
593 tmp[16] = ((u64) in[8]) * (in2[8] << 0);
594
595 felem_reduce_degree(out, tmp);
596 }
597
felem_assign(felem out,const felem in)598 static void felem_assign(felem out, const felem in) {
599 memcpy(out, in, sizeof(felem));
600 }
601
602 /* felem_scalar_3 sets out=3*out.
603 *
604 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
605 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_3(felem out)606 static void felem_scalar_3(felem out) {
607 limb carry = 0;
608 unsigned i;
609
610 for (i = 0;; i++) {
611 out[i] *= 3;
612 out[i] += carry;
613 carry = out[i] >> 29;
614 out[i] &= kBottom29Bits;
615
616 i++;
617 if (i == NLIMBS)
618 break;
619
620 out[i] *= 3;
621 out[i] += carry;
622 carry = out[i] >> 28;
623 out[i] &= kBottom28Bits;
624 }
625
626 felem_reduce_carry(out, carry);
627 }
628
629 /* felem_scalar_4 sets out=4*out.
630 *
631 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
632 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_4(felem out)633 static void felem_scalar_4(felem out) {
634 limb carry = 0, next_carry;
635 unsigned i;
636
637 for (i = 0;; i++) {
638 next_carry = out[i] >> 27;
639 out[i] <<= 2;
640 out[i] &= kBottom29Bits;
641 out[i] += carry;
642 carry = next_carry + (out[i] >> 29);
643 out[i] &= kBottom29Bits;
644
645 i++;
646 if (i == NLIMBS)
647 break;
648
649 next_carry = out[i] >> 26;
650 out[i] <<= 2;
651 out[i] &= kBottom28Bits;
652 out[i] += carry;
653 carry = next_carry + (out[i] >> 28);
654 out[i] &= kBottom28Bits;
655 }
656
657 felem_reduce_carry(out, carry);
658 }
659
660 /* felem_scalar_8 sets out=8*out.
661 *
662 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
663 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
felem_scalar_8(felem out)664 static void felem_scalar_8(felem out) {
665 limb carry = 0, next_carry;
666 unsigned i;
667
668 for (i = 0;; i++) {
669 next_carry = out[i] >> 26;
670 out[i] <<= 3;
671 out[i] &= kBottom29Bits;
672 out[i] += carry;
673 carry = next_carry + (out[i] >> 29);
674 out[i] &= kBottom29Bits;
675
676 i++;
677 if (i == NLIMBS)
678 break;
679
680 next_carry = out[i] >> 25;
681 out[i] <<= 3;
682 out[i] &= kBottom28Bits;
683 out[i] += carry;
684 carry = next_carry + (out[i] >> 28);
685 out[i] &= kBottom28Bits;
686 }
687
688 felem_reduce_carry(out, carry);
689 }
690
691 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
692 * time depending on the value of |in|. */
felem_is_zero_vartime(const felem in)693 static char felem_is_zero_vartime(const felem in) {
694 limb carry;
695 int i;
696 limb tmp[NLIMBS];
697
698 felem_assign(tmp, in);
699
700 /* First, reduce tmp to a minimal form. */
701 do {
702 carry = 0;
703 for (i = 0;; i++) {
704 tmp[i] += carry;
705 carry = tmp[i] >> 29;
706 tmp[i] &= kBottom29Bits;
707
708 i++;
709 if (i == NLIMBS)
710 break;
711
712 tmp[i] += carry;
713 carry = tmp[i] >> 28;
714 tmp[i] &= kBottom28Bits;
715 }
716
717 felem_reduce_carry(tmp, carry);
718 } while (carry);
719
720 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
721 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
722 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
723 memcmp(tmp, k2P, sizeof(tmp)) == 0;
724 }
725
726
727 /* Montgomery operations: */
728
729 #define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p
730
731 #define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff} // 1 / 2^257 mod p256.p
732
733 static const cryptonite_p256_int kR = { kRDigits };
734 static const cryptonite_p256_int kRInv = { kRInvDigits };
735
736 /* to_montgomery sets out = R*in. */
to_montgomery(felem out,const cryptonite_p256_int * in)737 static void to_montgomery(felem out, const cryptonite_p256_int* in) {
738 cryptonite_p256_int in_shifted;
739 int i;
740
741 cryptonite_p256_init(&in_shifted);
742 cryptonite_p256_modmul(&cryptonite_SECP256r1_p, in, 0, &kR, &in_shifted);
743
744 for (i = 0; i < NLIMBS; i++) {
745 if ((i & 1) == 0) {
746 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
747 cryptonite_p256_shr(&in_shifted, 29, &in_shifted);
748 } else {
749 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
750 cryptonite_p256_shr(&in_shifted, 28, &in_shifted);
751 }
752 }
753
754 cryptonite_p256_clear(&in_shifted);
755 }
756
757 /* from_montgomery sets out=in/R. */
from_montgomery(cryptonite_p256_int * out,const felem in)758 static void from_montgomery(cryptonite_p256_int* out, const felem in) {
759 cryptonite_p256_int result, tmp;
760 int i, top;
761
762 cryptonite_p256_init(&result);
763 cryptonite_p256_init(&tmp);
764
765 cryptonite_p256_add_d(&tmp, in[NLIMBS - 1], &result);
766 for (i = NLIMBS - 2; i >= 0; i--) {
767 if ((i & 1) == 0) {
768 top = cryptonite_p256_shl(&result, 29, &tmp);
769 } else {
770 top = cryptonite_p256_shl(&result, 28, &tmp);
771 }
772 top |= cryptonite_p256_add_d(&tmp, in[i], &result);
773 }
774
775 cryptonite_p256_modmul(&cryptonite_SECP256r1_p, &kRInv, top, &result, out);
776
777 cryptonite_p256_clear(&result);
778 cryptonite_p256_clear(&tmp);
779 }
780