1 /*
2 * Copyright 2013 The Android Open Source Project
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * * Neither the name of Google Inc. nor the names of its contributors may
12 * be used to endorse or promote products derived from this software
13 * without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
17 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
18 * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
24 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 // This is an implementation of the P256 finite field. It's written to be
28 // portable and still constant-time.
29 //
30 // WARNING: Implementing these functions in a constant-time manner is far from
31 // obvious. Be careful when touching this code.
32 //
33 // See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
34
35 #include <stdint.h>
36 #include <stdio.h>
37
38 #include <string.h>
39 #include <stdlib.h>
40
41 #include "p256/p256.h"
42
43 typedef uint8_t u8;
44 typedef uint32_t u32;
45 typedef uint64_t u64;
46 typedef int64_t s64;
47 typedef __uint128_t u128;
48
49 /* Our field elements are represented as five 64-bit limbs.
50 *
51 * The value of an felem (field element) is:
52 * x[0] + (x[1] * 2**51) + (x[2] * 2**103) + ... + (x[4] * 2**206)
53 *
54 * That is, each limb is alternately 51 or 52-bits wide in little-endian
55 * order.
56 *
57 * This means that an felem hits 2**257, rather than 2**256 as we would like.
58 *
59 * Finally, the values stored in an felem are in Montgomery form. So the value
60 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
61 */
62 typedef u64 limb;
63 #define NLIMBS 5
64 typedef limb felem[NLIMBS];
65
66 static const limb kBottom51Bits = 0x7ffffffffffff;
67 static const limb kBottom52Bits = 0xfffffffffffff;
68
69 /* kOne is the number 1 as an felem. It's 2**257 mod p split up into 51 and
70 * 52-bit words. */
71 static const felem kOne = {
72 2, 0xfc00000000000, 0x7ffffffffffff, 0xfff7fffffffff, 0x7ffff
73 };
74 static const felem kZero = {0};
75 static const felem kP = {
76 0x7ffffffffffff, 0x1fffffffffff, 0, 0x4000000000, 0x3fffffffc0000
77 };
78 static const felem k2P = {
79 0x7fffffffffffe, 0x3fffffffffff, 0, 0x8000000000, 0x7fffffff80000
80 };
81 /* kPrecomputed contains precomputed values to aid the calculation of scalar
82 * multiples of the base point, G. It's actually two, equal length, tables
83 * concatenated.
84 *
85 * The first table contains (x,y) felem pairs for 16 multiples of the base
86 * point, G.
87 *
88 * Index | Index (binary) | Value
89 * 0 | 0000 | 0G (all zeros, omitted)
90 * 1 | 0001 | G
91 * 2 | 0010 | 2**64G
92 * 3 | 0011 | 2**64G + G
93 * 4 | 0100 | 2**128G
94 * 5 | 0101 | 2**128G + G
95 * 6 | 0110 | 2**128G + 2**64G
96 * 7 | 0111 | 2**128G + 2**64G + G
97 * 8 | 1000 | 2**192G
98 * 9 | 1001 | 2**192G + G
99 * 10 | 1010 | 2**192G + 2**64G
100 * 11 | 1011 | 2**192G + 2**64G + G
101 * 12 | 1100 | 2**192G + 2**128G
102 * 13 | 1101 | 2**192G + 2**128G + G
103 * 14 | 1110 | 2**192G + 2**128G + 2**64G
104 * 15 | 1111 | 2**192G + 2**128G + 2**64G + G
105 *
106 * The second table follows the same style, but the terms are 2**32G,
107 * 2**96G, 2**160G, 2**224G.
108 *
109 * This is ~2KB of data. */
110 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
111 0x661a831522878, 0xf17fb6d805e79, 0x5889441d6ea57, 0xae33cfdb995bb, 0xc482fbb529ba,
112 0x4a6af9d2aac15, 0x90e867917377c, 0x487cc962d2ae3, 0xec2a97443446e, 0x2b8ff8c52c42,
113 0x45f8a2d41a576, 0xb06988d2653e4, 0x718b22c357305, 0x33fc920e79d2b, 0x17af34b0fe8db,
114 0x38e17eb402f2f, 0x3382558649705, 0x47f6d48f482d1, 0x7bd42488d9b83, 0x3b247c8b86b78,
115 0x4d08fc26f7778, 0x7a29a82fb2795, 0x75cd18f90d11a, 0xad8e213b0bc, 0x2d5f0142899e8,
116 0x506f98098fb57, 0x2f0c98301e4aa, 0x39b30dd5cf67d, 0x9c146498ab13c, 0xa5db92df5b7b,
117 0x184897fc4124a, 0xe3f73a19d8aa, 0x4e1c18e47066b, 0x27b2d4b52eaee, 0x30eac3ea10e99,
118 0x4e74546e2e7d5, 0x1f4dde2d97a1d, 0x6ead0f88e1200, 0x7dec87c220f02, 0x3d08ff096310f,
119 0x23e5659633ffa, 0x6ec648f08c722, 0x3172a3806ea35, 0xf6e5b681eb3c5, 0x2c3758260f89d,
120 0x38dca4fd1da12, 0xf06067b78830d, 0x3194be87a068c, 0x78893c7eb602b, 0xcead60438432,
121 0x6ee69a56a67ab, 0xd886f77701895, 0x67b0a4d9cee2b, 0x3586bbf3e4d53, 0x1db6f32921d93,
122 0x260756ca4b366, 0x4f40e9d2039fa, 0x4f3f09f5a82bf, 0xccde2d641e8cd, 0x305a30cd2e8c5,
123 0x471c235cb5439, 0xab279cd962f5a, 0x17e1fb6e2dd94, 0xfe64589800a77, 0xe8793d99775f,
124 0x48c62f4e614aa, 0xbf76ef20eb2a4, 0x669c672556c, 0x24683e0eff056, 0x12252b369ab76,
125 0x821de9f162d5, 0xf911ec99a95be, 0x6721f065c906b, 0x58d452035c736, 0x1f9f01a6a15,
126 0x6135009b7d8d3, 0xdaeeeb417dfc0, 0x63865fea0ee17, 0x6e0a304b939d6, 0x204ba2076833d,
127 0x4ade586f35669, 0x2c1077e34611a, 0x5b1a3bea3b81a, 0xf97d018a22c8b, 0x38d7996b08af8,
128 0x6ea62baeb7aa0, 0xebdcbd9ef2670, 0x35dc8fe0df3fe, 0xe458309d20c24, 0x11e87898716a0,
129 0x7c44bab7cb456, 0xd64d3cf1bb64, 0x189bff1bf9e66, 0xb5218a049311, 0x285dda6cbcc81,
130 0x3238dcafd8c7c, 0x607736c8de0, 0xdb83d99508b1, 0x4e1a0d404cd81, 0x1588008c00ff2,
131 0x16b8b36722b27, 0x876609c3f3f1a, 0x66b72ef0e17d6, 0x705f8a279d568, 0x2eaac4cd01fdd,
132 0x1171ce9705fe9, 0xffc79cd3264ee, 0x700c8ab4b80f0, 0x208d3d4f57a1, 0x337262a8ca4eb,
133 0x297fd01d843fd, 0xa90956fa097f8, 0x529759fdb3845, 0x1d78c5e2d0397, 0x3d6938a4adbf3,
134 0x16d5853560b66, 0xf138946b9a430, 0x2ab79f4dea6a0, 0xd42053ee43ae1, 0x3b9c3ef1cf870,
135 0x598934ad81baf, 0x5f1821b1d07a7, 0x416bb3a973ff3, 0x23f07bd0a047a, 0x19bdc2e09f786,
136 0x56dc9981cd51f, 0xfbace23c8cd65, 0x673bd3bf5b52e, 0x46a95d229fd61, 0xe09ad64bcfb1,
137 0xe5292b91f17d, 0xfeefcd8afc287, 0x58f52b0a58711, 0x4800f20c201ef, 0x2084fce608f67,
138 0x12ba0b128ae0b, 0x5977ae17030b4, 0x101126ee420f6, 0xf70823495c6bd, 0xde19a27d7770,
139 0x5c6ac852260e8, 0x9d22950ac4356, 0x441cca955246c, 0x660a34e5332d9, 0x14ac8ea92f8d2,
140 0x6b6d7709f307e, 0x67d7e13879db, 0x2ea8626f9fbbd, 0x99609006a4b40, 0x31bb2a8f8c779,
141 0x10c04828ea335, 0xae9acdcbc080a, 0x617af2342607a, 0xc7494ea53e553, 0x2ca9e2872defa,
142 0x6c399fab21f1f, 0xab139b245e758, 0x3ad933dcba589, 0x4797fecb08811, 0x31f5dbf8f594,
143 0x7dc6361cc7a69, 0xc8a7953ead3f9, 0x79ed693d18015, 0x418a024999a6a, 0x2c4fdc9436aa,
144 0x1eb98cb06aa75, 0x2989592796a9c, 0x11194821e425, 0xe27a648228388, 0x35d834b6c12a0,
145 0x541807713b532, 0x7ae0a1008aaee, 0x7017a29bcb5e, 0x6b193c23c315c, 0x19bd25ac82f2a,
146 0x6a01a43eef294, 0xddf5b5fd84f19, 0x33f5ba081c016, 0xdeb052d1bc082, 0x6b2f06afa617,
147 0x7ca1eda6a939f, 0xbdeb35997b50c, 0x47f2d1bccda5, 0xc2ff4adfed667, 0x87712997be4,
148 0x21fc2e2b37659, 0xf7d62cd5ed951, 0x27fa9cbdf7efa, 0xba25582bf3a6b, 0x2a42b8bd89398,
149 0x6d377d07eecd2, 0x9ca1df5af387, 0x1109e3427e2ba, 0xce4aa4572a19, 0x103baaef71e16,
150 0x2c3b2dfde328a, 0xbec4b4a30e1ef, 0x37d92a86204f3, 0x806cfde68eb39, 0x246e2f72b8aa5,
151 0x68d3de93462a9, 0x53b8acba6bbc3, 0x2492a70fa1696, 0x38c62d5760f55, 0x15096fe4904f2,
152 0x4e44e9bed3e3a, 0xb28bfd79cc9bc, 0x6a77513839320, 0x480dcec6739db, 0x3601b739f2465,
153 0x43c348e2a7e1, 0xe448106327879, 0x175d9cae1b0ed, 0xd3b89dee743b8, 0x392d73ca255bc,
154 0x32946db0d3a18, 0x9261b09907cc, 0x5ba517a755722, 0x51f24fdaf5184, 0x1cdc732989ed8,
155 0x2f7806ba16694, 0xae0c9f029f8d0, 0xd8b45102ce1, 0xca1c7db9316d6, 0x162088a67066f,
156 0x39de35b2b4162, 0xa19f550d88ae9, 0x7921b27026cde, 0x94b936b66e900, 0x1023bd5fa17fc,
157 0x436837814cfa4, 0x29113492283c4, 0x66d1cdd8b51d8, 0xa540702278eb2, 0x47ef1b29285d,
158 0x587b50917e50e, 0xb4cda75bab3b, 0x112520b0a9886, 0x66b9ac16fee49, 0x17bf17e92b2eb,
159 0x2456a2f150ed7, 0xfa214412d0280, 0x3ca7dd947fe5b, 0xa72c28598d58a, 0x255d945efc3e,
160 0x2873f04e0f215, 0x74178fd1af57b, 0x788848b5b2d6, 0xb1ffafaae0db6, 0x32a1b7b3cbb2a,
161 0x4bd9935d6b2da, 0x9c08f24ad30a5, 0x4e58407a80f, 0x1b3a3825a5b17, 0x6547e9fc82f5,
162 0x47484aa3656c3, 0x6ee43f341a494, 0x64a98f87adea2, 0x619b3f8e95f01, 0xb6e513266ed8,
163 0x421c2a673090, 0xa1c1de32348c7, 0x55b85c3a1e8a3, 0xe05ce8ef330b4, 0x2561e49c15d84,
164 0x40aa2d33130fa, 0x12b827d35866f, 0xfe4cf62c8ddb, 0x2fa0ef05bb28d, 0x1c06ca63f1cb8,
165 0x32a971863863b, 0xff6fc86830da1, 0x71e7b25a14cf3, 0xea9c5ebb1373a, 0x250bbaa3e1634,
166 0x5b5ffeda5b765, 0xf25d2a746331b, 0x115e3a3f43632, 0x67303af43c9d5, 0x14bb538a0e559,
167 0x75623687d43b7, 0xa349674a4b38d, 0x613c61829ffc6, 0x689828d8110c7, 0x139115f5af7d5,
168 0xf1d856152289, 0x45cbe967168ab, 0x51f38e1680901, 0x34808e8f652b0, 0x1f4a6a921e156,
169 0x35dfaf3d8341f, 0xf53ace725cb63, 0x3d86a54eef35b, 0xa103aabaffe2c, 0x2decc36296fbd,
170 0x510282be73d6f, 0xd4e6365db206a, 0x4bdc5f5bb8bf3, 0xde7ea32a3aee7, 0x71269e274305,
171 };
172
173
174 /* Field element operations: */
175
176 /* NON_ZERO_TO_ALL_ONES returns:
177 * 0xffffffffffffffff for 0 < x <= 2**63
178 * 0 for x == 0 or x > 2**63.
179 *
180 * x must be a u64 or an equivalent type such as limb. */
181 #define NON_ZERO_TO_ALL_ONES(x) ((((u64)(x) - 1) >> 63) - 1)
182
183 /* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
184 * which is a term at 2**257.
185 *
186 * On entry: carry < 2**6, inout[0,2,...] < 2**51, inout[1,3,...] < 2**52.
187 * On exit: inout[0,2,..] < 2**52, inout[1,3,...] < 2**53. */
felem_reduce_carry(felem inout,limb carry)188 static void felem_reduce_carry(felem inout, limb carry) {
189 const u64 carry_mask = NON_ZERO_TO_ALL_ONES(carry);
190
191 inout[0] += carry << 1;
192 inout[1] += 0x10000000000000 & carry_mask;
193 /* carry < 2**6 thus (carry << 46) < 2**52 and we added 2**52 in the
194 * previous line therefore this doesn't underflow. */
195 inout[1] -= carry << 46;
196 inout[2] += (0x8000000000000 - 1) & carry_mask;
197 inout[3] += (0x10000000000000 - 1) & carry_mask;
198 inout[3] -= carry << 39;
199 /* This may underflow if carry is non-zero but, if so, we'll fix it in the
200 * next line. */
201 inout[4] -= 1 & carry_mask;
202 inout[4] += carry << 19;
203 }
204
205 /* felem_sum sets out = in+in2.
206 *
207 * On entry, in[i]+in2[i] must not overflow a 64-bit word.
208 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53 */
felem_sum(felem out,const felem in,const felem in2)209 static void felem_sum(felem out, const felem in, const felem in2) {
210 limb carry = 0;
211 unsigned i;
212
213 for (i = 0;; i++) {
214 out[i] = in[i] + in2[i];
215 out[i] += carry;
216 carry = out[i] >> 51;
217 out[i] &= kBottom51Bits;
218
219 i++;
220 if (i == NLIMBS)
221 break;
222
223 out[i] = in[i] + in2[i];
224 out[i] += carry;
225 carry = out[i] >> 52;
226 out[i] &= kBottom52Bits;
227 }
228
229 felem_reduce_carry(out, carry);
230 }
231
232 #define two53m3 (((limb)1) << 53) - (((limb)1) << 3)
233 #define two54m52p48m2 (((limb)1) << 54) - (((limb)1) << 52) + (((limb)1) << 48) - (((limb)1) << 2)
234 #define two53m2p0 (((limb)1) << 53) - (((limb)1) << 2) + (((limb)1) << 0)
235 #define two54m52p41m2 (((limb)1) << 54) - (((limb)1) << 52) + (((limb)1) << 41) - (((limb)1) << 2)
236 #define two53m21m2p0 (((limb)1) << 53) - (((limb)1) << 21) - (((limb)1) << 2) + (((limb)1) << 0)
237
238 /* zero53 is 0 mod p. */
239 static const felem zero53 = { two53m3, two54m52p48m2, two53m2p0, two54m52p41m2, two53m21m2p0 };
240
241 /* felem_diff sets out = in-in2.
242 *
243 * On entry: in[0,2,...] < 2**52, in[1,3,...] < 2**53 and
244 * in2[0,2,...] < 2**52, in2[1,3,...] < 2**53.
245 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_diff(felem out,const felem in,const felem in2)246 static void felem_diff(felem out, const felem in, const felem in2) {
247 limb carry = 0;
248 unsigned i;
249
250 for (i = 0;; i++) {
251 out[i] = in[i] - in2[i];
252 out[i] += zero53[i];
253 out[i] += carry;
254 carry = out[i] >> 51;
255 out[i] &= kBottom51Bits;
256
257 i++;
258 if (i == NLIMBS)
259 break;
260
261 out[i] = in[i] - in2[i];
262 out[i] += zero53[i];
263 out[i] += carry;
264 carry = out[i] >> 52;
265 out[i] &= kBottom52Bits;
266 }
267
268 felem_reduce_carry(out, carry);
269 }
270
271 /* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
272 * with the same 51,52,... bit positions as an felem.
273 *
274 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
275 * Since we just multiplied two Montgomery values together, the result is
276 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
277 * in Montgomery form.
278 *
279 * On entry: tmp[i] < 2**128
280 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53 */
felem_reduce_degree(felem out,u128 tmp[9])281 static void felem_reduce_degree(felem out, u128 tmp[9]) {
282 /* The following table may be helpful when reading this code:
283 *
284 * Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10
285 * Width (bits): 51| 52| 51| 52| 51| 52| 51| 52| 51| 52| 51
286 * Start bit: 0 | 51|103|154|206|257|309|360|412|463|515
287 * (odd phase): 0 | 52|103|155|206|258|309|361|412|464|515 */
288 limb tmp2[10], carry, x, xShiftedMask;
289 unsigned i;
290
291 /* tmp contains 128-bit words with the same 51,52,51-bit positions as an
292 * felem. So the top of an element of tmp might overlap with another
293 * element two positions down. The following loop eliminates this
294 * overlap. */
295 tmp2[0] = (limb)(tmp[0] & kBottom51Bits);
296
297 /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>64)" to try
298 * and hint to the compiler that it can do a single-word shift by selecting
299 * the right register rather than doing a double-word shift and truncating
300 * afterwards. */
301 tmp2[1] = ((limb) tmp[0]) >> 51;
302 tmp2[1] |= (((limb)(tmp[0] >> 64)) << 13) & kBottom52Bits;
303 tmp2[1] += ((limb) tmp[1]) & kBottom52Bits;
304 carry = tmp2[1] >> 52;
305 tmp2[1] &= kBottom52Bits;
306
307 for (i = 2; i < 9; i++) {
308 tmp2[i] = ((limb)(tmp[i - 2] >> 64)) >> 39;
309 tmp2[i] += ((limb)(tmp[i - 1])) >> 52;
310 tmp2[i] += (((limb)(tmp[i - 1] >> 64)) << 12) & kBottom51Bits;
311 tmp2[i] += ((limb) tmp[i]) & kBottom51Bits;
312 tmp2[i] += carry;
313 carry = tmp2[i] >> 51;
314 tmp2[i] &= kBottom51Bits;
315
316 i++;
317 if (i == 9)
318 break;
319 tmp2[i] = ((limb)(tmp[i - 2] >> 64)) >> 39;
320 tmp2[i] += ((limb)(tmp[i - 1])) >> 51;
321 tmp2[i] += (((limb)(tmp[i - 1] >> 64)) << 13) & kBottom52Bits;
322 tmp2[i] += ((limb) tmp[i]) & kBottom52Bits;
323 tmp2[i] += carry;
324 carry = tmp2[i] >> 52;
325 tmp2[i] &= kBottom52Bits;
326 }
327
328 tmp2[9] = ((limb)(tmp[7] >> 64)) >> 39;
329 tmp2[9] += ((limb)(tmp[8])) >> 51;
330 tmp2[9] += (((limb)(tmp[8] >> 64)) << 13);
331 tmp2[9] += carry;
332
333 /* Montgomery elimination of terms.
334 *
335 * Since R is 2**257, we can divide by R with a bitwise shift if we can
336 * ensure that the right-most 257 bits are all zero. We can make that true by
337 * adding multiplies of p without affecting the value.
338 *
339 * So we eliminate limbs from right to left. Since the bottom 51 bits of p
340 * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
341 * We can do that for 8 further limbs and then right shift to eliminate the
342 * extra factor of R. */
343 for (i = 0;; i += 2) {
344 tmp2[i + 1] += tmp2[i] >> 51;
345 x = tmp2[i] & kBottom51Bits;
346 xShiftedMask = NON_ZERO_TO_ALL_ONES(x >> 1);
347 tmp2[i] = 0;
348
349 /* The bounds calculations for this loop are tricky. Each iteration of
350 * the loop eliminates two words by adding values to words to their
351 * right.
352 *
353 * The following table contains the amounts added to each word (as an
354 * offset from the value of i at the top of the loop). The amounts are
355 * accounted for from the first and second half of the loop separately
356 * and are written as, for example, 51 to mean a value <2**51.
357 *
358 * Word: 1 2 3 4 5 6
359 * Added in top half: 52 44 52 37 50
360 * 51
361 * 51
362 * Added in bottom half: 51 45 51 38 50
363 * 52
364 * 52
365 *
366 * The value that is currently offset 5 will be offset 3 for the next
367 * iteration and then offset 1 for the iteration after that. Therefore
368 * the total value added will be the values added at 5, 3 and 1.
369 *
370 * The following table accumulates these values. The sums at the bottom
371 * are written as, for example, 53+45, to mean a value < 2**53+2**45.
372 *
373 * Word: 1 2 3 4 5 6 7 8 9
374 * 52 44 52 37 50 50 50 50 50
375 * 51 45 51 38 37 38 37
376 * 52 51 52 51 52 51
377 * 51 52 51 52 51
378 * 44 52 51 52
379 * 51 45 44
380 * 52
381 * ------------------------------------
382 * 53+ 53+ 54+ 52+ 53+ 52+
383 * 45 44+ 50+ 51+ 52+ 50+
384 * 37 45+ 50+ 50+ 37
385 * 38 44+ 38
386 * 37
387 *
388 * So the greatest amount is added to tmp2[5]. If tmp2[5] has an initial
389 * value of <2**52, then the maximum value will be < 2**54 + 2**52 + 2**50 +
390 * 2**45 + 2**38, which is < 2**64, as required. */
391 tmp2[i + 1] += (x << 45) & kBottom52Bits;
392 tmp2[i + 2] += x >> 7;
393
394 tmp2[i + 3] += (x << 38) & kBottom52Bits;
395 tmp2[i + 4] += x >> 14;
396
397 /* On tmp2[i + 4], when x < 2**1, the subtraction with (x << 18) will not
398 * underflow because it is balanced with the (x << 50) term. On the next
399 * word tmp2[i + 5], terms with (x >> 1) and (x >> 33) are both zero and
400 * there is no underflow either.
401 *
402 * When x >= 2**1, we add 2**51 to tmp2[i + 4] to avoid an underflow.
403 * Removing 1 from tmp2[i + 5] is safe because (x >> 1) - (x >> 33) is
404 * strictly positive.
405 */
406 tmp2[i + 4] += 0x8000000000000 & xShiftedMask;
407 tmp2[i + 5] -= 1 & xShiftedMask;
408
409 tmp2[i + 4] -= (x << 18) & kBottom51Bits;
410 tmp2[i + 4] += (x << 50) & kBottom51Bits;
411 tmp2[i + 5] += (x >> 1) - (x >> 33);
412
413 if (i+1 == NLIMBS)
414 break;
415 tmp2[i + 2] += tmp2[i + 1] >> 52;
416 x = tmp2[i + 1] & kBottom52Bits;
417 xShiftedMask = NON_ZERO_TO_ALL_ONES(x >> 2);
418 tmp2[i + 1] = 0;
419
420 tmp2[i + 2] += (x << 44) & kBottom51Bits;
421 tmp2[i + 3] += x >> 7;
422
423 tmp2[i + 4] += (x << 37) & kBottom51Bits;
424 tmp2[i + 5] += x >> 14;
425
426 /* On tmp2[i + 5], when x < 2**2, the subtraction with (x << 18) will not
427 * underflow because it is balanced with the (x << 50) term. On the next
428 * word tmp2[i + 6], terms with (x >> 2) and (x >> 34) are both zero and
429 * there is no underflow either.
430 *
431 * When x >= 2**2, we add 2**52 to tmp2[i + 5] to avoid an underflow.
432 * Removing 1 from tmp2[i + 6] is safe because (x >> 2) - (x >> 34) is
433 * stricly positive.
434 */
435 tmp2[i + 5] += 0x10000000000000 & xShiftedMask;
436 tmp2[i + 6] -= 1 & xShiftedMask;
437
438 tmp2[i + 5] -= (x << 18) & kBottom52Bits;
439 tmp2[i + 5] += (x << 50) & kBottom52Bits;
440 tmp2[i + 6] += (x >> 2) - (x >> 34);
441 }
442
443 /* We merge the right shift with a carry chain. The words above 2**257 have
444 * widths of 52,51,... which we need to correct when copying them down. */
445 carry = 0;
446 for (i = 0; i < 4; i++) {
447 out[i] = tmp2[i + 5];
448 out[i] += carry;
449 carry = out[i] >> 51;
450 out[i] &= kBottom51Bits;
451
452 i++;
453 out[i] = tmp2[i + 5] << 1;
454 out[i] += carry;
455 carry = out[i] >> 52;
456 out[i] &= kBottom52Bits;
457 }
458
459 out[4] = tmp2[9];
460 out[4] += carry;
461 carry = out[4] >> 51;
462 out[4] &= kBottom51Bits;
463
464 felem_reduce_carry(out, carry);
465 }
466
467 /* felem_square sets out=in*in.
468 *
469 * On entry: in[0,2,...] < 2**52, in[1,3,...] < 2**53.
470 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_square(felem out,const felem in)471 static void felem_square(felem out, const felem in) {
472 u128 tmp[9], x1x1, x3x3;
473
474 x1x1 = ((u128) in[1]) * in[1];
475 x3x3 = ((u128) in[3]) * in[3];
476
477 tmp[0] = ((u128) in[0]) * (in[0] << 0);
478 tmp[1] = ((u128) in[0]) * (in[1] << 1) + ((x1x1 & 1) << 51);
479 tmp[2] = ((u128) in[0]) * (in[2] << 1) + (x1x1 >> 1);
480 tmp[3] = ((u128) in[0]) * (in[3] << 1) +
481 ((u128) in[1]) * (in[2] << 1);
482 tmp[4] = ((u128) in[0]) * (in[4] << 1) +
483 ((u128) in[1]) * (in[3] << 0) +
484 ((u128) in[2]) * (in[2] << 0);
485 tmp[5] = ((u128) in[1]) * (in[4] << 1) +
486 ((u128) in[2]) * (in[3] << 1) + ((x3x3 & 1) << 51);
487 tmp[6] = ((u128) in[2]) * (in[4] << 1) + (x3x3 >> 1);
488 tmp[7] = ((u128) in[3]) * (in[4] << 1);
489 tmp[8] = ((u128) in[4]) * (in[4] << 0);
490
491 felem_reduce_degree(out, tmp);
492 }
493
494 /* felem_mul sets out=in*in2.
495 *
496 * On entry: in[0,2,...] < 2**52, in[1,3,...] < 2**53 and
497 * in2[0,2,...] < 2**52, in2[1,3,...] < 2**53.
498 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_mul(felem out,const felem in,const felem in2)499 static void felem_mul(felem out, const felem in, const felem in2) {
500 u128 tmp[9], x1y1, x1y3, x3y1, x3y3;
501
502 x1y1 = ((u128) in[1]) * in2[1];
503 x1y3 = ((u128) in[1]) * in2[3];
504 x3y1 = ((u128) in[3]) * in2[1];
505 x3y3 = ((u128) in[3]) * in2[3];
506
507 tmp[0] = ((u128) in[0]) * in2[0];
508 tmp[1] = ((u128) in[0]) * in2[1] +
509 ((u128) in[1]) * in2[0] + ((x1y1 & 1) << 51);
510 tmp[2] = ((u128) in[0]) * in2[2] + (x1y1 >> 1) +
511 ((u128) in[2]) * in2[0];
512 tmp[3] = ((u128) in[0]) * in2[3] +
513 ((u128) in[1]) * in2[2] +
514 ((u128) in[2]) * in2[1] + ((x1y3 & 1) << 51) +
515 ((u128) in[3]) * in2[0] + ((x3y1 & 1) << 51);
516 tmp[4] = ((u128) in[0]) * in2[4] + (x1y3 >> 1) +
517 ((u128) in[2]) * in2[2] + (x3y1 >> 1) +
518 ((u128) in[4]) * in2[0];
519 tmp[5] = ((u128) in[1]) * in2[4] +
520 ((u128) in[2]) * in2[3] +
521 ((u128) in[3]) * in2[2] +
522 ((u128) in[4]) * in2[1] + ((x3y3 & 1) << 51);
523 tmp[6] = ((u128) in[2]) * in2[4] + (x3y3 >> 1) +
524 ((u128) in[4]) * in2[2];
525 tmp[7] = ((u128) in[3]) * in2[4] +
526 ((u128) in[4]) * in2[3];
527 tmp[8] = ((u128) in[4]) * in2[4];
528
529 felem_reduce_degree(out, tmp);
530 }
531
felem_assign(felem out,const felem in)532 static void felem_assign(felem out, const felem in) {
533 memcpy(out, in, sizeof(felem));
534 }
535
536 /* felem_scalar_3 sets out=3*out.
537 *
538 * On entry: out[0,2,...] < 2**52, out[1,3,...] < 2**53.
539 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_scalar_3(felem out)540 static void felem_scalar_3(felem out) {
541 limb carry = 0;
542 unsigned i;
543
544 for (i = 0;; i++) {
545 out[i] *= 3;
546 out[i] += carry;
547 carry = out[i] >> 51;
548 out[i] &= kBottom51Bits;
549
550 i++;
551 if (i == NLIMBS)
552 break;
553
554 out[i] *= 3;
555 out[i] += carry;
556 carry = out[i] >> 52;
557 out[i] &= kBottom52Bits;
558 }
559
560 felem_reduce_carry(out, carry);
561 }
562
563 /* felem_scalar_4 sets out=4*out.
564 *
565 * On entry: out[0,2,...] < 2**52, out[1,3,...] < 2**53.
566 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_scalar_4(felem out)567 static void felem_scalar_4(felem out) {
568 limb carry = 0, next_carry;
569 unsigned i;
570
571 for (i = 0;; i++) {
572 next_carry = out[i] >> 49;
573 out[i] <<= 2;
574 out[i] &= kBottom51Bits;
575 out[i] += carry;
576 carry = next_carry + (out[i] >> 51);
577 out[i] &= kBottom51Bits;
578
579 i++;
580 if (i == NLIMBS)
581 break;
582
583 next_carry = out[i] >> 50;
584 out[i] <<= 2;
585 out[i] &= kBottom52Bits;
586 out[i] += carry;
587 carry = next_carry + (out[i] >> 52);
588 out[i] &= kBottom52Bits;
589 }
590
591 felem_reduce_carry(out, carry);
592 }
593
594 /* felem_scalar_8 sets out=8*out.
595 *
596 * On entry: out[0,2,...] < 2**52, out[1,3,...] < 2**53.
597 * On exit: out[0,2,...] < 2**52, out[1,3,...] < 2**53. */
felem_scalar_8(felem out)598 static void felem_scalar_8(felem out) {
599 limb carry = 0, next_carry;
600 unsigned i;
601
602 for (i = 0;; i++) {
603 next_carry = out[i] >> 48;
604 out[i] <<= 3;
605 out[i] &= kBottom51Bits;
606 out[i] += carry;
607 carry = next_carry + (out[i] >> 51);
608 out[i] &= kBottom51Bits;
609
610 i++;
611 if (i == NLIMBS)
612 break;
613
614 next_carry = out[i] >> 49;
615 out[i] <<= 3;
616 out[i] &= kBottom52Bits;
617 out[i] += carry;
618 carry = next_carry + (out[i] >> 52);
619 out[i] &= kBottom52Bits;
620 }
621
622 felem_reduce_carry(out, carry);
623 }
624
625 /* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
626 * time depending on the value of |in|. */
felem_is_zero_vartime(const felem in)627 static char felem_is_zero_vartime(const felem in) {
628 limb carry;
629 int i;
630 limb tmp[NLIMBS];
631
632 felem_assign(tmp, in);
633
634 /* First, reduce tmp to a minimal form. */
635 do {
636 carry = 0;
637 for (i = 0;; i++) {
638 tmp[i] += carry;
639 carry = tmp[i] >> 51;
640 tmp[i] &= kBottom51Bits;
641
642 i++;
643 if (i == NLIMBS)
644 break;
645
646 tmp[i] += carry;
647 carry = tmp[i] >> 52;
648 tmp[i] &= kBottom52Bits;
649 }
650
651 felem_reduce_carry(tmp, carry);
652 } while (carry);
653
654 /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
655 return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
656 memcmp(tmp, kP, sizeof(tmp)) == 0 ||
657 memcmp(tmp, k2P, sizeof(tmp)) == 0;
658 }
659
660
661 /* Montgomery operations: */
662
663 #define kRDigits {2, 0xfffffffe00000000, 0xffffffffffffffff, 0x1fffffffd} // 2^257 mod p256.p
664
665 #define kRInvDigits {0x180000000, 0xffffffff, 0xfffffffe80000001, 0x7fffffff00000001} // 1 / 2^257 mod p256.p
666
667 static const cryptonite_p256_int kR = { kRDigits };
668 static const cryptonite_p256_int kRInv = { kRInvDigits };
669
670 /* to_montgomery sets out = R*in. */
to_montgomery(felem out,const cryptonite_p256_int * in)671 static void to_montgomery(felem out, const cryptonite_p256_int* in) {
672 cryptonite_p256_int in_shifted;
673 int i;
674
675 cryptonite_p256_init(&in_shifted);
676 cryptonite_p256_modmul(&cryptonite_SECP256r1_p, in, 0, &kR, &in_shifted);
677
678 for (i = 0; i < NLIMBS; i++) {
679 if ((i & 1) == 0) {
680 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom51Bits;
681 cryptonite_p256_shr(&in_shifted, 51, &in_shifted);
682 } else {
683 out[i] = P256_DIGIT(&in_shifted, 0) & kBottom52Bits;
684 cryptonite_p256_shr(&in_shifted, 52, &in_shifted);
685 }
686 }
687
688 cryptonite_p256_clear(&in_shifted);
689 }
690
691 /* from_montgomery sets out=in/R. */
from_montgomery(cryptonite_p256_int * out,const felem in)692 static void from_montgomery(cryptonite_p256_int* out, const felem in) {
693 cryptonite_p256_int result, tmp;
694 int i, top;
695
696 cryptonite_p256_init(&result);
697 cryptonite_p256_init(&tmp);
698
699 cryptonite_p256_add_d(&tmp, in[NLIMBS - 1], &result);
700 for (i = NLIMBS - 2; i >= 0; i--) {
701 if ((i & 1) == 0) {
702 top = cryptonite_p256_shl(&result, 51, &tmp);
703 } else {
704 top = cryptonite_p256_shl(&result, 52, &tmp);
705 }
706 top += cryptonite_p256_add_d(&tmp, in[i], &result);
707 }
708
709 cryptonite_p256_modmul(&cryptonite_SECP256r1_p, &kRInv, top, &result, out);
710
711 cryptonite_p256_clear(&result);
712 cryptonite_p256_clear(&tmp);
713 }
714