1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//-go:build !amd64 && !arm64
6// -build !amd64,!arm64
7
8package elliptic
9
10// This file contains a constant-time, 32-bit implementation of P256.
11
12import (
13	"math/big"
14)
15
16type p256Curve struct {
17	*CurveParams
18}
19
20var (
21	p256Params *CurveParams
22
23	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
24	// (2**257).
25	p256RInverse *big.Int
26)
27
28func initP256() {
29	// See FIPS 186-3, section D.2.3
30	p256Params = &CurveParams{Name: "P-256"}
31	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
32	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
33	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
34	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
35	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
36	p256Params.BitSize = 256
37
38	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
39
40	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
41	initP256Arch()
42}
43
44func (curve p256Curve) Params() *CurveParams {
45	return curve.CurveParams
46}
47
48// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
49// to out. If the scalar is equal or greater than the order of the group, it's
50// reduced modulo that order.
51func p256GetScalar(out *[32]byte, in []byte) {
52	n := new(big.Int).SetBytes(in)
53	var scalarBytes []byte
54
55	if n.Cmp(p256Params.N) >= 0 {
56		n.Mod(n, p256Params.N)
57		scalarBytes = n.Bytes()
58	} else {
59		scalarBytes = in
60	}
61
62	for i, v := range scalarBytes {
63		out[len(scalarBytes)-(1+i)] = v
64	}
65}
66
67func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
68	var scalarReversed [32]byte
69	p256GetScalar(&scalarReversed, scalar)
70
71	var x1, y1, z1 [p256Limbs]uint32
72	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
73	return p256ToAffine(&x1, &y1, &z1)
74}
75
76func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
77	var scalarReversed [32]byte
78	p256GetScalar(&scalarReversed, scalar)
79
80	var px, py, x1, y1, z1 [p256Limbs]uint32
81	p256FromBig(&px, bigX)
82	p256FromBig(&py, bigY)
83	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
84	return p256ToAffine(&x1, &y1, &z1)
85}
86
87// Field elements are represented as nine, unsigned 32-bit words.
88//
89// The value of a field element is:
90//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
91//
92// That is, each limb is alternately 29 or 28-bits wide in little-endian
93// order.
94//
95// This means that a field element hits 2**257, rather than 2**256 as we would
96// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
97// problems when multiplying as terms end up one bit short of a limb which
98// would require much bit-shifting to correct.
99//
100// Finally, the values stored in a field element are in Montgomery form. So the
101// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
102// 2**257.
103
104const (
105	p256Limbs    = 9
106	bottom29Bits = 0x1fffffff
107)
108
109var (
110	// p256One is the number 1 as a field element.
111	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
112	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
113	// p256P is the prime modulus as a field element.
114	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
115	// p2562P is the twice prime modulus as a field element.
116	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
117)
118
119// p256Precomputed contains precomputed values to aid the calculation of scalar
120// multiples of the base point, G. It's actually two, equal length, tables
121// concatenated.
122//
123// The first table contains (x,y) field element pairs for 16 multiples of the
124// base point, G.
125//
126//   Index  |  Index (binary) | Value
127//       0  |           0000  | 0G (all zeros, omitted)
128//       1  |           0001  | G
129//       2  |           0010  | 2**64G
130//       3  |           0011  | 2**64G + G
131//       4  |           0100  | 2**128G
132//       5  |           0101  | 2**128G + G
133//       6  |           0110  | 2**128G + 2**64G
134//       7  |           0111  | 2**128G + 2**64G + G
135//       8  |           1000  | 2**192G
136//       9  |           1001  | 2**192G + G
137//      10  |           1010  | 2**192G + 2**64G
138//      11  |           1011  | 2**192G + 2**64G + G
139//      12  |           1100  | 2**192G + 2**128G
140//      13  |           1101  | 2**192G + 2**128G + G
141//      14  |           1110  | 2**192G + 2**128G + 2**64G
142//      15  |           1111  | 2**192G + 2**128G + 2**64G + G
143//
144// The second table follows the same style, but the terms are 2**32G,
145// 2**96G, 2**160G, 2**224G.
146//
147// This is ~2KB of data.
148var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
149	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
150	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
151	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
152	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
153	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
154	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
155	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
156	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
157	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
158	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
159	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
160	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
161	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
162	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
163	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
164	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
165	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
166	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
167	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
168	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
169	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
170	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
171	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
172	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
173	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
174	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
175	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
176	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
177	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
178	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
179	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
180	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
181	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
182	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
183	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
184	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
185	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
186	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
187	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
188	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
189	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
190	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
191	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
192	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
193	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
194	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
195	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
196	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
197	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
198	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
199	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
200	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
201	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
202	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
203	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
204	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
205	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
206	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
207	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
208	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
209}
210
211// Field element operations:
212
213// nonZeroToAllOnes returns:
214//   0xffffffff for 0 < x <= 2**31
215//   0 for x == 0 or x > 2**31.
216func nonZeroToAllOnes(x uint32) uint32 {
217	return ((x - 1) >> 31) - 1
218}
219
220// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
221// which is a term at 2**257.
222//
223// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
224// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
225func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
226	carry_mask := nonZeroToAllOnes(carry)
227
228	inout[0] += carry << 1
229	inout[3] += 0x10000000 & carry_mask
230	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
231	// previous line therefore this doesn't underflow.
232	inout[3] -= carry << 11
233	inout[4] += (0x20000000 - 1) & carry_mask
234	inout[5] += (0x10000000 - 1) & carry_mask
235	inout[6] += (0x20000000 - 1) & carry_mask
236	inout[6] -= carry << 22
237	// This may underflow if carry is non-zero but, if so, we'll fix it in the
238	// next line.
239	inout[7] -= 1 & carry_mask
240	inout[7] += carry << 25
241}
242
243// p256Sum sets out = in+in2.
244//
245// On entry, in[i]+in2[i] must not overflow a 32-bit word.
246// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
247func p256Sum(out, in, in2 *[p256Limbs]uint32) {
248	carry := uint32(0)
249	for i := 0; ; i++ {
250		out[i] = in[i] + in2[i]
251		out[i] += carry
252		carry = out[i] >> 29
253		out[i] &= bottom29Bits
254
255		i++
256		if i == p256Limbs {
257			break
258		}
259
260		out[i] = in[i] + in2[i]
261		out[i] += carry
262		carry = out[i] >> 28
263		out[i] &= bottom28Bits
264	}
265
266	p256ReduceCarry(out, carry)
267}
268
269const (
270	two30m2    = 1<<30 - 1<<2
271	two30p13m2 = 1<<30 + 1<<13 - 1<<2
272	two31m2    = 1<<31 - 1<<2
273	two31p24m2 = 1<<31 + 1<<24 - 1<<2
274	two30m27m2 = 1<<30 - 1<<27 - 1<<2
275)
276
277// p256Zero31 is 0 mod p.
278var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
279
280// p256Diff sets out = in-in2.
281//
282// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
283//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
284// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
285func p256Diff(out, in, in2 *[p256Limbs]uint32) {
286	var carry uint32
287
288	for i := 0; ; i++ {
289		out[i] = in[i] - in2[i]
290		out[i] += p256Zero31[i]
291		out[i] += carry
292		carry = out[i] >> 29
293		out[i] &= bottom29Bits
294
295		i++
296		if i == p256Limbs {
297			break
298		}
299
300		out[i] = in[i] - in2[i]
301		out[i] += p256Zero31[i]
302		out[i] += carry
303		carry = out[i] >> 28
304		out[i] &= bottom28Bits
305	}
306
307	p256ReduceCarry(out, carry)
308}
309
310// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
311// the same 29,28,... bit positions as a field element.
312//
313// The values in field elements are in Montgomery form: x*R mod p where R =
314// 2**257. Since we just multiplied two Montgomery values together, the result
315// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
316// in Montgomery form.
317//
318// On entry: tmp[i] < 2**64
319// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
320func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
321	// The following table may be helpful when reading this code:
322	//
323	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
324	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
325	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
326	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
327	var tmp2 [18]uint32
328	var carry, x, xMask uint32
329
330	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
331	// field element. So the top of an element of tmp might overlap with
332	// another element two positions down. The following loop eliminates
333	// this overlap.
334	tmp2[0] = uint32(tmp[0]) & bottom29Bits
335
336	tmp2[1] = uint32(tmp[0]) >> 29
337	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
338	tmp2[1] += uint32(tmp[1]) & bottom28Bits
339	carry = tmp2[1] >> 28
340	tmp2[1] &= bottom28Bits
341
342	for i := 2; i < 17; i++ {
343		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
344		tmp2[i] += (uint32(tmp[i-1])) >> 28
345		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
346		tmp2[i] += uint32(tmp[i]) & bottom29Bits
347		tmp2[i] += carry
348		carry = tmp2[i] >> 29
349		tmp2[i] &= bottom29Bits
350
351		i++
352		if i == 17 {
353			break
354		}
355		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
356		tmp2[i] += uint32(tmp[i-1]) >> 29
357		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
358		tmp2[i] += uint32(tmp[i]) & bottom28Bits
359		tmp2[i] += carry
360		carry = tmp2[i] >> 28
361		tmp2[i] &= bottom28Bits
362	}
363
364	tmp2[17] = uint32(tmp[15]>>32) >> 25
365	tmp2[17] += uint32(tmp[16]) >> 29
366	tmp2[17] += uint32(tmp[16]>>32) << 3
367	tmp2[17] += carry
368
369	// Montgomery elimination of terms:
370	//
371	// Since R is 2**257, we can divide by R with a bitwise shift if we can
372	// ensure that the right-most 257 bits are all zero. We can make that true
373	// by adding multiplies of p without affecting the value.
374	//
375	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
376	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
377	// We can do that for 8 further limbs and then right shift to eliminate the
378	// extra factor of R.
379	for i := 0; ; i += 2 {
380		tmp2[i+1] += tmp2[i] >> 29
381		x = tmp2[i] & bottom29Bits
382		xMask = nonZeroToAllOnes(x)
383		tmp2[i] = 0
384
385		// The bounds calculations for this loop are tricky. Each iteration of
386		// the loop eliminates two words by adding values to words to their
387		// right.
388		//
389		// The following table contains the amounts added to each word (as an
390		// offset from the value of i at the top of the loop). The amounts are
391		// accounted for from the first and second half of the loop separately
392		// and are written as, for example, 28 to mean a value <2**28.
393		//
394		// Word:                   3   4   5   6   7   8   9   10
395		// Added in top half:     28  11      29  21  29  28
396		//                                        28  29
397		//                                            29
398		// Added in bottom half:      29  10      28  21  28   28
399		//                                            29
400		//
401		// The value that is currently offset 7 will be offset 5 for the next
402		// iteration and then offset 3 for the iteration after that. Therefore
403		// the total value added will be the values added at 7, 5 and 3.
404		//
405		// The following table accumulates these values. The sums at the bottom
406		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
407		//
408		// Word:                   3   4   5   6   7   8   9  10  11  12  13
409		//                        28  11  10  29  21  29  28  28  28  28  28
410		//                            29  28  11  28  29  28  29  28  29  28
411		//                                    29  28  21  21  29  21  29  21
412		//                                        10  29  28  21  28  21  28
413		//                                        28  29  28  29  28  29  28
414		//                                            11  10  29  10  29  10
415		//                                            29  28  11  28  11
416		//                                                    29      29
417		//                        --------------------------------------------
418		//                                                30+ 31+ 30+ 31+ 30+
419		//                                                28+ 29+ 28+ 29+ 21+
420		//                                                21+ 28+ 21+ 28+ 10
421		//                                                10  21+ 10  21+
422		//                                                    11      11
423		//
424		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
425		// tmp2[10/12] has an initial value of <2**29, then the maximum value
426		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
427		// as required.
428		tmp2[i+3] += (x << 10) & bottom28Bits
429		tmp2[i+4] += (x >> 18)
430
431		tmp2[i+6] += (x << 21) & bottom29Bits
432		tmp2[i+7] += x >> 8
433
434		// At position 200, which is the starting bit position for word 7, we
435		// have a factor of 0xf000000 = 2**28 - 2**24.
436		tmp2[i+7] += 0x10000000 & xMask
437		tmp2[i+8] += (x - 1) & xMask
438		tmp2[i+7] -= (x << 24) & bottom28Bits
439		tmp2[i+8] -= x >> 4
440
441		tmp2[i+8] += 0x20000000 & xMask
442		tmp2[i+8] -= x
443		tmp2[i+8] += (x << 28) & bottom29Bits
444		tmp2[i+9] += ((x >> 1) - 1) & xMask
445
446		if i+1 == p256Limbs {
447			break
448		}
449		tmp2[i+2] += tmp2[i+1] >> 28
450		x = tmp2[i+1] & bottom28Bits
451		xMask = nonZeroToAllOnes(x)
452		tmp2[i+1] = 0
453
454		tmp2[i+4] += (x << 11) & bottom29Bits
455		tmp2[i+5] += (x >> 18)
456
457		tmp2[i+7] += (x << 21) & bottom28Bits
458		tmp2[i+8] += x >> 7
459
460		// At position 199, which is the starting bit of the 8th word when
461		// dealing with a context starting on an odd word, we have a factor of
462		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
463		// word from i+1 is i+8.
464		tmp2[i+8] += 0x20000000 & xMask
465		tmp2[i+9] += (x - 1) & xMask
466		tmp2[i+8] -= (x << 25) & bottom29Bits
467		tmp2[i+9] -= x >> 4
468
469		tmp2[i+9] += 0x10000000 & xMask
470		tmp2[i+9] -= x
471		tmp2[i+10] += (x - 1) & xMask
472	}
473
474	// We merge the right shift with a carry chain. The words above 2**257 have
475	// widths of 28,29,... which we need to correct when copying them down.
476	carry = 0
477	for i := 0; i < 8; i++ {
478		// The maximum value of tmp2[i + 9] occurs on the first iteration and
479		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
480		// therefore safe.
481		out[i] = tmp2[i+9]
482		out[i] += carry
483		out[i] += (tmp2[i+10] << 28) & bottom29Bits
484		carry = out[i] >> 29
485		out[i] &= bottom29Bits
486
487		i++
488		out[i] = tmp2[i+9] >> 1
489		out[i] += carry
490		carry = out[i] >> 28
491		out[i] &= bottom28Bits
492	}
493
494	out[8] = tmp2[17]
495	out[8] += carry
496	carry = out[8] >> 29
497	out[8] &= bottom29Bits
498
499	p256ReduceCarry(out, carry)
500}
501
502// p256Square sets out=in*in.
503//
504// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
505// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
506func p256Square(out, in *[p256Limbs]uint32) {
507	var tmp [17]uint64
508
509	tmp[0] = uint64(in[0]) * uint64(in[0])
510	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
511	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
512		uint64(in[1])*(uint64(in[1])<<1)
513	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
514		uint64(in[1])*(uint64(in[2])<<1)
515	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
516		uint64(in[1])*(uint64(in[3])<<2) +
517		uint64(in[2])*uint64(in[2])
518	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
519		uint64(in[1])*(uint64(in[4])<<1) +
520		uint64(in[2])*(uint64(in[3])<<1)
521	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
522		uint64(in[1])*(uint64(in[5])<<2) +
523		uint64(in[2])*(uint64(in[4])<<1) +
524		uint64(in[3])*(uint64(in[3])<<1)
525	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
526		uint64(in[1])*(uint64(in[6])<<1) +
527		uint64(in[2])*(uint64(in[5])<<1) +
528		uint64(in[3])*(uint64(in[4])<<1)
529	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
530	// which is < 2**64 as required.
531	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
532		uint64(in[1])*(uint64(in[7])<<2) +
533		uint64(in[2])*(uint64(in[6])<<1) +
534		uint64(in[3])*(uint64(in[5])<<2) +
535		uint64(in[4])*uint64(in[4])
536	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
537		uint64(in[2])*(uint64(in[7])<<1) +
538		uint64(in[3])*(uint64(in[6])<<1) +
539		uint64(in[4])*(uint64(in[5])<<1)
540	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
541		uint64(in[3])*(uint64(in[7])<<2) +
542		uint64(in[4])*(uint64(in[6])<<1) +
543		uint64(in[5])*(uint64(in[5])<<1)
544	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
545		uint64(in[4])*(uint64(in[7])<<1) +
546		uint64(in[5])*(uint64(in[6])<<1)
547	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
548		uint64(in[5])*(uint64(in[7])<<2) +
549		uint64(in[6])*uint64(in[6])
550	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
551		uint64(in[6])*(uint64(in[7])<<1)
552	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
553		uint64(in[7])*(uint64(in[7])<<1)
554	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
555	tmp[16] = uint64(in[8]) * uint64(in[8])
556
557	p256ReduceDegree(out, tmp)
558}
559
560// p256Mul sets out=in*in2.
561//
562// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
563//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
564// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
565func p256Mul(out, in, in2 *[p256Limbs]uint32) {
566	var tmp [17]uint64
567
568	tmp[0] = uint64(in[0]) * uint64(in2[0])
569	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
570		uint64(in[1])*(uint64(in2[0])<<0)
571	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
572		uint64(in[1])*(uint64(in2[1])<<1) +
573		uint64(in[2])*(uint64(in2[0])<<0)
574	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
575		uint64(in[1])*(uint64(in2[2])<<0) +
576		uint64(in[2])*(uint64(in2[1])<<0) +
577		uint64(in[3])*(uint64(in2[0])<<0)
578	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
579		uint64(in[1])*(uint64(in2[3])<<1) +
580		uint64(in[2])*(uint64(in2[2])<<0) +
581		uint64(in[3])*(uint64(in2[1])<<1) +
582		uint64(in[4])*(uint64(in2[0])<<0)
583	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
584		uint64(in[1])*(uint64(in2[4])<<0) +
585		uint64(in[2])*(uint64(in2[3])<<0) +
586		uint64(in[3])*(uint64(in2[2])<<0) +
587		uint64(in[4])*(uint64(in2[1])<<0) +
588		uint64(in[5])*(uint64(in2[0])<<0)
589	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
590		uint64(in[1])*(uint64(in2[5])<<1) +
591		uint64(in[2])*(uint64(in2[4])<<0) +
592		uint64(in[3])*(uint64(in2[3])<<1) +
593		uint64(in[4])*(uint64(in2[2])<<0) +
594		uint64(in[5])*(uint64(in2[1])<<1) +
595		uint64(in[6])*(uint64(in2[0])<<0)
596	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
597		uint64(in[1])*(uint64(in2[6])<<0) +
598		uint64(in[2])*(uint64(in2[5])<<0) +
599		uint64(in[3])*(uint64(in2[4])<<0) +
600		uint64(in[4])*(uint64(in2[3])<<0) +
601		uint64(in[5])*(uint64(in2[2])<<0) +
602		uint64(in[6])*(uint64(in2[1])<<0) +
603		uint64(in[7])*(uint64(in2[0])<<0)
604	// tmp[8] has the greatest value but doesn't overflow. See logic in
605	// p256Square.
606	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
607		uint64(in[1])*(uint64(in2[7])<<1) +
608		uint64(in[2])*(uint64(in2[6])<<0) +
609		uint64(in[3])*(uint64(in2[5])<<1) +
610		uint64(in[4])*(uint64(in2[4])<<0) +
611		uint64(in[5])*(uint64(in2[3])<<1) +
612		uint64(in[6])*(uint64(in2[2])<<0) +
613		uint64(in[7])*(uint64(in2[1])<<1) +
614		uint64(in[8])*(uint64(in2[0])<<0)
615	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
616		uint64(in[2])*(uint64(in2[7])<<0) +
617		uint64(in[3])*(uint64(in2[6])<<0) +
618		uint64(in[4])*(uint64(in2[5])<<0) +
619		uint64(in[5])*(uint64(in2[4])<<0) +
620		uint64(in[6])*(uint64(in2[3])<<0) +
621		uint64(in[7])*(uint64(in2[2])<<0) +
622		uint64(in[8])*(uint64(in2[1])<<0)
623	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
624		uint64(in[3])*(uint64(in2[7])<<1) +
625		uint64(in[4])*(uint64(in2[6])<<0) +
626		uint64(in[5])*(uint64(in2[5])<<1) +
627		uint64(in[6])*(uint64(in2[4])<<0) +
628		uint64(in[7])*(uint64(in2[3])<<1) +
629		uint64(in[8])*(uint64(in2[2])<<0)
630	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
631		uint64(in[4])*(uint64(in2[7])<<0) +
632		uint64(in[5])*(uint64(in2[6])<<0) +
633		uint64(in[6])*(uint64(in2[5])<<0) +
634		uint64(in[7])*(uint64(in2[4])<<0) +
635		uint64(in[8])*(uint64(in2[3])<<0)
636	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
637		uint64(in[5])*(uint64(in2[7])<<1) +
638		uint64(in[6])*(uint64(in2[6])<<0) +
639		uint64(in[7])*(uint64(in2[5])<<1) +
640		uint64(in[8])*(uint64(in2[4])<<0)
641	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
642		uint64(in[6])*(uint64(in2[7])<<0) +
643		uint64(in[7])*(uint64(in2[6])<<0) +
644		uint64(in[8])*(uint64(in2[5])<<0)
645	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
646		uint64(in[7])*(uint64(in2[7])<<1) +
647		uint64(in[8])*(uint64(in2[6])<<0)
648	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
649		uint64(in[8])*(uint64(in2[7])<<0)
650	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
651
652	p256ReduceDegree(out, tmp)
653}
654
655func p256Assign(out, in *[p256Limbs]uint32) {
656	*out = *in
657}
658
659// p256Invert calculates |out| = |in|^{-1}
660//
661// Based on Fermat's Little Theorem:
662//   a^p = a (mod p)
663//   a^{p-1} = 1 (mod p)
664//   a^{p-2} = a^{-1} (mod p)
665func p256Invert(out, in *[p256Limbs]uint32) {
666	var ftmp, ftmp2 [p256Limbs]uint32
667
668	// each e_I will hold |in|^{2^I - 1}
669	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
670
671	p256Square(&ftmp, in)     // 2^1
672	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
673	p256Assign(&e2, &ftmp)
674	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
675	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
676	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
677	p256Assign(&e4, &ftmp)
678	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
679	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
680	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
681	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
682	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
683	p256Assign(&e8, &ftmp)
684	for i := 0; i < 8; i++ {
685		p256Square(&ftmp, &ftmp)
686	} // 2^16 - 2^8
687	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
688	p256Assign(&e16, &ftmp)
689	for i := 0; i < 16; i++ {
690		p256Square(&ftmp, &ftmp)
691	} // 2^32 - 2^16
692	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
693	p256Assign(&e32, &ftmp)
694	for i := 0; i < 32; i++ {
695		p256Square(&ftmp, &ftmp)
696	} // 2^64 - 2^32
697	p256Assign(&e64, &ftmp)
698	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
699	for i := 0; i < 192; i++ {
700		p256Square(&ftmp, &ftmp)
701	} // 2^256 - 2^224 + 2^192
702
703	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
704	for i := 0; i < 16; i++ {
705		p256Square(&ftmp2, &ftmp2)
706	} // 2^80 - 2^16
707	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
708	for i := 0; i < 8; i++ {
709		p256Square(&ftmp2, &ftmp2)
710	} // 2^88 - 2^8
711	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
712	for i := 0; i < 4; i++ {
713		p256Square(&ftmp2, &ftmp2)
714	} // 2^92 - 2^4
715	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
716	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
717	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
718	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
719	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
720	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
721	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
722
723	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
724}
725
726// p256Scalar3 sets out=3*out.
727//
728// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
729// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
730func p256Scalar3(out *[p256Limbs]uint32) {
731	var carry uint32
732
733	for i := 0; ; i++ {
734		out[i] *= 3
735		out[i] += carry
736		carry = out[i] >> 29
737		out[i] &= bottom29Bits
738
739		i++
740		if i == p256Limbs {
741			break
742		}
743
744		out[i] *= 3
745		out[i] += carry
746		carry = out[i] >> 28
747		out[i] &= bottom28Bits
748	}
749
750	p256ReduceCarry(out, carry)
751}
752
753// p256Scalar4 sets out=4*out.
754//
755// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
756// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
757func p256Scalar4(out *[p256Limbs]uint32) {
758	var carry, nextCarry uint32
759
760	for i := 0; ; i++ {
761		nextCarry = out[i] >> 27
762		out[i] <<= 2
763		out[i] &= bottom29Bits
764		out[i] += carry
765		carry = nextCarry + (out[i] >> 29)
766		out[i] &= bottom29Bits
767
768		i++
769		if i == p256Limbs {
770			break
771		}
772		nextCarry = out[i] >> 26
773		out[i] <<= 2
774		out[i] &= bottom28Bits
775		out[i] += carry
776		carry = nextCarry + (out[i] >> 28)
777		out[i] &= bottom28Bits
778	}
779
780	p256ReduceCarry(out, carry)
781}
782
783// p256Scalar8 sets out=8*out.
784//
785// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
786// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
787func p256Scalar8(out *[p256Limbs]uint32) {
788	var carry, nextCarry uint32
789
790	for i := 0; ; i++ {
791		nextCarry = out[i] >> 26
792		out[i] <<= 3
793		out[i] &= bottom29Bits
794		out[i] += carry
795		carry = nextCarry + (out[i] >> 29)
796		out[i] &= bottom29Bits
797
798		i++
799		if i == p256Limbs {
800			break
801		}
802		nextCarry = out[i] >> 25
803		out[i] <<= 3
804		out[i] &= bottom28Bits
805		out[i] += carry
806		carry = nextCarry + (out[i] >> 28)
807		out[i] &= bottom28Bits
808	}
809
810	p256ReduceCarry(out, carry)
811}
812
813// Group operations:
814//
815// Elements of the elliptic curve group are represented in Jacobian
816// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
817// Jacobian form.
818
819// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
820//
821// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
822func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
823	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
824
825	p256Square(&delta, z)
826	p256Square(&gamma, y)
827	p256Mul(&beta, x, &gamma)
828
829	p256Sum(&tmp, x, &delta)
830	p256Diff(&tmp2, x, &delta)
831	p256Mul(&alpha, &tmp, &tmp2)
832	p256Scalar3(&alpha)
833
834	p256Sum(&tmp, y, z)
835	p256Square(&tmp, &tmp)
836	p256Diff(&tmp, &tmp, &gamma)
837	p256Diff(zOut, &tmp, &delta)
838
839	p256Scalar4(&beta)
840	p256Square(xOut, &alpha)
841	p256Diff(xOut, xOut, &beta)
842	p256Diff(xOut, xOut, &beta)
843
844	p256Diff(&tmp, &beta, xOut)
845	p256Mul(&tmp, &alpha, &tmp)
846	p256Square(&tmp2, &gamma)
847	p256Scalar8(&tmp2)
848	p256Diff(yOut, &tmp, &tmp2)
849}
850
851// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
852// (i.e. the second point is affine.)
853//
854// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
855//
856// Note that this function does not handle P+P, infinity+P nor P+infinity
857// correctly.
858func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
859	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
860
861	p256Square(&z1z1, z1)
862	p256Sum(&tmp, z1, z1)
863
864	p256Mul(&u2, x2, &z1z1)
865	p256Mul(&z1z1z1, z1, &z1z1)
866	p256Mul(&s2, y2, &z1z1z1)
867	p256Diff(&h, &u2, x1)
868	p256Sum(&i, &h, &h)
869	p256Square(&i, &i)
870	p256Mul(&j, &h, &i)
871	p256Diff(&r, &s2, y1)
872	p256Sum(&r, &r, &r)
873	p256Mul(&v, x1, &i)
874
875	p256Mul(zOut, &tmp, &h)
876	p256Square(&rr, &r)
877	p256Diff(xOut, &rr, &j)
878	p256Diff(xOut, xOut, &v)
879	p256Diff(xOut, xOut, &v)
880
881	p256Diff(&tmp, &v, xOut)
882	p256Mul(yOut, &tmp, &r)
883	p256Mul(&tmp, y1, &j)
884	p256Diff(yOut, yOut, &tmp)
885	p256Diff(yOut, yOut, &tmp)
886}
887
888// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
889//
890// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
891//
892// Note that this function does not handle P+P, infinity+P nor P+infinity
893// correctly.
894func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
895	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
896
897	p256Square(&z1z1, z1)
898	p256Square(&z2z2, z2)
899	p256Mul(&u1, x1, &z2z2)
900
901	p256Sum(&tmp, z1, z2)
902	p256Square(&tmp, &tmp)
903	p256Diff(&tmp, &tmp, &z1z1)
904	p256Diff(&tmp, &tmp, &z2z2)
905
906	p256Mul(&z2z2z2, z2, &z2z2)
907	p256Mul(&s1, y1, &z2z2z2)
908
909	p256Mul(&u2, x2, &z1z1)
910	p256Mul(&z1z1z1, z1, &z1z1)
911	p256Mul(&s2, y2, &z1z1z1)
912	p256Diff(&h, &u2, &u1)
913	p256Sum(&i, &h, &h)
914	p256Square(&i, &i)
915	p256Mul(&j, &h, &i)
916	p256Diff(&r, &s2, &s1)
917	p256Sum(&r, &r, &r)
918	p256Mul(&v, &u1, &i)
919
920	p256Mul(zOut, &tmp, &h)
921	p256Square(&rr, &r)
922	p256Diff(xOut, &rr, &j)
923	p256Diff(xOut, xOut, &v)
924	p256Diff(xOut, xOut, &v)
925
926	p256Diff(&tmp, &v, xOut)
927	p256Mul(yOut, &tmp, &r)
928	p256Mul(&tmp, &s1, &j)
929	p256Diff(yOut, yOut, &tmp)
930	p256Diff(yOut, yOut, &tmp)
931}
932
933// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
934//
935// On entry: mask is either 0 or 0xffffffff.
936func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
937	for i := 0; i < p256Limbs; i++ {
938		tmp := mask & (in[i] ^ out[i])
939		out[i] ^= tmp
940	}
941}
942
943// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
944// On entry: index < 16, table[0] must be zero.
945func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
946	for i := range xOut {
947		xOut[i] = 0
948	}
949	for i := range yOut {
950		yOut[i] = 0
951	}
952
953	for i := uint32(1); i < 16; i++ {
954		mask := i ^ index
955		mask |= mask >> 2
956		mask |= mask >> 1
957		mask &= 1
958		mask--
959		for j := range xOut {
960			xOut[j] |= table[0] & mask
961			table = table[1:]
962		}
963		for j := range yOut {
964			yOut[j] |= table[0] & mask
965			table = table[1:]
966		}
967	}
968}
969
970// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
971// table.
972// On entry: index < 16, table[0] must be zero.
973func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
974	for i := range xOut {
975		xOut[i] = 0
976	}
977	for i := range yOut {
978		yOut[i] = 0
979	}
980	for i := range zOut {
981		zOut[i] = 0
982	}
983
984	// The implicit value at index 0 is all zero. We don't need to perform that
985	// iteration of the loop because we already set out_* to zero.
986	for i := uint32(1); i < 16; i++ {
987		mask := i ^ index
988		mask |= mask >> 2
989		mask |= mask >> 1
990		mask &= 1
991		mask--
992		for j := range xOut {
993			xOut[j] |= table[i][0][j] & mask
994		}
995		for j := range yOut {
996			yOut[j] |= table[i][1][j] & mask
997		}
998		for j := range zOut {
999			zOut[j] |= table[i][2][j] & mask
1000		}
1001	}
1002}
1003
1004// p256GetBit returns the bit'th bit of scalar.
1005func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
1006	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
1007}
1008
1009// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
1010// little-endian number. Note that the value of scalar must be less than the
1011// order of the group.
1012func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
1013	nIsInfinityMask := ^uint32(0)
1014	var pIsNoninfiniteMask, mask, tableOffset uint32
1015	var px, py, tx, ty, tz [p256Limbs]uint32
1016
1017	for i := range xOut {
1018		xOut[i] = 0
1019	}
1020	for i := range yOut {
1021		yOut[i] = 0
1022	}
1023	for i := range zOut {
1024		zOut[i] = 0
1025	}
1026
1027	// The loop adds bits at positions 0, 64, 128 and 192, followed by
1028	// positions 32,96,160 and 224 and does this 32 times.
1029	for i := uint(0); i < 32; i++ {
1030		if i != 0 {
1031			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1032		}
1033		tableOffset = 0
1034		for j := uint(0); j <= 32; j += 32 {
1035			bit0 := p256GetBit(scalar, 31-i+j)
1036			bit1 := p256GetBit(scalar, 95-i+j)
1037			bit2 := p256GetBit(scalar, 159-i+j)
1038			bit3 := p256GetBit(scalar, 223-i+j)
1039			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
1040
1041			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
1042			tableOffset += 30 * p256Limbs
1043
1044			// Since scalar is less than the order of the group, we know that
1045			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
1046			// below.
1047			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
1048			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
1049			// (a.k.a.  the point at infinity). We handle that situation by
1050			// copying the point from the table.
1051			p256CopyConditional(xOut, &px, nIsInfinityMask)
1052			p256CopyConditional(yOut, &py, nIsInfinityMask)
1053			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
1054
1055			// Equally, the result is also wrong if the point from the table is
1056			// zero, which happens when the index is zero. We handle that by
1057			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
1058			pIsNoninfiniteMask = nonZeroToAllOnes(index)
1059			mask = pIsNoninfiniteMask & ^nIsInfinityMask
1060			p256CopyConditional(xOut, &tx, mask)
1061			p256CopyConditional(yOut, &ty, mask)
1062			p256CopyConditional(zOut, &tz, mask)
1063			// If p was not zero, then n is now non-zero.
1064			nIsInfinityMask &^= pIsNoninfiniteMask
1065		}
1066	}
1067}
1068
1069// p256PointToAffine converts a Jacobian point to an affine point. If the input
1070// is the point at infinity then it returns (0, 0) in constant time.
1071func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
1072	var zInv, zInvSq [p256Limbs]uint32
1073
1074	p256Invert(&zInv, z)
1075	p256Square(&zInvSq, &zInv)
1076	p256Mul(xOut, x, &zInvSq)
1077	p256Mul(&zInv, &zInv, &zInvSq)
1078	p256Mul(yOut, y, &zInv)
1079}
1080
1081// p256ToAffine returns a pair of *big.Int containing the affine representation
1082// of {x,y,z}.
1083func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
1084	var xx, yy [p256Limbs]uint32
1085	p256PointToAffine(&xx, &yy, x, y, z)
1086	return p256ToBig(&xx), p256ToBig(&yy)
1087}
1088
1089// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
1090func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
1091	var px, py, pz, tx, ty, tz [p256Limbs]uint32
1092	var precomp [16][3][p256Limbs]uint32
1093	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
1094
1095	// We precompute 0,1,2,... times {x,y}.
1096	precomp[1][0] = *x
1097	precomp[1][1] = *y
1098	precomp[1][2] = p256One
1099
1100	for i := 2; i < 16; i += 2 {
1101		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
1102		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
1103	}
1104
1105	for i := range xOut {
1106		xOut[i] = 0
1107	}
1108	for i := range yOut {
1109		yOut[i] = 0
1110	}
1111	for i := range zOut {
1112		zOut[i] = 0
1113	}
1114	nIsInfinityMask = ^uint32(0)
1115
1116	// We add in a window of four bits each iteration and do this 64 times.
1117	for i := 0; i < 64; i++ {
1118		if i != 0 {
1119			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1120			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1121			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1122			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1123		}
1124
1125		index = uint32(scalar[31-i/2])
1126		if (i & 1) == 1 {
1127			index &= 15
1128		} else {
1129			index >>= 4
1130		}
1131
1132		// See the comments in scalarBaseMult about handling infinities.
1133		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
1134		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
1135		p256CopyConditional(xOut, &px, nIsInfinityMask)
1136		p256CopyConditional(yOut, &py, nIsInfinityMask)
1137		p256CopyConditional(zOut, &pz, nIsInfinityMask)
1138
1139		pIsNoninfiniteMask = nonZeroToAllOnes(index)
1140		mask = pIsNoninfiniteMask & ^nIsInfinityMask
1141		p256CopyConditional(xOut, &tx, mask)
1142		p256CopyConditional(yOut, &ty, mask)
1143		p256CopyConditional(zOut, &tz, mask)
1144		nIsInfinityMask &^= pIsNoninfiniteMask
1145	}
1146}
1147
1148// p256FromBig sets out = R*in.
1149func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
1150	tmp := new(big.Int).Lsh(in, 257)
1151	tmp.Mod(tmp, p256Params.P)
1152
1153	for i := 0; i < p256Limbs; i++ {
1154		if bits := tmp.Bits(); len(bits) > 0 {
1155			out[i] = uint32(bits[0]) & bottom29Bits
1156		} else {
1157			out[i] = 0
1158		}
1159		tmp.Rsh(tmp, 29)
1160
1161		i++
1162		if i == p256Limbs {
1163			break
1164		}
1165
1166		if bits := tmp.Bits(); len(bits) > 0 {
1167			out[i] = uint32(bits[0]) & bottom28Bits
1168		} else {
1169			out[i] = 0
1170		}
1171		tmp.Rsh(tmp, 28)
1172	}
1173}
1174
1175// p256ToBig returns a *big.Int containing the value of in.
1176func p256ToBig(in *[p256Limbs]uint32) *big.Int {
1177	result, tmp := new(big.Int), new(big.Int)
1178
1179	result.SetInt64(int64(in[p256Limbs-1]))
1180	for i := p256Limbs - 2; i >= 0; i-- {
1181		if (i & 1) == 0 {
1182			result.Lsh(result, 29)
1183		} else {
1184			result.Lsh(result, 28)
1185		}
1186		tmp.SetInt64(int64(in[i]))
1187		result.Add(result, tmp)
1188	}
1189
1190	result.Mul(result, p256RInverse)
1191	result.Mod(result, p256Params.P)
1192	return result
1193}
1194