1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !amd64,!arm64
6
7package elliptic
8
9// This file contains a constant-time, 32-bit implementation of P256.
10
11import (
12	"math/big"
13)
14
15type p256Curve struct {
16	*CurveParams
17}
18
19var (
20	p256Params *CurveParams
21
22	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
23	// (2**257).
24	p256RInverse *big.Int
25)
26
27func initP256() {
28	// See FIPS 186-3, section D.2.3
29	p256Params = &CurveParams{Name: "P-256"}
30	p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
31	p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
32	p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
33	p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
34	p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
35	p256Params.BitSize = 256
36
37	p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
38
39	// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
40	initP256Arch()
41}
42
43func (curve p256Curve) Params() *CurveParams {
44	return curve.CurveParams
45}
46
47// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
48// to out. If the scalar is equal or greater than the order of the group, it's
49// reduced modulo that order.
50func p256GetScalar(out *[32]byte, in []byte) {
51	n := new(big.Int).SetBytes(in)
52	var scalarBytes []byte
53
54	if n.Cmp(p256Params.N) >= 0 {
55		n.Mod(n, p256Params.N)
56		scalarBytes = n.Bytes()
57	} else {
58		scalarBytes = in
59	}
60
61	for i, v := range scalarBytes {
62		out[len(scalarBytes)-(1+i)] = v
63	}
64}
65
66func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
67	var scalarReversed [32]byte
68	p256GetScalar(&scalarReversed, scalar)
69
70	var x1, y1, z1 [p256Limbs]uint32
71	p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
72	return p256ToAffine(&x1, &y1, &z1)
73}
74
75func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
76	var scalarReversed [32]byte
77	p256GetScalar(&scalarReversed, scalar)
78
79	var px, py, x1, y1, z1 [p256Limbs]uint32
80	p256FromBig(&px, bigX)
81	p256FromBig(&py, bigY)
82	p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
83	return p256ToAffine(&x1, &y1, &z1)
84}
85
86// Field elements are represented as nine, unsigned 32-bit words.
87//
88// The value of a field element is:
89//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
90//
91// That is, each limb is alternately 29 or 28-bits wide in little-endian
92// order.
93//
94// This means that a field element hits 2**257, rather than 2**256 as we would
95// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
96// problems when multiplying as terms end up one bit short of a limb which
97// would require much bit-shifting to correct.
98//
99// Finally, the values stored in a field element are in Montgomery form. So the
100// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
101// 2**257.
102
103const (
104	p256Limbs    = 9
105	bottom29Bits = 0x1fffffff
106)
107
108var (
109	// p256One is the number 1 as a field element.
110	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
111	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
112	// p256P is the prime modulus as a field element.
113	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
114	// p2562P is the twice prime modulus as a field element.
115	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
116)
117
118// p256Precomputed contains precomputed values to aid the calculation of scalar
119// multiples of the base point, G. It's actually two, equal length, tables
120// concatenated.
121//
122// The first table contains (x,y) field element pairs for 16 multiples of the
123// base point, G.
124//
125//   Index  |  Index (binary) | Value
126//       0  |           0000  | 0G (all zeros, omitted)
127//       1  |           0001  | G
128//       2  |           0010  | 2**64G
129//       3  |           0011  | 2**64G + G
130//       4  |           0100  | 2**128G
131//       5  |           0101  | 2**128G + G
132//       6  |           0110  | 2**128G + 2**64G
133//       7  |           0111  | 2**128G + 2**64G + G
134//       8  |           1000  | 2**192G
135//       9  |           1001  | 2**192G + G
136//      10  |           1010  | 2**192G + 2**64G
137//      11  |           1011  | 2**192G + 2**64G + G
138//      12  |           1100  | 2**192G + 2**128G
139//      13  |           1101  | 2**192G + 2**128G + G
140//      14  |           1110  | 2**192G + 2**128G + 2**64G
141//      15  |           1111  | 2**192G + 2**128G + 2**64G + G
142//
143// The second table follows the same style, but the terms are 2**32G,
144// 2**96G, 2**160G, 2**224G.
145//
146// This is ~2KB of data.
147var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
148	0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
149	0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
150	0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
151	0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
152	0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
153	0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
154	0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
155	0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
156	0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
157	0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
158	0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
159	0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
160	0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
161	0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
162	0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
163	0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
164	0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
165	0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
166	0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
167	0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
168	0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
169	0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
170	0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
171	0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
172	0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
173	0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
174	0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
175	0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
176	0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
177	0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
178	0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
179	0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
180	0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
181	0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
182	0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
183	0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
184	0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
185	0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
186	0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
187	0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
188	0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
189	0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
190	0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
191	0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
192	0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
193	0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
194	0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
195	0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
196	0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
197	0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
198	0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
199	0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
200	0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
201	0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
202	0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
203	0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
204	0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
205	0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
206	0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
207	0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
208}
209
210// Field element operations:
211
212// nonZeroToAllOnes returns:
213//   0xffffffff for 0 < x <= 2**31
214//   0 for x == 0 or x > 2**31.
215func nonZeroToAllOnes(x uint32) uint32 {
216	return ((x - 1) >> 31) - 1
217}
218
219// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
220// which is a term at 2**257.
221//
222// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
223// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
224func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
225	carry_mask := nonZeroToAllOnes(carry)
226
227	inout[0] += carry << 1
228	inout[3] += 0x10000000 & carry_mask
229	// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
230	// previous line therefore this doesn't underflow.
231	inout[3] -= carry << 11
232	inout[4] += (0x20000000 - 1) & carry_mask
233	inout[5] += (0x10000000 - 1) & carry_mask
234	inout[6] += (0x20000000 - 1) & carry_mask
235	inout[6] -= carry << 22
236	// This may underflow if carry is non-zero but, if so, we'll fix it in the
237	// next line.
238	inout[7] -= 1 & carry_mask
239	inout[7] += carry << 25
240}
241
242// p256Sum sets out = in+in2.
243//
244// On entry, in[i]+in2[i] must not overflow a 32-bit word.
245// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
246func p256Sum(out, in, in2 *[p256Limbs]uint32) {
247	carry := uint32(0)
248	for i := 0; ; i++ {
249		out[i] = in[i] + in2[i]
250		out[i] += carry
251		carry = out[i] >> 29
252		out[i] &= bottom29Bits
253
254		i++
255		if i == p256Limbs {
256			break
257		}
258
259		out[i] = in[i] + in2[i]
260		out[i] += carry
261		carry = out[i] >> 28
262		out[i] &= bottom28Bits
263	}
264
265	p256ReduceCarry(out, carry)
266}
267
268const (
269	two30m2    = 1<<30 - 1<<2
270	two30p13m2 = 1<<30 + 1<<13 - 1<<2
271	two31m2    = 1<<31 - 1<<2
272	two31p24m2 = 1<<31 + 1<<24 - 1<<2
273	two30m27m2 = 1<<30 - 1<<27 - 1<<2
274)
275
276// p256Zero31 is 0 mod p.
277var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
278
279// p256Diff sets out = in-in2.
280//
281// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
282//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
283// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
284func p256Diff(out, in, in2 *[p256Limbs]uint32) {
285	var carry uint32
286
287	for i := 0; ; i++ {
288		out[i] = in[i] - in2[i]
289		out[i] += p256Zero31[i]
290		out[i] += carry
291		carry = out[i] >> 29
292		out[i] &= bottom29Bits
293
294		i++
295		if i == p256Limbs {
296			break
297		}
298
299		out[i] = in[i] - in2[i]
300		out[i] += p256Zero31[i]
301		out[i] += carry
302		carry = out[i] >> 28
303		out[i] &= bottom28Bits
304	}
305
306	p256ReduceCarry(out, carry)
307}
308
309// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
310// the same 29,28,... bit positions as a field element.
311//
312// The values in field elements are in Montgomery form: x*R mod p where R =
313// 2**257. Since we just multiplied two Montgomery values together, the result
314// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
315// in Montgomery form.
316//
317// On entry: tmp[i] < 2**64
318// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
319func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
320	// The following table may be helpful when reading this code:
321	//
322	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
323	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
324	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
325	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
326	var tmp2 [18]uint32
327	var carry, x, xMask uint32
328
329	// tmp contains 64-bit words with the same 29,28,29-bit positions as an
330	// field element. So the top of an element of tmp might overlap with
331	// another element two positions down. The following loop eliminates
332	// this overlap.
333	tmp2[0] = uint32(tmp[0]) & bottom29Bits
334
335	tmp2[1] = uint32(tmp[0]) >> 29
336	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
337	tmp2[1] += uint32(tmp[1]) & bottom28Bits
338	carry = tmp2[1] >> 28
339	tmp2[1] &= bottom28Bits
340
341	for i := 2; i < 17; i++ {
342		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
343		tmp2[i] += (uint32(tmp[i-1])) >> 28
344		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
345		tmp2[i] += uint32(tmp[i]) & bottom29Bits
346		tmp2[i] += carry
347		carry = tmp2[i] >> 29
348		tmp2[i] &= bottom29Bits
349
350		i++
351		if i == 17 {
352			break
353		}
354		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
355		tmp2[i] += uint32(tmp[i-1]) >> 29
356		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
357		tmp2[i] += uint32(tmp[i]) & bottom28Bits
358		tmp2[i] += carry
359		carry = tmp2[i] >> 28
360		tmp2[i] &= bottom28Bits
361	}
362
363	tmp2[17] = uint32(tmp[15]>>32) >> 25
364	tmp2[17] += uint32(tmp[16]) >> 29
365	tmp2[17] += uint32(tmp[16]>>32) << 3
366	tmp2[17] += carry
367
368	// Montgomery elimination of terms:
369	//
370	// Since R is 2**257, we can divide by R with a bitwise shift if we can
371	// ensure that the right-most 257 bits are all zero. We can make that true
372	// by adding multiplies of p without affecting the value.
373	//
374	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
375	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
376	// We can do that for 8 further limbs and then right shift to eliminate the
377	// extra factor of R.
378	for i := 0; ; i += 2 {
379		tmp2[i+1] += tmp2[i] >> 29
380		x = tmp2[i] & bottom29Bits
381		xMask = nonZeroToAllOnes(x)
382		tmp2[i] = 0
383
384		// The bounds calculations for this loop are tricky. Each iteration of
385		// the loop eliminates two words by adding values to words to their
386		// right.
387		//
388		// The following table contains the amounts added to each word (as an
389		// offset from the value of i at the top of the loop). The amounts are
390		// accounted for from the first and second half of the loop separately
391		// and are written as, for example, 28 to mean a value <2**28.
392		//
393		// Word:                   3   4   5   6   7   8   9   10
394		// Added in top half:     28  11      29  21  29  28
395		//                                        28  29
396		//                                            29
397		// Added in bottom half:      29  10      28  21  28   28
398		//                                            29
399		//
400		// The value that is currently offset 7 will be offset 5 for the next
401		// iteration and then offset 3 for the iteration after that. Therefore
402		// the total value added will be the values added at 7, 5 and 3.
403		//
404		// The following table accumulates these values. The sums at the bottom
405		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
406		//
407		// Word:                   3   4   5   6   7   8   9  10  11  12  13
408		//                        28  11  10  29  21  29  28  28  28  28  28
409		//                            29  28  11  28  29  28  29  28  29  28
410		//                                    29  28  21  21  29  21  29  21
411		//                                        10  29  28  21  28  21  28
412		//                                        28  29  28  29  28  29  28
413		//                                            11  10  29  10  29  10
414		//                                            29  28  11  28  11
415		//                                                    29      29
416		//                        --------------------------------------------
417		//                                                30+ 31+ 30+ 31+ 30+
418		//                                                28+ 29+ 28+ 29+ 21+
419		//                                                21+ 28+ 21+ 28+ 10
420		//                                                10  21+ 10  21+
421		//                                                    11      11
422		//
423		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
424		// tmp2[10/12] has an initial value of <2**29, then the maximum value
425		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
426		// as required.
427		tmp2[i+3] += (x << 10) & bottom28Bits
428		tmp2[i+4] += (x >> 18)
429
430		tmp2[i+6] += (x << 21) & bottom29Bits
431		tmp2[i+7] += x >> 8
432
433		// At position 200, which is the starting bit position for word 7, we
434		// have a factor of 0xf000000 = 2**28 - 2**24.
435		tmp2[i+7] += 0x10000000 & xMask
436		tmp2[i+8] += (x - 1) & xMask
437		tmp2[i+7] -= (x << 24) & bottom28Bits
438		tmp2[i+8] -= x >> 4
439
440		tmp2[i+8] += 0x20000000 & xMask
441		tmp2[i+8] -= x
442		tmp2[i+8] += (x << 28) & bottom29Bits
443		tmp2[i+9] += ((x >> 1) - 1) & xMask
444
445		if i+1 == p256Limbs {
446			break
447		}
448		tmp2[i+2] += tmp2[i+1] >> 28
449		x = tmp2[i+1] & bottom28Bits
450		xMask = nonZeroToAllOnes(x)
451		tmp2[i+1] = 0
452
453		tmp2[i+4] += (x << 11) & bottom29Bits
454		tmp2[i+5] += (x >> 18)
455
456		tmp2[i+7] += (x << 21) & bottom28Bits
457		tmp2[i+8] += x >> 7
458
459		// At position 199, which is the starting bit of the 8th word when
460		// dealing with a context starting on an odd word, we have a factor of
461		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
462		// word from i+1 is i+8.
463		tmp2[i+8] += 0x20000000 & xMask
464		tmp2[i+9] += (x - 1) & xMask
465		tmp2[i+8] -= (x << 25) & bottom29Bits
466		tmp2[i+9] -= x >> 4
467
468		tmp2[i+9] += 0x10000000 & xMask
469		tmp2[i+9] -= x
470		tmp2[i+10] += (x - 1) & xMask
471	}
472
473	// We merge the right shift with a carry chain. The words above 2**257 have
474	// widths of 28,29,... which we need to correct when copying them down.
475	carry = 0
476	for i := 0; i < 8; i++ {
477		// The maximum value of tmp2[i + 9] occurs on the first iteration and
478		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
479		// therefore safe.
480		out[i] = tmp2[i+9]
481		out[i] += carry
482		out[i] += (tmp2[i+10] << 28) & bottom29Bits
483		carry = out[i] >> 29
484		out[i] &= bottom29Bits
485
486		i++
487		out[i] = tmp2[i+9] >> 1
488		out[i] += carry
489		carry = out[i] >> 28
490		out[i] &= bottom28Bits
491	}
492
493	out[8] = tmp2[17]
494	out[8] += carry
495	carry = out[8] >> 29
496	out[8] &= bottom29Bits
497
498	p256ReduceCarry(out, carry)
499}
500
501// p256Square sets out=in*in.
502//
503// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
504// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
505func p256Square(out, in *[p256Limbs]uint32) {
506	var tmp [17]uint64
507
508	tmp[0] = uint64(in[0]) * uint64(in[0])
509	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
510	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
511		uint64(in[1])*(uint64(in[1])<<1)
512	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
513		uint64(in[1])*(uint64(in[2])<<1)
514	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
515		uint64(in[1])*(uint64(in[3])<<2) +
516		uint64(in[2])*uint64(in[2])
517	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
518		uint64(in[1])*(uint64(in[4])<<1) +
519		uint64(in[2])*(uint64(in[3])<<1)
520	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
521		uint64(in[1])*(uint64(in[5])<<2) +
522		uint64(in[2])*(uint64(in[4])<<1) +
523		uint64(in[3])*(uint64(in[3])<<1)
524	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
525		uint64(in[1])*(uint64(in[6])<<1) +
526		uint64(in[2])*(uint64(in[5])<<1) +
527		uint64(in[3])*(uint64(in[4])<<1)
528	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
529	// which is < 2**64 as required.
530	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
531		uint64(in[1])*(uint64(in[7])<<2) +
532		uint64(in[2])*(uint64(in[6])<<1) +
533		uint64(in[3])*(uint64(in[5])<<2) +
534		uint64(in[4])*uint64(in[4])
535	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
536		uint64(in[2])*(uint64(in[7])<<1) +
537		uint64(in[3])*(uint64(in[6])<<1) +
538		uint64(in[4])*(uint64(in[5])<<1)
539	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
540		uint64(in[3])*(uint64(in[7])<<2) +
541		uint64(in[4])*(uint64(in[6])<<1) +
542		uint64(in[5])*(uint64(in[5])<<1)
543	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
544		uint64(in[4])*(uint64(in[7])<<1) +
545		uint64(in[5])*(uint64(in[6])<<1)
546	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
547		uint64(in[5])*(uint64(in[7])<<2) +
548		uint64(in[6])*uint64(in[6])
549	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
550		uint64(in[6])*(uint64(in[7])<<1)
551	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
552		uint64(in[7])*(uint64(in[7])<<1)
553	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
554	tmp[16] = uint64(in[8]) * uint64(in[8])
555
556	p256ReduceDegree(out, tmp)
557}
558
559// p256Mul sets out=in*in2.
560//
561// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
562//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
563// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
564func p256Mul(out, in, in2 *[p256Limbs]uint32) {
565	var tmp [17]uint64
566
567	tmp[0] = uint64(in[0]) * uint64(in2[0])
568	tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
569		uint64(in[1])*(uint64(in2[0])<<0)
570	tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
571		uint64(in[1])*(uint64(in2[1])<<1) +
572		uint64(in[2])*(uint64(in2[0])<<0)
573	tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
574		uint64(in[1])*(uint64(in2[2])<<0) +
575		uint64(in[2])*(uint64(in2[1])<<0) +
576		uint64(in[3])*(uint64(in2[0])<<0)
577	tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
578		uint64(in[1])*(uint64(in2[3])<<1) +
579		uint64(in[2])*(uint64(in2[2])<<0) +
580		uint64(in[3])*(uint64(in2[1])<<1) +
581		uint64(in[4])*(uint64(in2[0])<<0)
582	tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
583		uint64(in[1])*(uint64(in2[4])<<0) +
584		uint64(in[2])*(uint64(in2[3])<<0) +
585		uint64(in[3])*(uint64(in2[2])<<0) +
586		uint64(in[4])*(uint64(in2[1])<<0) +
587		uint64(in[5])*(uint64(in2[0])<<0)
588	tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
589		uint64(in[1])*(uint64(in2[5])<<1) +
590		uint64(in[2])*(uint64(in2[4])<<0) +
591		uint64(in[3])*(uint64(in2[3])<<1) +
592		uint64(in[4])*(uint64(in2[2])<<0) +
593		uint64(in[5])*(uint64(in2[1])<<1) +
594		uint64(in[6])*(uint64(in2[0])<<0)
595	tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
596		uint64(in[1])*(uint64(in2[6])<<0) +
597		uint64(in[2])*(uint64(in2[5])<<0) +
598		uint64(in[3])*(uint64(in2[4])<<0) +
599		uint64(in[4])*(uint64(in2[3])<<0) +
600		uint64(in[5])*(uint64(in2[2])<<0) +
601		uint64(in[6])*(uint64(in2[1])<<0) +
602		uint64(in[7])*(uint64(in2[0])<<0)
603	// tmp[8] has the greatest value but doesn't overflow. See logic in
604	// p256Square.
605	tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
606		uint64(in[1])*(uint64(in2[7])<<1) +
607		uint64(in[2])*(uint64(in2[6])<<0) +
608		uint64(in[3])*(uint64(in2[5])<<1) +
609		uint64(in[4])*(uint64(in2[4])<<0) +
610		uint64(in[5])*(uint64(in2[3])<<1) +
611		uint64(in[6])*(uint64(in2[2])<<0) +
612		uint64(in[7])*(uint64(in2[1])<<1) +
613		uint64(in[8])*(uint64(in2[0])<<0)
614	tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
615		uint64(in[2])*(uint64(in2[7])<<0) +
616		uint64(in[3])*(uint64(in2[6])<<0) +
617		uint64(in[4])*(uint64(in2[5])<<0) +
618		uint64(in[5])*(uint64(in2[4])<<0) +
619		uint64(in[6])*(uint64(in2[3])<<0) +
620		uint64(in[7])*(uint64(in2[2])<<0) +
621		uint64(in[8])*(uint64(in2[1])<<0)
622	tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
623		uint64(in[3])*(uint64(in2[7])<<1) +
624		uint64(in[4])*(uint64(in2[6])<<0) +
625		uint64(in[5])*(uint64(in2[5])<<1) +
626		uint64(in[6])*(uint64(in2[4])<<0) +
627		uint64(in[7])*(uint64(in2[3])<<1) +
628		uint64(in[8])*(uint64(in2[2])<<0)
629	tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
630		uint64(in[4])*(uint64(in2[7])<<0) +
631		uint64(in[5])*(uint64(in2[6])<<0) +
632		uint64(in[6])*(uint64(in2[5])<<0) +
633		uint64(in[7])*(uint64(in2[4])<<0) +
634		uint64(in[8])*(uint64(in2[3])<<0)
635	tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
636		uint64(in[5])*(uint64(in2[7])<<1) +
637		uint64(in[6])*(uint64(in2[6])<<0) +
638		uint64(in[7])*(uint64(in2[5])<<1) +
639		uint64(in[8])*(uint64(in2[4])<<0)
640	tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
641		uint64(in[6])*(uint64(in2[7])<<0) +
642		uint64(in[7])*(uint64(in2[6])<<0) +
643		uint64(in[8])*(uint64(in2[5])<<0)
644	tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
645		uint64(in[7])*(uint64(in2[7])<<1) +
646		uint64(in[8])*(uint64(in2[6])<<0)
647	tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
648		uint64(in[8])*(uint64(in2[7])<<0)
649	tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
650
651	p256ReduceDegree(out, tmp)
652}
653
654func p256Assign(out, in *[p256Limbs]uint32) {
655	*out = *in
656}
657
658// p256Invert calculates |out| = |in|^{-1}
659//
660// Based on Fermat's Little Theorem:
661//   a^p = a (mod p)
662//   a^{p-1} = 1 (mod p)
663//   a^{p-2} = a^{-1} (mod p)
664func p256Invert(out, in *[p256Limbs]uint32) {
665	var ftmp, ftmp2 [p256Limbs]uint32
666
667	// each e_I will hold |in|^{2^I - 1}
668	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
669
670	p256Square(&ftmp, in)     // 2^1
671	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
672	p256Assign(&e2, &ftmp)
673	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
674	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
675	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
676	p256Assign(&e4, &ftmp)
677	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
678	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
679	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
680	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
681	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
682	p256Assign(&e8, &ftmp)
683	for i := 0; i < 8; i++ {
684		p256Square(&ftmp, &ftmp)
685	} // 2^16 - 2^8
686	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
687	p256Assign(&e16, &ftmp)
688	for i := 0; i < 16; i++ {
689		p256Square(&ftmp, &ftmp)
690	} // 2^32 - 2^16
691	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
692	p256Assign(&e32, &ftmp)
693	for i := 0; i < 32; i++ {
694		p256Square(&ftmp, &ftmp)
695	} // 2^64 - 2^32
696	p256Assign(&e64, &ftmp)
697	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
698	for i := 0; i < 192; i++ {
699		p256Square(&ftmp, &ftmp)
700	} // 2^256 - 2^224 + 2^192
701
702	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
703	for i := 0; i < 16; i++ {
704		p256Square(&ftmp2, &ftmp2)
705	} // 2^80 - 2^16
706	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
707	for i := 0; i < 8; i++ {
708		p256Square(&ftmp2, &ftmp2)
709	} // 2^88 - 2^8
710	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
711	for i := 0; i < 4; i++ {
712		p256Square(&ftmp2, &ftmp2)
713	} // 2^92 - 2^4
714	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
715	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
716	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
717	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
718	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
719	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
720	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
721
722	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
723}
724
725// p256Scalar3 sets out=3*out.
726//
727// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
728// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
729func p256Scalar3(out *[p256Limbs]uint32) {
730	var carry uint32
731
732	for i := 0; ; i++ {
733		out[i] *= 3
734		out[i] += carry
735		carry = out[i] >> 29
736		out[i] &= bottom29Bits
737
738		i++
739		if i == p256Limbs {
740			break
741		}
742
743		out[i] *= 3
744		out[i] += carry
745		carry = out[i] >> 28
746		out[i] &= bottom28Bits
747	}
748
749	p256ReduceCarry(out, carry)
750}
751
752// p256Scalar4 sets out=4*out.
753//
754// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
755// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
756func p256Scalar4(out *[p256Limbs]uint32) {
757	var carry, nextCarry uint32
758
759	for i := 0; ; i++ {
760		nextCarry = out[i] >> 27
761		out[i] <<= 2
762		out[i] &= bottom29Bits
763		out[i] += carry
764		carry = nextCarry + (out[i] >> 29)
765		out[i] &= bottom29Bits
766
767		i++
768		if i == p256Limbs {
769			break
770		}
771		nextCarry = out[i] >> 26
772		out[i] <<= 2
773		out[i] &= bottom28Bits
774		out[i] += carry
775		carry = nextCarry + (out[i] >> 28)
776		out[i] &= bottom28Bits
777	}
778
779	p256ReduceCarry(out, carry)
780}
781
782// p256Scalar8 sets out=8*out.
783//
784// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
785// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
786func p256Scalar8(out *[p256Limbs]uint32) {
787	var carry, nextCarry uint32
788
789	for i := 0; ; i++ {
790		nextCarry = out[i] >> 26
791		out[i] <<= 3
792		out[i] &= bottom29Bits
793		out[i] += carry
794		carry = nextCarry + (out[i] >> 29)
795		out[i] &= bottom29Bits
796
797		i++
798		if i == p256Limbs {
799			break
800		}
801		nextCarry = out[i] >> 25
802		out[i] <<= 3
803		out[i] &= bottom28Bits
804		out[i] += carry
805		carry = nextCarry + (out[i] >> 28)
806		out[i] &= bottom28Bits
807	}
808
809	p256ReduceCarry(out, carry)
810}
811
812// Group operations:
813//
814// Elements of the elliptic curve group are represented in Jacobian
815// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
816// Jacobian form.
817
818// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
819//
820// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
821func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
822	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
823
824	p256Square(&delta, z)
825	p256Square(&gamma, y)
826	p256Mul(&beta, x, &gamma)
827
828	p256Sum(&tmp, x, &delta)
829	p256Diff(&tmp2, x, &delta)
830	p256Mul(&alpha, &tmp, &tmp2)
831	p256Scalar3(&alpha)
832
833	p256Sum(&tmp, y, z)
834	p256Square(&tmp, &tmp)
835	p256Diff(&tmp, &tmp, &gamma)
836	p256Diff(zOut, &tmp, &delta)
837
838	p256Scalar4(&beta)
839	p256Square(xOut, &alpha)
840	p256Diff(xOut, xOut, &beta)
841	p256Diff(xOut, xOut, &beta)
842
843	p256Diff(&tmp, &beta, xOut)
844	p256Mul(&tmp, &alpha, &tmp)
845	p256Square(&tmp2, &gamma)
846	p256Scalar8(&tmp2)
847	p256Diff(yOut, &tmp, &tmp2)
848}
849
850// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
851// (i.e. the second point is affine.)
852//
853// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
854//
855// Note that this function does not handle P+P, infinity+P nor P+infinity
856// correctly.
857func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
858	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
859
860	p256Square(&z1z1, z1)
861	p256Sum(&tmp, z1, z1)
862
863	p256Mul(&u2, x2, &z1z1)
864	p256Mul(&z1z1z1, z1, &z1z1)
865	p256Mul(&s2, y2, &z1z1z1)
866	p256Diff(&h, &u2, x1)
867	p256Sum(&i, &h, &h)
868	p256Square(&i, &i)
869	p256Mul(&j, &h, &i)
870	p256Diff(&r, &s2, y1)
871	p256Sum(&r, &r, &r)
872	p256Mul(&v, x1, &i)
873
874	p256Mul(zOut, &tmp, &h)
875	p256Square(&rr, &r)
876	p256Diff(xOut, &rr, &j)
877	p256Diff(xOut, xOut, &v)
878	p256Diff(xOut, xOut, &v)
879
880	p256Diff(&tmp, &v, xOut)
881	p256Mul(yOut, &tmp, &r)
882	p256Mul(&tmp, y1, &j)
883	p256Diff(yOut, yOut, &tmp)
884	p256Diff(yOut, yOut, &tmp)
885}
886
887// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
888//
889// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
890//
891// Note that this function does not handle P+P, infinity+P nor P+infinity
892// correctly.
893func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
894	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
895
896	p256Square(&z1z1, z1)
897	p256Square(&z2z2, z2)
898	p256Mul(&u1, x1, &z2z2)
899
900	p256Sum(&tmp, z1, z2)
901	p256Square(&tmp, &tmp)
902	p256Diff(&tmp, &tmp, &z1z1)
903	p256Diff(&tmp, &tmp, &z2z2)
904
905	p256Mul(&z2z2z2, z2, &z2z2)
906	p256Mul(&s1, y1, &z2z2z2)
907
908	p256Mul(&u2, x2, &z1z1)
909	p256Mul(&z1z1z1, z1, &z1z1)
910	p256Mul(&s2, y2, &z1z1z1)
911	p256Diff(&h, &u2, &u1)
912	p256Sum(&i, &h, &h)
913	p256Square(&i, &i)
914	p256Mul(&j, &h, &i)
915	p256Diff(&r, &s2, &s1)
916	p256Sum(&r, &r, &r)
917	p256Mul(&v, &u1, &i)
918
919	p256Mul(zOut, &tmp, &h)
920	p256Square(&rr, &r)
921	p256Diff(xOut, &rr, &j)
922	p256Diff(xOut, xOut, &v)
923	p256Diff(xOut, xOut, &v)
924
925	p256Diff(&tmp, &v, xOut)
926	p256Mul(yOut, &tmp, &r)
927	p256Mul(&tmp, &s1, &j)
928	p256Diff(yOut, yOut, &tmp)
929	p256Diff(yOut, yOut, &tmp)
930}
931
932// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
933//
934// On entry: mask is either 0 or 0xffffffff.
935func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
936	for i := 0; i < p256Limbs; i++ {
937		tmp := mask & (in[i] ^ out[i])
938		out[i] ^= tmp
939	}
940}
941
942// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
943// On entry: index < 16, table[0] must be zero.
944func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
945	for i := range xOut {
946		xOut[i] = 0
947	}
948	for i := range yOut {
949		yOut[i] = 0
950	}
951
952	for i := uint32(1); i < 16; i++ {
953		mask := i ^ index
954		mask |= mask >> 2
955		mask |= mask >> 1
956		mask &= 1
957		mask--
958		for j := range xOut {
959			xOut[j] |= table[0] & mask
960			table = table[1:]
961		}
962		for j := range yOut {
963			yOut[j] |= table[0] & mask
964			table = table[1:]
965		}
966	}
967}
968
969// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
970// table.
971// On entry: index < 16, table[0] must be zero.
972func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
973	for i := range xOut {
974		xOut[i] = 0
975	}
976	for i := range yOut {
977		yOut[i] = 0
978	}
979	for i := range zOut {
980		zOut[i] = 0
981	}
982
983	// The implicit value at index 0 is all zero. We don't need to perform that
984	// iteration of the loop because we already set out_* to zero.
985	for i := uint32(1); i < 16; i++ {
986		mask := i ^ index
987		mask |= mask >> 2
988		mask |= mask >> 1
989		mask &= 1
990		mask--
991		for j := range xOut {
992			xOut[j] |= table[i][0][j] & mask
993		}
994		for j := range yOut {
995			yOut[j] |= table[i][1][j] & mask
996		}
997		for j := range zOut {
998			zOut[j] |= table[i][2][j] & mask
999		}
1000	}
1001}
1002
1003// p256GetBit returns the bit'th bit of scalar.
1004func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
1005	return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
1006}
1007
1008// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
1009// little-endian number. Note that the value of scalar must be less than the
1010// order of the group.
1011func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
1012	nIsInfinityMask := ^uint32(0)
1013	var pIsNoninfiniteMask, mask, tableOffset uint32
1014	var px, py, tx, ty, tz [p256Limbs]uint32
1015
1016	for i := range xOut {
1017		xOut[i] = 0
1018	}
1019	for i := range yOut {
1020		yOut[i] = 0
1021	}
1022	for i := range zOut {
1023		zOut[i] = 0
1024	}
1025
1026	// The loop adds bits at positions 0, 64, 128 and 192, followed by
1027	// positions 32,96,160 and 224 and does this 32 times.
1028	for i := uint(0); i < 32; i++ {
1029		if i != 0 {
1030			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1031		}
1032		tableOffset = 0
1033		for j := uint(0); j <= 32; j += 32 {
1034			bit0 := p256GetBit(scalar, 31-i+j)
1035			bit1 := p256GetBit(scalar, 95-i+j)
1036			bit2 := p256GetBit(scalar, 159-i+j)
1037			bit3 := p256GetBit(scalar, 223-i+j)
1038			index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
1039
1040			p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
1041			tableOffset += 30 * p256Limbs
1042
1043			// Since scalar is less than the order of the group, we know that
1044			// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
1045			// below.
1046			p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
1047			// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
1048			// (a.k.a.  the point at infinity). We handle that situation by
1049			// copying the point from the table.
1050			p256CopyConditional(xOut, &px, nIsInfinityMask)
1051			p256CopyConditional(yOut, &py, nIsInfinityMask)
1052			p256CopyConditional(zOut, &p256One, nIsInfinityMask)
1053
1054			// Equally, the result is also wrong if the point from the table is
1055			// zero, which happens when the index is zero. We handle that by
1056			// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
1057			pIsNoninfiniteMask = nonZeroToAllOnes(index)
1058			mask = pIsNoninfiniteMask & ^nIsInfinityMask
1059			p256CopyConditional(xOut, &tx, mask)
1060			p256CopyConditional(yOut, &ty, mask)
1061			p256CopyConditional(zOut, &tz, mask)
1062			// If p was not zero, then n is now non-zero.
1063			nIsInfinityMask &^= pIsNoninfiniteMask
1064		}
1065	}
1066}
1067
1068// p256PointToAffine converts a Jacobian point to an affine point. If the input
1069// is the point at infinity then it returns (0, 0) in constant time.
1070func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
1071	var zInv, zInvSq [p256Limbs]uint32
1072
1073	p256Invert(&zInv, z)
1074	p256Square(&zInvSq, &zInv)
1075	p256Mul(xOut, x, &zInvSq)
1076	p256Mul(&zInv, &zInv, &zInvSq)
1077	p256Mul(yOut, y, &zInv)
1078}
1079
1080// p256ToAffine returns a pair of *big.Int containing the affine representation
1081// of {x,y,z}.
1082func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
1083	var xx, yy [p256Limbs]uint32
1084	p256PointToAffine(&xx, &yy, x, y, z)
1085	return p256ToBig(&xx), p256ToBig(&yy)
1086}
1087
1088// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
1089func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
1090	var px, py, pz, tx, ty, tz [p256Limbs]uint32
1091	var precomp [16][3][p256Limbs]uint32
1092	var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
1093
1094	// We precompute 0,1,2,... times {x,y}.
1095	precomp[1][0] = *x
1096	precomp[1][1] = *y
1097	precomp[1][2] = p256One
1098
1099	for i := 2; i < 16; i += 2 {
1100		p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
1101		p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
1102	}
1103
1104	for i := range xOut {
1105		xOut[i] = 0
1106	}
1107	for i := range yOut {
1108		yOut[i] = 0
1109	}
1110	for i := range zOut {
1111		zOut[i] = 0
1112	}
1113	nIsInfinityMask = ^uint32(0)
1114
1115	// We add in a window of four bits each iteration and do this 64 times.
1116	for i := 0; i < 64; i++ {
1117		if i != 0 {
1118			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1119			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1120			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1121			p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1122		}
1123
1124		index = uint32(scalar[31-i/2])
1125		if (i & 1) == 1 {
1126			index &= 15
1127		} else {
1128			index >>= 4
1129		}
1130
1131		// See the comments in scalarBaseMult about handling infinities.
1132		p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
1133		p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
1134		p256CopyConditional(xOut, &px, nIsInfinityMask)
1135		p256CopyConditional(yOut, &py, nIsInfinityMask)
1136		p256CopyConditional(zOut, &pz, nIsInfinityMask)
1137
1138		pIsNoninfiniteMask = nonZeroToAllOnes(index)
1139		mask = pIsNoninfiniteMask & ^nIsInfinityMask
1140		p256CopyConditional(xOut, &tx, mask)
1141		p256CopyConditional(yOut, &ty, mask)
1142		p256CopyConditional(zOut, &tz, mask)
1143		nIsInfinityMask &^= pIsNoninfiniteMask
1144	}
1145}
1146
1147// p256FromBig sets out = R*in.
1148func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
1149	tmp := new(big.Int).Lsh(in, 257)
1150	tmp.Mod(tmp, p256Params.P)
1151
1152	for i := 0; i < p256Limbs; i++ {
1153		if bits := tmp.Bits(); len(bits) > 0 {
1154			out[i] = uint32(bits[0]) & bottom29Bits
1155		} else {
1156			out[i] = 0
1157		}
1158		tmp.Rsh(tmp, 29)
1159
1160		i++
1161		if i == p256Limbs {
1162			break
1163		}
1164
1165		if bits := tmp.Bits(); len(bits) > 0 {
1166			out[i] = uint32(bits[0]) & bottom28Bits
1167		} else {
1168			out[i] = 0
1169		}
1170		tmp.Rsh(tmp, 28)
1171	}
1172}
1173
1174// p256ToBig returns a *big.Int containing the value of in.
1175func p256ToBig(in *[p256Limbs]uint32) *big.Int {
1176	result, tmp := new(big.Int), new(big.Int)
1177
1178	result.SetInt64(int64(in[p256Limbs-1]))
1179	for i := p256Limbs - 2; i >= 0; i-- {
1180		if (i & 1) == 0 {
1181			result.Lsh(result, 29)
1182		} else {
1183			result.Lsh(result, 28)
1184		}
1185		tmp.SetInt64(int64(in[i]))
1186		result.Add(result, tmp)
1187	}
1188
1189	result.Mul(result, p256RInverse)
1190	result.Mod(result, p256Params.P)
1191	return result
1192}
1193