1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// -build !amd64 6 7package elliptic 8 9// This file contains a constant-time, 32-bit implementation of P256. 10 11import ( 12 "math/big" 13) 14 15type p256Curve struct { 16 *CurveParams 17} 18 19var ( 20 p256Params *CurveParams 21 22 // RInverse contains 1/R mod p - the inverse of the Montgomery constant 23 // (2**257). 24 p256RInverse *big.Int 25) 26 27func initP256() { 28 // See FIPS 186-3, section D.2.3 29 p256Params = &CurveParams{Name: "P-256"} 30 p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10) 31 p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10) 32 p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16) 33 p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16) 34 p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16) 35 p256Params.BitSize = 256 36 37 p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16) 38 39 // Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation 40 initP256Arch() 41} 42 43func (curve p256Curve) Params() *CurveParams { 44 return curve.CurveParams 45} 46 47// p256GetScalar endian-swaps the big-endian scalar value from in and writes it 48// to out. If the scalar is equal or greater than the order of the group, it's 49// reduced modulo that order. 50func p256GetScalar(out *[32]byte, in []byte) { 51 n := new(big.Int).SetBytes(in) 52 var scalarBytes []byte 53 54 if n.Cmp(p256Params.N) >= 0 { 55 n.Mod(n, p256Params.N) 56 scalarBytes = n.Bytes() 57 } else { 58 scalarBytes = in 59 } 60 61 for i, v := range scalarBytes { 62 out[len(scalarBytes)-(1+i)] = v 63 } 64} 65 66func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) { 67 var scalarReversed [32]byte 68 p256GetScalar(&scalarReversed, scalar) 69 70 var x1, y1, z1 [p256Limbs]uint32 71 p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed) 72 return p256ToAffine(&x1, &y1, &z1) 73} 74 75func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { 76 var scalarReversed [32]byte 77 p256GetScalar(&scalarReversed, scalar) 78 79 var px, py, x1, y1, z1 [p256Limbs]uint32 80 p256FromBig(&px, bigX) 81 p256FromBig(&py, bigY) 82 p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed) 83 return p256ToAffine(&x1, &y1, &z1) 84} 85 86// Field elements are represented as nine, unsigned 32-bit words. 87// 88// The value of an field element is: 89// x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228) 90// 91// That is, each limb is alternately 29 or 28-bits wide in little-endian 92// order. 93// 94// This means that a field element hits 2**257, rather than 2**256 as we would 95// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes 96// problems when multiplying as terms end up one bit short of a limb which 97// would require much bit-shifting to correct. 98// 99// Finally, the values stored in a field element are in Montgomery form. So the 100// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 101// 2**257. 102 103const ( 104 p256Limbs = 9 105 bottom29Bits = 0x1fffffff 106) 107 108var ( 109 // p256One is the number 1 as a field element. 110 p256One = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0} 111 p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0} 112 // p256P is the prime modulus as a field element. 113 p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff} 114 // p2562P is the twice prime modulus as a field element. 115 p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff} 116) 117 118// p256Precomputed contains precomputed values to aid the calculation of scalar 119// multiples of the base point, G. It's actually two, equal length, tables 120// concatenated. 121// 122// The first table contains (x,y) field element pairs for 16 multiples of the 123// base point, G. 124// 125// Index | Index (binary) | Value 126// 0 | 0000 | 0G (all zeros, omitted) 127// 1 | 0001 | G 128// 2 | 0010 | 2**64G 129// 3 | 0011 | 2**64G + G 130// 4 | 0100 | 2**128G 131// 5 | 0101 | 2**128G + G 132// 6 | 0110 | 2**128G + 2**64G 133// 7 | 0111 | 2**128G + 2**64G + G 134// 8 | 1000 | 2**192G 135// 9 | 1001 | 2**192G + G 136// 10 | 1010 | 2**192G + 2**64G 137// 11 | 1011 | 2**192G + 2**64G + G 138// 12 | 1100 | 2**192G + 2**128G 139// 13 | 1101 | 2**192G + 2**128G + G 140// 14 | 1110 | 2**192G + 2**128G + 2**64G 141// 15 | 1111 | 2**192G + 2**128G + 2**64G + G 142// 143// The second table follows the same style, but the terms are 2**32G, 144// 2**96G, 2**160G, 2**224G. 145// 146// This is ~2KB of data. 147var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{ 148 0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee, 149 0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3, 150 0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c, 151 0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22, 152 0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050, 153 0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b, 154 0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa, 155 0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2, 156 0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609, 157 0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581, 158 0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca, 159 0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33, 160 0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6, 161 0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd, 162 0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0, 163 0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881, 164 0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a, 165 0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26, 166 0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b, 167 0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023, 168 0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133, 169 0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa, 170 0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29, 171 0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc, 172 0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8, 173 0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59, 174 0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39, 175 0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689, 176 0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa, 177 0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3, 178 0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1, 179 0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f, 180 0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72, 181 0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d, 182 0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b, 183 0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a, 184 0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a, 185 0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f, 186 0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb, 187 0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc, 188 0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9, 189 0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce, 190 0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2, 191 0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca, 192 0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229, 193 0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57, 194 0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c, 195 0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa, 196 0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651, 197 0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec, 198 0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7, 199 0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c, 200 0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927, 201 0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298, 202 0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8, 203 0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2, 204 0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d, 205 0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4, 206 0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8, 207 0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78, 208} 209 210// Field element operations: 211 212// nonZeroToAllOnes returns: 213// 0xffffffff for 0 < x <= 2**31 214// 0 for x == 0 or x > 2**31. 215func nonZeroToAllOnes(x uint32) uint32 { 216 return ((x - 1) >> 31) - 1 217} 218 219// p256ReduceCarry adds a multiple of p in order to cancel |carry|, 220// which is a term at 2**257. 221// 222// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28. 223// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. 224func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) { 225 carry_mask := nonZeroToAllOnes(carry) 226 227 inout[0] += carry << 1 228 inout[3] += 0x10000000 & carry_mask 229 // carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the 230 // previous line therefore this doesn't underflow. 231 inout[3] -= carry << 11 232 inout[4] += (0x20000000 - 1) & carry_mask 233 inout[5] += (0x10000000 - 1) & carry_mask 234 inout[6] += (0x20000000 - 1) & carry_mask 235 inout[6] -= carry << 22 236 // This may underflow if carry is non-zero but, if so, we'll fix it in the 237 // next line. 238 inout[7] -= 1 & carry_mask 239 inout[7] += carry << 25 240} 241 242// p256Sum sets out = in+in2. 243// 244// On entry, in[i]+in2[i] must not overflow a 32-bit word. 245// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 246func p256Sum(out, in, in2 *[p256Limbs]uint32) { 247 carry := uint32(0) 248 for i := 0; ; i++ { 249 out[i] = in[i] + in2[i] 250 out[i] += carry 251 carry = out[i] >> 29 252 out[i] &= bottom29Bits 253 254 i++ 255 if i == p256Limbs { 256 break 257 } 258 259 out[i] = in[i] + in2[i] 260 out[i] += carry 261 carry = out[i] >> 28 262 out[i] &= bottom28Bits 263 } 264 265 p256ReduceCarry(out, carry) 266} 267 268const ( 269 two30m2 = 1<<30 - 1<<2 270 two30p13m2 = 1<<30 + 1<<13 - 1<<2 271 two31m2 = 1<<31 - 1<<2 272 two31p24m2 = 1<<31 + 1<<24 - 1<<2 273 two30m27m2 = 1<<30 - 1<<27 - 1<<2 274) 275 276// p256Zero31 is 0 mod p. 277var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2} 278 279// p256Diff sets out = in-in2. 280// 281// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and 282// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. 283// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 284func p256Diff(out, in, in2 *[p256Limbs]uint32) { 285 var carry uint32 286 287 for i := 0; ; i++ { 288 out[i] = in[i] - in2[i] 289 out[i] += p256Zero31[i] 290 out[i] += carry 291 carry = out[i] >> 29 292 out[i] &= bottom29Bits 293 294 i++ 295 if i == p256Limbs { 296 break 297 } 298 299 out[i] = in[i] - in2[i] 300 out[i] += p256Zero31[i] 301 out[i] += carry 302 carry = out[i] >> 28 303 out[i] &= bottom28Bits 304 } 305 306 p256ReduceCarry(out, carry) 307} 308 309// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with 310// the same 29,28,... bit positions as an field element. 311// 312// The values in field elements are in Montgomery form: x*R mod p where R = 313// 2**257. Since we just multiplied two Montgomery values together, the result 314// is x*y*R*R mod p. We wish to divide by R in order for the result also to be 315// in Montgomery form. 316// 317// On entry: tmp[i] < 2**64 318// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 319func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) { 320 // The following table may be helpful when reading this code: 321 // 322 // Limb number: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10... 323 // Width (bits): 29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29 324 // Start bit: 0 | 29| 57| 86|114|143|171|200|228|257|285 325 // (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 326 var tmp2 [18]uint32 327 var carry, x, xMask uint32 328 329 // tmp contains 64-bit words with the same 29,28,29-bit positions as an 330 // field element. So the top of an element of tmp might overlap with 331 // another element two positions down. The following loop eliminates 332 // this overlap. 333 tmp2[0] = uint32(tmp[0]) & bottom29Bits 334 335 tmp2[1] = uint32(tmp[0]) >> 29 336 tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits 337 tmp2[1] += uint32(tmp[1]) & bottom28Bits 338 carry = tmp2[1] >> 28 339 tmp2[1] &= bottom28Bits 340 341 for i := 2; i < 17; i++ { 342 tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25 343 tmp2[i] += (uint32(tmp[i-1])) >> 28 344 tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits 345 tmp2[i] += uint32(tmp[i]) & bottom29Bits 346 tmp2[i] += carry 347 carry = tmp2[i] >> 29 348 tmp2[i] &= bottom29Bits 349 350 i++ 351 if i == 17 { 352 break 353 } 354 tmp2[i] = uint32(tmp[i-2]>>32) >> 25 355 tmp2[i] += uint32(tmp[i-1]) >> 29 356 tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits 357 tmp2[i] += uint32(tmp[i]) & bottom28Bits 358 tmp2[i] += carry 359 carry = tmp2[i] >> 28 360 tmp2[i] &= bottom28Bits 361 } 362 363 tmp2[17] = uint32(tmp[15]>>32) >> 25 364 tmp2[17] += uint32(tmp[16]) >> 29 365 tmp2[17] += uint32(tmp[16]>>32) << 3 366 tmp2[17] += carry 367 368 // Montgomery elimination of terms: 369 // 370 // Since R is 2**257, we can divide by R with a bitwise shift if we can 371 // ensure that the right-most 257 bits are all zero. We can make that true 372 // by adding multiplies of p without affecting the value. 373 // 374 // So we eliminate limbs from right to left. Since the bottom 29 bits of p 375 // are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0. 376 // We can do that for 8 further limbs and then right shift to eliminate the 377 // extra factor of R. 378 for i := 0; ; i += 2 { 379 tmp2[i+1] += tmp2[i] >> 29 380 x = tmp2[i] & bottom29Bits 381 xMask = nonZeroToAllOnes(x) 382 tmp2[i] = 0 383 384 // The bounds calculations for this loop are tricky. Each iteration of 385 // the loop eliminates two words by adding values to words to their 386 // right. 387 // 388 // The following table contains the amounts added to each word (as an 389 // offset from the value of i at the top of the loop). The amounts are 390 // accounted for from the first and second half of the loop separately 391 // and are written as, for example, 28 to mean a value <2**28. 392 // 393 // Word: 3 4 5 6 7 8 9 10 394 // Added in top half: 28 11 29 21 29 28 395 // 28 29 396 // 29 397 // Added in bottom half: 29 10 28 21 28 28 398 // 29 399 // 400 // The value that is currently offset 7 will be offset 5 for the next 401 // iteration and then offset 3 for the iteration after that. Therefore 402 // the total value added will be the values added at 7, 5 and 3. 403 // 404 // The following table accumulates these values. The sums at the bottom 405 // are written as, for example, 29+28, to mean a value < 2**29+2**28. 406 // 407 // Word: 3 4 5 6 7 8 9 10 11 12 13 408 // 28 11 10 29 21 29 28 28 28 28 28 409 // 29 28 11 28 29 28 29 28 29 28 410 // 29 28 21 21 29 21 29 21 411 // 10 29 28 21 28 21 28 412 // 28 29 28 29 28 29 28 413 // 11 10 29 10 29 10 414 // 29 28 11 28 11 415 // 29 29 416 // -------------------------------------------- 417 // 30+ 31+ 30+ 31+ 30+ 418 // 28+ 29+ 28+ 29+ 21+ 419 // 21+ 28+ 21+ 28+ 10 420 // 10 21+ 10 21+ 421 // 11 11 422 // 423 // So the greatest amount is added to tmp2[10] and tmp2[12]. If 424 // tmp2[10/12] has an initial value of <2**29, then the maximum value 425 // will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32, 426 // as required. 427 tmp2[i+3] += (x << 10) & bottom28Bits 428 tmp2[i+4] += (x >> 18) 429 430 tmp2[i+6] += (x << 21) & bottom29Bits 431 tmp2[i+7] += x >> 8 432 433 // At position 200, which is the starting bit position for word 7, we 434 // have a factor of 0xf000000 = 2**28 - 2**24. 435 tmp2[i+7] += 0x10000000 & xMask 436 tmp2[i+8] += (x - 1) & xMask 437 tmp2[i+7] -= (x << 24) & bottom28Bits 438 tmp2[i+8] -= x >> 4 439 440 tmp2[i+8] += 0x20000000 & xMask 441 tmp2[i+8] -= x 442 tmp2[i+8] += (x << 28) & bottom29Bits 443 tmp2[i+9] += ((x >> 1) - 1) & xMask 444 445 if i+1 == p256Limbs { 446 break 447 } 448 tmp2[i+2] += tmp2[i+1] >> 28 449 x = tmp2[i+1] & bottom28Bits 450 xMask = nonZeroToAllOnes(x) 451 tmp2[i+1] = 0 452 453 tmp2[i+4] += (x << 11) & bottom29Bits 454 tmp2[i+5] += (x >> 18) 455 456 tmp2[i+7] += (x << 21) & bottom28Bits 457 tmp2[i+8] += x >> 7 458 459 // At position 199, which is the starting bit of the 8th word when 460 // dealing with a context starting on an odd word, we have a factor of 461 // 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th 462 // word from i+1 is i+8. 463 tmp2[i+8] += 0x20000000 & xMask 464 tmp2[i+9] += (x - 1) & xMask 465 tmp2[i+8] -= (x << 25) & bottom29Bits 466 tmp2[i+9] -= x >> 4 467 468 tmp2[i+9] += 0x10000000 & xMask 469 tmp2[i+9] -= x 470 tmp2[i+10] += (x - 1) & xMask 471 } 472 473 // We merge the right shift with a carry chain. The words above 2**257 have 474 // widths of 28,29,... which we need to correct when copying them down. 475 carry = 0 476 for i := 0; i < 8; i++ { 477 // The maximum value of tmp2[i + 9] occurs on the first iteration and 478 // is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is 479 // therefore safe. 480 out[i] = tmp2[i+9] 481 out[i] += carry 482 out[i] += (tmp2[i+10] << 28) & bottom29Bits 483 carry = out[i] >> 29 484 out[i] &= bottom29Bits 485 486 i++ 487 out[i] = tmp2[i+9] >> 1 488 out[i] += carry 489 carry = out[i] >> 28 490 out[i] &= bottom28Bits 491 } 492 493 out[8] = tmp2[17] 494 out[8] += carry 495 carry = out[8] >> 29 496 out[8] &= bottom29Bits 497 498 p256ReduceCarry(out, carry) 499} 500 501// p256Square sets out=in*in. 502// 503// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29. 504// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 505func p256Square(out, in *[p256Limbs]uint32) { 506 var tmp [17]uint64 507 508 tmp[0] = uint64(in[0]) * uint64(in[0]) 509 tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1) 510 tmp[2] = uint64(in[0])*(uint64(in[2])<<1) + 511 uint64(in[1])*(uint64(in[1])<<1) 512 tmp[3] = uint64(in[0])*(uint64(in[3])<<1) + 513 uint64(in[1])*(uint64(in[2])<<1) 514 tmp[4] = uint64(in[0])*(uint64(in[4])<<1) + 515 uint64(in[1])*(uint64(in[3])<<2) + 516 uint64(in[2])*uint64(in[2]) 517 tmp[5] = uint64(in[0])*(uint64(in[5])<<1) + 518 uint64(in[1])*(uint64(in[4])<<1) + 519 uint64(in[2])*(uint64(in[3])<<1) 520 tmp[6] = uint64(in[0])*(uint64(in[6])<<1) + 521 uint64(in[1])*(uint64(in[5])<<2) + 522 uint64(in[2])*(uint64(in[4])<<1) + 523 uint64(in[3])*(uint64(in[3])<<1) 524 tmp[7] = uint64(in[0])*(uint64(in[7])<<1) + 525 uint64(in[1])*(uint64(in[6])<<1) + 526 uint64(in[2])*(uint64(in[5])<<1) + 527 uint64(in[3])*(uint64(in[4])<<1) 528 // tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60, 529 // which is < 2**64 as required. 530 tmp[8] = uint64(in[0])*(uint64(in[8])<<1) + 531 uint64(in[1])*(uint64(in[7])<<2) + 532 uint64(in[2])*(uint64(in[6])<<1) + 533 uint64(in[3])*(uint64(in[5])<<2) + 534 uint64(in[4])*uint64(in[4]) 535 tmp[9] = uint64(in[1])*(uint64(in[8])<<1) + 536 uint64(in[2])*(uint64(in[7])<<1) + 537 uint64(in[3])*(uint64(in[6])<<1) + 538 uint64(in[4])*(uint64(in[5])<<1) 539 tmp[10] = uint64(in[2])*(uint64(in[8])<<1) + 540 uint64(in[3])*(uint64(in[7])<<2) + 541 uint64(in[4])*(uint64(in[6])<<1) + 542 uint64(in[5])*(uint64(in[5])<<1) 543 tmp[11] = uint64(in[3])*(uint64(in[8])<<1) + 544 uint64(in[4])*(uint64(in[7])<<1) + 545 uint64(in[5])*(uint64(in[6])<<1) 546 tmp[12] = uint64(in[4])*(uint64(in[8])<<1) + 547 uint64(in[5])*(uint64(in[7])<<2) + 548 uint64(in[6])*uint64(in[6]) 549 tmp[13] = uint64(in[5])*(uint64(in[8])<<1) + 550 uint64(in[6])*(uint64(in[7])<<1) 551 tmp[14] = uint64(in[6])*(uint64(in[8])<<1) + 552 uint64(in[7])*(uint64(in[7])<<1) 553 tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1) 554 tmp[16] = uint64(in[8]) * uint64(in[8]) 555 556 p256ReduceDegree(out, tmp) 557} 558 559// p256Mul sets out=in*in2. 560// 561// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and 562// in2[0,2,...] < 2**30, in2[1,3,...] < 2**29. 563// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 564func p256Mul(out, in, in2 *[p256Limbs]uint32) { 565 var tmp [17]uint64 566 567 tmp[0] = uint64(in[0]) * uint64(in2[0]) 568 tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) + 569 uint64(in[1])*(uint64(in2[0])<<0) 570 tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) + 571 uint64(in[1])*(uint64(in2[1])<<1) + 572 uint64(in[2])*(uint64(in2[0])<<0) 573 tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) + 574 uint64(in[1])*(uint64(in2[2])<<0) + 575 uint64(in[2])*(uint64(in2[1])<<0) + 576 uint64(in[3])*(uint64(in2[0])<<0) 577 tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) + 578 uint64(in[1])*(uint64(in2[3])<<1) + 579 uint64(in[2])*(uint64(in2[2])<<0) + 580 uint64(in[3])*(uint64(in2[1])<<1) + 581 uint64(in[4])*(uint64(in2[0])<<0) 582 tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) + 583 uint64(in[1])*(uint64(in2[4])<<0) + 584 uint64(in[2])*(uint64(in2[3])<<0) + 585 uint64(in[3])*(uint64(in2[2])<<0) + 586 uint64(in[4])*(uint64(in2[1])<<0) + 587 uint64(in[5])*(uint64(in2[0])<<0) 588 tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) + 589 uint64(in[1])*(uint64(in2[5])<<1) + 590 uint64(in[2])*(uint64(in2[4])<<0) + 591 uint64(in[3])*(uint64(in2[3])<<1) + 592 uint64(in[4])*(uint64(in2[2])<<0) + 593 uint64(in[5])*(uint64(in2[1])<<1) + 594 uint64(in[6])*(uint64(in2[0])<<0) 595 tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) + 596 uint64(in[1])*(uint64(in2[6])<<0) + 597 uint64(in[2])*(uint64(in2[5])<<0) + 598 uint64(in[3])*(uint64(in2[4])<<0) + 599 uint64(in[4])*(uint64(in2[3])<<0) + 600 uint64(in[5])*(uint64(in2[2])<<0) + 601 uint64(in[6])*(uint64(in2[1])<<0) + 602 uint64(in[7])*(uint64(in2[0])<<0) 603 // tmp[8] has the greatest value but doesn't overflow. See logic in 604 // p256Square. 605 tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) + 606 uint64(in[1])*(uint64(in2[7])<<1) + 607 uint64(in[2])*(uint64(in2[6])<<0) + 608 uint64(in[3])*(uint64(in2[5])<<1) + 609 uint64(in[4])*(uint64(in2[4])<<0) + 610 uint64(in[5])*(uint64(in2[3])<<1) + 611 uint64(in[6])*(uint64(in2[2])<<0) + 612 uint64(in[7])*(uint64(in2[1])<<1) + 613 uint64(in[8])*(uint64(in2[0])<<0) 614 tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) + 615 uint64(in[2])*(uint64(in2[7])<<0) + 616 uint64(in[3])*(uint64(in2[6])<<0) + 617 uint64(in[4])*(uint64(in2[5])<<0) + 618 uint64(in[5])*(uint64(in2[4])<<0) + 619 uint64(in[6])*(uint64(in2[3])<<0) + 620 uint64(in[7])*(uint64(in2[2])<<0) + 621 uint64(in[8])*(uint64(in2[1])<<0) 622 tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) + 623 uint64(in[3])*(uint64(in2[7])<<1) + 624 uint64(in[4])*(uint64(in2[6])<<0) + 625 uint64(in[5])*(uint64(in2[5])<<1) + 626 uint64(in[6])*(uint64(in2[4])<<0) + 627 uint64(in[7])*(uint64(in2[3])<<1) + 628 uint64(in[8])*(uint64(in2[2])<<0) 629 tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) + 630 uint64(in[4])*(uint64(in2[7])<<0) + 631 uint64(in[5])*(uint64(in2[6])<<0) + 632 uint64(in[6])*(uint64(in2[5])<<0) + 633 uint64(in[7])*(uint64(in2[4])<<0) + 634 uint64(in[8])*(uint64(in2[3])<<0) 635 tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) + 636 uint64(in[5])*(uint64(in2[7])<<1) + 637 uint64(in[6])*(uint64(in2[6])<<0) + 638 uint64(in[7])*(uint64(in2[5])<<1) + 639 uint64(in[8])*(uint64(in2[4])<<0) 640 tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) + 641 uint64(in[6])*(uint64(in2[7])<<0) + 642 uint64(in[7])*(uint64(in2[6])<<0) + 643 uint64(in[8])*(uint64(in2[5])<<0) 644 tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) + 645 uint64(in[7])*(uint64(in2[7])<<1) + 646 uint64(in[8])*(uint64(in2[6])<<0) 647 tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) + 648 uint64(in[8])*(uint64(in2[7])<<0) 649 tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0) 650 651 p256ReduceDegree(out, tmp) 652} 653 654func p256Assign(out, in *[p256Limbs]uint32) { 655 *out = *in 656} 657 658// p256Invert calculates |out| = |in|^{-1} 659// 660// Based on Fermat's Little Theorem: 661// a^p = a (mod p) 662// a^{p-1} = 1 (mod p) 663// a^{p-2} = a^{-1} (mod p) 664func p256Invert(out, in *[p256Limbs]uint32) { 665 var ftmp, ftmp2 [p256Limbs]uint32 666 667 // each e_I will hold |in|^{2^I - 1} 668 var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32 669 670 p256Square(&ftmp, in) // 2^1 671 p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0 672 p256Assign(&e2, &ftmp) 673 p256Square(&ftmp, &ftmp) // 2^3 - 2^1 674 p256Square(&ftmp, &ftmp) // 2^4 - 2^2 675 p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0 676 p256Assign(&e4, &ftmp) 677 p256Square(&ftmp, &ftmp) // 2^5 - 2^1 678 p256Square(&ftmp, &ftmp) // 2^6 - 2^2 679 p256Square(&ftmp, &ftmp) // 2^7 - 2^3 680 p256Square(&ftmp, &ftmp) // 2^8 - 2^4 681 p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0 682 p256Assign(&e8, &ftmp) 683 for i := 0; i < 8; i++ { 684 p256Square(&ftmp, &ftmp) 685 } // 2^16 - 2^8 686 p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0 687 p256Assign(&e16, &ftmp) 688 for i := 0; i < 16; i++ { 689 p256Square(&ftmp, &ftmp) 690 } // 2^32 - 2^16 691 p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0 692 p256Assign(&e32, &ftmp) 693 for i := 0; i < 32; i++ { 694 p256Square(&ftmp, &ftmp) 695 } // 2^64 - 2^32 696 p256Assign(&e64, &ftmp) 697 p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0 698 for i := 0; i < 192; i++ { 699 p256Square(&ftmp, &ftmp) 700 } // 2^256 - 2^224 + 2^192 701 702 p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0 703 for i := 0; i < 16; i++ { 704 p256Square(&ftmp2, &ftmp2) 705 } // 2^80 - 2^16 706 p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0 707 for i := 0; i < 8; i++ { 708 p256Square(&ftmp2, &ftmp2) 709 } // 2^88 - 2^8 710 p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0 711 for i := 0; i < 4; i++ { 712 p256Square(&ftmp2, &ftmp2) 713 } // 2^92 - 2^4 714 p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0 715 p256Square(&ftmp2, &ftmp2) // 2^93 - 2^1 716 p256Square(&ftmp2, &ftmp2) // 2^94 - 2^2 717 p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0 718 p256Square(&ftmp2, &ftmp2) // 2^95 - 2^1 719 p256Square(&ftmp2, &ftmp2) // 2^96 - 2^2 720 p256Mul(&ftmp2, &ftmp2, in) // 2^96 - 3 721 722 p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3 723} 724 725// p256Scalar3 sets out=3*out. 726// 727// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 728// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 729func p256Scalar3(out *[p256Limbs]uint32) { 730 var carry uint32 731 732 for i := 0; ; i++ { 733 out[i] *= 3 734 out[i] += carry 735 carry = out[i] >> 29 736 out[i] &= bottom29Bits 737 738 i++ 739 if i == p256Limbs { 740 break 741 } 742 743 out[i] *= 3 744 out[i] += carry 745 carry = out[i] >> 28 746 out[i] &= bottom28Bits 747 } 748 749 p256ReduceCarry(out, carry) 750} 751 752// p256Scalar4 sets out=4*out. 753// 754// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 755// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 756func p256Scalar4(out *[p256Limbs]uint32) { 757 var carry, nextCarry uint32 758 759 for i := 0; ; i++ { 760 nextCarry = out[i] >> 27 761 out[i] <<= 2 762 out[i] &= bottom29Bits 763 out[i] += carry 764 carry = nextCarry + (out[i] >> 29) 765 out[i] &= bottom29Bits 766 767 i++ 768 if i == p256Limbs { 769 break 770 } 771 nextCarry = out[i] >> 26 772 out[i] <<= 2 773 out[i] &= bottom28Bits 774 out[i] += carry 775 carry = nextCarry + (out[i] >> 28) 776 out[i] &= bottom28Bits 777 } 778 779 p256ReduceCarry(out, carry) 780} 781 782// p256Scalar8 sets out=8*out. 783// 784// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 785// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. 786func p256Scalar8(out *[p256Limbs]uint32) { 787 var carry, nextCarry uint32 788 789 for i := 0; ; i++ { 790 nextCarry = out[i] >> 26 791 out[i] <<= 3 792 out[i] &= bottom29Bits 793 out[i] += carry 794 carry = nextCarry + (out[i] >> 29) 795 out[i] &= bottom29Bits 796 797 i++ 798 if i == p256Limbs { 799 break 800 } 801 nextCarry = out[i] >> 25 802 out[i] <<= 3 803 out[i] &= bottom28Bits 804 out[i] += carry 805 carry = nextCarry + (out[i] >> 28) 806 out[i] &= bottom28Bits 807 } 808 809 p256ReduceCarry(out, carry) 810} 811 812// Group operations: 813// 814// Elements of the elliptic curve group are represented in Jacobian 815// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in 816// Jacobian form. 817 818// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}. 819// 820// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l 821func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) { 822 var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32 823 824 p256Square(&delta, z) 825 p256Square(&gamma, y) 826 p256Mul(&beta, x, &gamma) 827 828 p256Sum(&tmp, x, &delta) 829 p256Diff(&tmp2, x, &delta) 830 p256Mul(&alpha, &tmp, &tmp2) 831 p256Scalar3(&alpha) 832 833 p256Sum(&tmp, y, z) 834 p256Square(&tmp, &tmp) 835 p256Diff(&tmp, &tmp, &gamma) 836 p256Diff(zOut, &tmp, &delta) 837 838 p256Scalar4(&beta) 839 p256Square(xOut, &alpha) 840 p256Diff(xOut, xOut, &beta) 841 p256Diff(xOut, xOut, &beta) 842 843 p256Diff(&tmp, &beta, xOut) 844 p256Mul(&tmp, &alpha, &tmp) 845 p256Square(&tmp2, &gamma) 846 p256Scalar8(&tmp2) 847 p256Diff(yOut, &tmp, &tmp2) 848} 849 850// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}. 851// (i.e. the second point is affine.) 852// 853// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl 854// 855// Note that this function does not handle P+P, infinity+P nor P+infinity 856// correctly. 857func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) { 858 var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32 859 860 p256Square(&z1z1, z1) 861 p256Sum(&tmp, z1, z1) 862 863 p256Mul(&u2, x2, &z1z1) 864 p256Mul(&z1z1z1, z1, &z1z1) 865 p256Mul(&s2, y2, &z1z1z1) 866 p256Diff(&h, &u2, x1) 867 p256Sum(&i, &h, &h) 868 p256Square(&i, &i) 869 p256Mul(&j, &h, &i) 870 p256Diff(&r, &s2, y1) 871 p256Sum(&r, &r, &r) 872 p256Mul(&v, x1, &i) 873 874 p256Mul(zOut, &tmp, &h) 875 p256Square(&rr, &r) 876 p256Diff(xOut, &rr, &j) 877 p256Diff(xOut, xOut, &v) 878 p256Diff(xOut, xOut, &v) 879 880 p256Diff(&tmp, &v, xOut) 881 p256Mul(yOut, &tmp, &r) 882 p256Mul(&tmp, y1, &j) 883 p256Diff(yOut, yOut, &tmp) 884 p256Diff(yOut, yOut, &tmp) 885} 886 887// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}. 888// 889// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl 890// 891// Note that this function does not handle P+P, infinity+P nor P+infinity 892// correctly. 893func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) { 894 var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32 895 896 p256Square(&z1z1, z1) 897 p256Square(&z2z2, z2) 898 p256Mul(&u1, x1, &z2z2) 899 900 p256Sum(&tmp, z1, z2) 901 p256Square(&tmp, &tmp) 902 p256Diff(&tmp, &tmp, &z1z1) 903 p256Diff(&tmp, &tmp, &z2z2) 904 905 p256Mul(&z2z2z2, z2, &z2z2) 906 p256Mul(&s1, y1, &z2z2z2) 907 908 p256Mul(&u2, x2, &z1z1) 909 p256Mul(&z1z1z1, z1, &z1z1) 910 p256Mul(&s2, y2, &z1z1z1) 911 p256Diff(&h, &u2, &u1) 912 p256Sum(&i, &h, &h) 913 p256Square(&i, &i) 914 p256Mul(&j, &h, &i) 915 p256Diff(&r, &s2, &s1) 916 p256Sum(&r, &r, &r) 917 p256Mul(&v, &u1, &i) 918 919 p256Mul(zOut, &tmp, &h) 920 p256Square(&rr, &r) 921 p256Diff(xOut, &rr, &j) 922 p256Diff(xOut, xOut, &v) 923 p256Diff(xOut, xOut, &v) 924 925 p256Diff(&tmp, &v, xOut) 926 p256Mul(yOut, &tmp, &r) 927 p256Mul(&tmp, &s1, &j) 928 p256Diff(yOut, yOut, &tmp) 929 p256Diff(yOut, yOut, &tmp) 930} 931 932// p256CopyConditional sets out=in if mask = 0xffffffff in constant time. 933// 934// On entry: mask is either 0 or 0xffffffff. 935func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) { 936 for i := 0; i < p256Limbs; i++ { 937 tmp := mask & (in[i] ^ out[i]) 938 out[i] ^= tmp 939 } 940} 941 942// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table. 943// On entry: index < 16, table[0] must be zero. 944func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) { 945 for i := range xOut { 946 xOut[i] = 0 947 } 948 for i := range yOut { 949 yOut[i] = 0 950 } 951 952 for i := uint32(1); i < 16; i++ { 953 mask := i ^ index 954 mask |= mask >> 2 955 mask |= mask >> 1 956 mask &= 1 957 mask-- 958 for j := range xOut { 959 xOut[j] |= table[0] & mask 960 table = table[1:] 961 } 962 for j := range yOut { 963 yOut[j] |= table[0] & mask 964 table = table[1:] 965 } 966 } 967} 968 969// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of 970// table. 971// On entry: index < 16, table[0] must be zero. 972func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) { 973 for i := range xOut { 974 xOut[i] = 0 975 } 976 for i := range yOut { 977 yOut[i] = 0 978 } 979 for i := range zOut { 980 zOut[i] = 0 981 } 982 983 // The implicit value at index 0 is all zero. We don't need to perform that 984 // iteration of the loop because we already set out_* to zero. 985 for i := uint32(1); i < 16; i++ { 986 mask := i ^ index 987 mask |= mask >> 2 988 mask |= mask >> 1 989 mask &= 1 990 mask-- 991 for j := range xOut { 992 xOut[j] |= table[i][0][j] & mask 993 } 994 for j := range yOut { 995 yOut[j] |= table[i][1][j] & mask 996 } 997 for j := range zOut { 998 zOut[j] |= table[i][2][j] & mask 999 } 1000 } 1001} 1002 1003// p256GetBit returns the bit'th bit of scalar. 1004func p256GetBit(scalar *[32]uint8, bit uint) uint32 { 1005 return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1) 1006} 1007 1008// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a 1009// little-endian number. Note that the value of scalar must be less than the 1010// order of the group. 1011func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) { 1012 nIsInfinityMask := ^uint32(0) 1013 var pIsNoninfiniteMask, mask, tableOffset uint32 1014 var px, py, tx, ty, tz [p256Limbs]uint32 1015 1016 for i := range xOut { 1017 xOut[i] = 0 1018 } 1019 for i := range yOut { 1020 yOut[i] = 0 1021 } 1022 for i := range zOut { 1023 zOut[i] = 0 1024 } 1025 1026 // The loop adds bits at positions 0, 64, 128 and 192, followed by 1027 // positions 32,96,160 and 224 and does this 32 times. 1028 for i := uint(0); i < 32; i++ { 1029 if i != 0 { 1030 p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut) 1031 } 1032 tableOffset = 0 1033 for j := uint(0); j <= 32; j += 32 { 1034 bit0 := p256GetBit(scalar, 31-i+j) 1035 bit1 := p256GetBit(scalar, 95-i+j) 1036 bit2 := p256GetBit(scalar, 159-i+j) 1037 bit3 := p256GetBit(scalar, 223-i+j) 1038 index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3) 1039 1040 p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index) 1041 tableOffset += 30 * p256Limbs 1042 1043 // Since scalar is less than the order of the group, we know that 1044 // {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle 1045 // below. 1046 p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py) 1047 // The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero 1048 // (a.k.a. the point at infinity). We handle that situation by 1049 // copying the point from the table. 1050 p256CopyConditional(xOut, &px, nIsInfinityMask) 1051 p256CopyConditional(yOut, &py, nIsInfinityMask) 1052 p256CopyConditional(zOut, &p256One, nIsInfinityMask) 1053 1054 // Equally, the result is also wrong if the point from the table is 1055 // zero, which happens when the index is zero. We handle that by 1056 // only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0. 1057 pIsNoninfiniteMask = nonZeroToAllOnes(index) 1058 mask = pIsNoninfiniteMask & ^nIsInfinityMask 1059 p256CopyConditional(xOut, &tx, mask) 1060 p256CopyConditional(yOut, &ty, mask) 1061 p256CopyConditional(zOut, &tz, mask) 1062 // If p was not zero, then n is now non-zero. 1063 nIsInfinityMask &^= pIsNoninfiniteMask 1064 } 1065 } 1066} 1067 1068// p256PointToAffine converts a Jacobian point to an affine point. If the input 1069// is the point at infinity then it returns (0, 0) in constant time. 1070func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) { 1071 var zInv, zInvSq [p256Limbs]uint32 1072 1073 p256Invert(&zInv, z) 1074 p256Square(&zInvSq, &zInv) 1075 p256Mul(xOut, x, &zInvSq) 1076 p256Mul(&zInv, &zInv, &zInvSq) 1077 p256Mul(yOut, y, &zInv) 1078} 1079 1080// p256ToAffine returns a pair of *big.Int containing the affine representation 1081// of {x,y,z}. 1082func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) { 1083 var xx, yy [p256Limbs]uint32 1084 p256PointToAffine(&xx, &yy, x, y, z) 1085 return p256ToBig(&xx), p256ToBig(&yy) 1086} 1087 1088// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}. 1089func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) { 1090 var px, py, pz, tx, ty, tz [p256Limbs]uint32 1091 var precomp [16][3][p256Limbs]uint32 1092 var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32 1093 1094 // We precompute 0,1,2,... times {x,y}. 1095 precomp[1][0] = *x 1096 precomp[1][1] = *y 1097 precomp[1][2] = p256One 1098 1099 for i := 2; i < 16; i += 2 { 1100 p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2]) 1101 p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y) 1102 } 1103 1104 for i := range xOut { 1105 xOut[i] = 0 1106 } 1107 for i := range yOut { 1108 yOut[i] = 0 1109 } 1110 for i := range zOut { 1111 zOut[i] = 0 1112 } 1113 nIsInfinityMask = ^uint32(0) 1114 1115 // We add in a window of four bits each iteration and do this 64 times. 1116 for i := 0; i < 64; i++ { 1117 if i != 0 { 1118 p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut) 1119 p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut) 1120 p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut) 1121 p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut) 1122 } 1123 1124 index = uint32(scalar[31-i/2]) 1125 if (i & 1) == 1 { 1126 index &= 15 1127 } else { 1128 index >>= 4 1129 } 1130 1131 // See the comments in scalarBaseMult about handling infinities. 1132 p256SelectJacobianPoint(&px, &py, &pz, &precomp, index) 1133 p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz) 1134 p256CopyConditional(xOut, &px, nIsInfinityMask) 1135 p256CopyConditional(yOut, &py, nIsInfinityMask) 1136 p256CopyConditional(zOut, &pz, nIsInfinityMask) 1137 1138 pIsNoninfiniteMask = nonZeroToAllOnes(index) 1139 mask = pIsNoninfiniteMask & ^nIsInfinityMask 1140 p256CopyConditional(xOut, &tx, mask) 1141 p256CopyConditional(yOut, &ty, mask) 1142 p256CopyConditional(zOut, &tz, mask) 1143 nIsInfinityMask &^= pIsNoninfiniteMask 1144 } 1145} 1146 1147// p256FromBig sets out = R*in. 1148func p256FromBig(out *[p256Limbs]uint32, in *big.Int) { 1149 tmp := new(big.Int).Lsh(in, 257) 1150 tmp.Mod(tmp, p256Params.P) 1151 1152 for i := 0; i < p256Limbs; i++ { 1153 if bits := tmp.Bits(); len(bits) > 0 { 1154 out[i] = uint32(bits[0]) & bottom29Bits 1155 } else { 1156 out[i] = 0 1157 } 1158 tmp.Rsh(tmp, 29) 1159 1160 i++ 1161 if i == p256Limbs { 1162 break 1163 } 1164 1165 if bits := tmp.Bits(); len(bits) > 0 { 1166 out[i] = uint32(bits[0]) & bottom28Bits 1167 } else { 1168 out[i] = 0 1169 } 1170 tmp.Rsh(tmp, 28) 1171 } 1172} 1173 1174// p256ToBig returns a *big.Int containing the value of in. 1175func p256ToBig(in *[p256Limbs]uint32) *big.Int { 1176 result, tmp := new(big.Int), new(big.Int) 1177 1178 result.SetInt64(int64(in[p256Limbs-1])) 1179 for i := p256Limbs - 2; i >= 0; i-- { 1180 if (i & 1) == 0 { 1181 result.Lsh(result, 29) 1182 } else { 1183 result.Lsh(result, 28) 1184 } 1185 tmp.SetInt64(int64(in[i])) 1186 result.Add(result, tmp) 1187 } 1188 1189 result.Mul(result, p256RInverse) 1190 result.Mod(result, p256Params.P) 1191 return result 1192} 1193