// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Code generated by consensys/gnark-crypto DO NOT EDIT

package bw6761

import (
	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
	"github.com/consensys/gnark-crypto/internal/parallel"
	"math"
	"runtime"
	"sync"
)

// selector stores the index, mask and shifts needed to select bits from a scalar.
// It is used during the multiExp algorithm or the batch scalar multiplication.
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits on low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for index+1
	shiftHigh       uint64 // same as shift, for index+1
}

// partitionScalars computes, for each scalar, over c-bit wide windows, nbChunk digits.
// If a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
// 2^{c} from the current digit, making it negative.
// Negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul).
//
// The returned slice stores, per scalar and per window, the encoded digit:
// non-negative digits are stored as-is, negative digit d is stored as (-d-1) with the
// window's msb set (msbWindow) as a sign marker.
func partitionScalars(scalars []fr.Element, c uint64) []fr.Element {
	toReturn := make([]fr.Element, len(scalars))

	// number of c-bit radixes in a scalar
	nbChunks := fr.Limbs * 64 / c
	if (fr.Limbs*64)%c != 0 {
		// c does not divide the bit size: one extra (smaller) chunk for the leftover high bits
		nbChunks++
	}

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window, used as the "negative digit" marker
	max := int(1 << (c - 1))          // max value we want for our digits
	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c)
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		// the window straddles two 64-bit words when its low bit is in word `index`
		// but its high bits spill into word `index+1`
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			var carry int

			// for each chunk in the scalar, compute the current digit, and an eventual carry
			for chunk := uint64(0); chunk < nbChunks; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalars[i][s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalars[i][s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
				// and subtract 2^{c} from the current digit, making it negative.
				if digit >= max {
					digit -= (1 << c)
					carry = 1
				}

				// encode the digit: negative d is stored as (-d-1) with msbWindow set,
				// so the bucket index and the sign can both be recovered in msmProcessChunk
				var bits uint64
				if digit >= 0 {
					bits = uint64(digit)
				} else {
					bits = uint64(-digit-1) | msbWindow
				}

				toReturn[i][s.index] |= (bits << s.shift)
				if s.multiWordSelect {
					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
				}

			}
		}
	})
	return toReturn
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
// optionally, takes as parameter a ecc.CPUSemaphore struct
// enabling to set max number of cpus to use
func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Affine {
	var _p G1Jac
	_p.MultiExp(points, scalars, opts...)
	p.FromJacobian(&_p)
	return p
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
// optionally, takes as parameter a ecc.CPUSemaphore struct
// enabling to set max number of cpus to use
func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Jac {
	// note:
	// each of the msmCX method is the same, except for the c constant it declares
	// duplicating (through template generation) these methods allows to declare the buckets on the stack
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics
	// but in practice, other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2... )
	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX
	// step 1
	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets)
	// step 2
	// buckets are declared on the stack
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
	// we use jacobian extended formulas here as they are faster than mixed addition
	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
	// step 3
	// reduce the buckets weighted sums into our result (msmReduceChunk)

	var opt *ecc.CPUSemaphore
	if len(opts) > 0 {
		opt = opts[0]
	} else {
		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
	}

	var C uint64
	nbPoints := len(points)

	// implemented msmC methods (the c we use must be in this slice)
	implementedCs := []uint64{4, 5, 8, 16}

	// approximate cost (in group operations)
	// cost = bits/c * (nbPoints + 2^{c})
	// this needs to be verified empirically.
	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}

	// empirical, needs to be tuned.
	// if C > 16 && nbPoints < 1 << 23 {
	// 	C = 16
	// }

	// take all the cpus to ourselves; released by the msmCX method once all
	// its goroutines are scheduled
	opt.Lock.Lock()

	// partition the scalars
	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
	scalars = partitionScalars(scalars, C)

	switch C {

	case 4:
		return p.msmC4(points, scalars, opt)

	case 5:
		return p.msmC5(points, scalars, opt)

	case 8:
		return p.msmC8(points, scalars, opt)

	case 16:
		return p.msmC16(points, scalars, opt)

	default:
		// C was picked from implementedCs, so this is unreachable unless the two lists diverge
		panic("unimplemented")
	}
}

// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp.
// Chunks are combined from most-significant to least-significant: the running total is doubled
// c times between chunks (i.e. multiplied by 2^c) before the next chunk sum is added.
// Receiving on chChunks blocks until each chunk goroutine has sent its partial sum.
func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
	var _p g1JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// msmProcessChunkG1Affine accumulates, for one c-bit window (chunk), every point into the
// bucket selected by its digit, then sends the weighted bucket sum on chRes and closes it.
// scalars are expected to be pre-processed by partitionScalars (sign-encoded digits).
func msmProcessChunkG1Affine(chunk uint64,
	chRes chan<- g1JacExtended,
	buckets []g1JacExtended,
	c uint64,
	points []G1Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1) // low c bits are 1
	msbWindow := uint64(1 << (c - 1))

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalar, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue
		}

		// if msbWindow bit is set, the digit was negative: we need to subtract
		if bits&msbWindow == 0 {
			// add (digit d goes into bucket d-1; digit 0 was skipped above)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub (clearing msbWindow recovers -d-1, i.e. the bucket index of |d|)
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// (running-sum trick: scanning buckets from high to low, each bucket is added
	// into runningSum once, and runningSum is added into total once per step)

	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	// we are the only sender on this channel; close so the reducer's receive completes
	close(chRes)
}

func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}

func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller: we can allocate fewer buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g1JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
		// signal start only; completion is signaled via chRes
		wg.Done()
		var buckets [1 << (lastC - 1)]g1JacExtended
		msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}

func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}

func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
// optionally, takes as parameter a ecc.CPUSemaphore struct
// enabling to set max number of cpus to use
func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Affine {
	var _p G2Jac
	_p.MultiExp(points, scalars, opts...)
	p.FromJacobian(&_p)
	return p
}

// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
// optionally, takes as parameter a ecc.CPUSemaphore struct
// enabling to set max number of cpus to use
func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Jac {
	// note:
	// each of the msmCX method is the same, except for the c constant it declares
	// duplicating (through template generation) these methods allows to declare the buckets on the stack
	// the choice of c needs to be improved:
	// there is a theoretical value that gives optimal asymptotics
	// but in practice, other factors come into play, including:
	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
	// * number of CPUs
	// * cache friendliness (which depends on the host, G1 or G2... )
	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

	// for each msmCX
	// step 1
	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
	// 2^{c} from the current digit, making it negative.
	// negative digits will be processed in the next step as adding -G into the bucket instead of G
	// (computing -G is cheap, and this saves us half of the buckets)
	// step 2
	// buckets are declared on the stack
	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
	// we use jacobian extended formulas here as they are faster than mixed addition
	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
	// step 3
	// reduce the buckets weighted sums into our result (msmReduceChunk)

	var opt *ecc.CPUSemaphore
	if len(opts) > 0 {
		opt = opts[0]
	} else {
		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
	}

	var C uint64
	nbPoints := len(points)

	// implemented msmC methods (the c we use must be in this slice)
	implementedCs := []uint64{4, 5, 8, 16}

	// approximate cost (in group operations)
	// cost = bits/c * (nbPoints + 2^{c})
	// this needs to be verified empirically.
	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}

	// empirical, needs to be tuned.
	// if C > 16 && nbPoints < 1 << 23 {
	// 	C = 16
	// }

	// take all the cpus to ourselves; released by the msmCX method once all
	// its goroutines are scheduled
	opt.Lock.Lock()

	// partition the scalars
	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
	scalars = partitionScalars(scalars, C)

	switch C {

	case 4:
		return p.msmC4(points, scalars, opt)

	case 5:
		return p.msmC5(points, scalars, opt)

	case 8:
		return p.msmC8(points, scalars, opt)

	case 16:
		return p.msmC16(points, scalars, opt)

	default:
		// C was picked from implementedCs, so this is unreachable unless the two lists diverge
		panic("unimplemented")
	}
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp.
// Chunks are combined from most-significant to least-significant: the running total is doubled
// c times between chunks (i.e. multiplied by 2^c) before the next chunk sum is added.
// Receiving on chChunks blocks until each chunk goroutine has sent its partial sum.
func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	var _p g2JacExtended
	totalj := <-chChunks[len(chChunks)-1]
	_p.Set(&totalj)
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			_p.double(&_p)
		}
		totalj := <-chChunks[j]
		_p.add(&totalj)
	}

	return p.unsafeFromJacExtended(&_p)
}

// msmProcessChunkG2Affine accumulates, for one c-bit window (chunk), every point into the
// bucket selected by its digit, then sends the weighted bucket sum on chRes and closes it.
// scalars are expected to be pre-processed by partitionScalars (sign-encoded digits).
func msmProcessChunkG2Affine(chunk uint64,
	chRes chan<- g2JacExtended,
	buckets []g2JacExtended,
	c uint64,
	points []G2Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1) // low c bits are 1
	msbWindow := uint64(1 << (c - 1))

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalar, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue
		}

		// if msbWindow bit is set, the digit was negative: we need to subtract
		if bits&msbWindow == 0 {
			// add (digit d goes into bucket d-1; digit 0 was skipped above)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub (clearing msbWindow recovers -d-1, i.e. the bucket index of |d|)
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// (running-sum trick: scanning buckets from high to low, each bucket is added
	// into runningSum once, and runningSum is added into total once per step)

	var runningSum, total g2JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	// we are the only sender on this channel; close so the reducer's receive completes
	close(chRes)
}

func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}

func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller: we can allocate fewer buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g2JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
		// signal start only; completion is signaled via chRes
		wg.Done()
		var buckets [1 << (lastC - 1)]g2JacExtended
		msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}

func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}

func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// signal start only; completion is signaled via chRes
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}