1// Copyright 2020 ConsenSys Software Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Code generated by consensys/gnark-crypto DO NOT EDIT
16
17package bw6761
18
19import (
20	"github.com/consensys/gnark-crypto/ecc"
21	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
22	"github.com/consensys/gnark-crypto/internal/parallel"
23	"math"
24	"runtime"
25	"sync"
26)
27
// selector stores the index, mask and shifts needed to select bits from a scalar
// it is used during the multiExp algorithm or the batch scalar multiplication
//
// A scalar is stored as fr.Limbs 64-bit words; the c-bit window at bit offset
// chunk*c lives in word `index`, and when c doesn't divide 64 it may spill
// into word index+1 (the multiWordSelect case).
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits on low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for the bits spilling into word index+1
	shiftHigh       uint64 // same as shift, for the bits spilling into word index+1
}
39
// partitionScalars  compute, for each scalars over c-bit wide windows, nbChunk digits
// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
// 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
//
// Encoding of the result: each digit is written back at its window position in
// a fresh fr.Element; a negative digit -d is stored as (d-1) with the window's
// msb set, so chunk processors can recover sign and bucket index cheaply.
func partitionScalars(scalars []fr.Element, c uint64) []fr.Element {
	toReturn := make([]fr.Element, len(scalars))

	// number of c-bit radixes in a scalar
	nbChunks := fr.Limbs * 64 / c
	if (fr.Limbs*64)%c != 0 {
		nbChunks++ // ragged last window when c doesn't divide the scalar bit size
	}

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
	max := int(1 << (c - 1))          // max value we want for our digits
	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c) // bit offset of this window within the scalar
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		// a window straddles two words when it starts close enough to the top
		// of a word (and a next word exists)
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	// each iteration only reads scalars[i] and writes toReturn[i], so the
	// parallel split over i is race free
	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			var carry int

			// for each chunk in the scalar, compute the current digit, and an eventual carry
			for chunk := uint64(0); chunk < nbChunks; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalars[i][s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalars[i][s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
				// 2^{c} from the current digit, making it negative.
				if digit >= max {
					digit -= (1 << c)
					carry = 1
				}

				// encode the digit: non-negative digits are stored as-is,
				// negative digits store |digit|-1 with the window msb set
				var bits uint64
				if digit >= 0 {
					bits = uint64(digit)
				} else {
					bits = uint64(-digit-1) | msbWindow
				}

				toReturn[i][s.index] |= (bits << s.shift)
				if s.multiWordSelect {
					// spill the high part of the window into the next word
					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
				}

			}
		}
	})
	return toReturn
}
120
121// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
122// optionally, takes as parameter a ecc.CPUSemaphore struct
123// enabling to set max number of cpus to use
124func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Affine {
125	var _p G1Jac
126	_p.MultiExp(points, scalars, opts...)
127	p.FromJacobian(&_p)
128	return p
129}
130
131// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
132// optionally, takes as parameter a ecc.CPUSemaphore struct
133// enabling to set max number of cpus to use
134func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Jac {
135	// note:
136	// each of the msmCX method is the same, except for the c constant it declares
137	// duplicating (through template generation) these methods allows to declare the buckets on the stack
138	// the choice of c needs to be improved:
139	// there is a theoritical value that gives optimal asymptotics
140	// but in practice, other factors come into play, including:
141	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
142	// * number of CPUs
143	// * cache friendliness (which depends on the host, G1 or G2... )
144	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
145
146	// for each msmCX
147	// step 1
148	// we compute, for each scalars over c-bit wide windows, nbChunk digits
149	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
150	// 2^{c} to the current digit, making it negative.
151	// negative digits will be processed in the next step as adding -G into the bucket instead of G
152	// (computing -G is cheap, and this saves us half of the buckets)
153	// step 2
154	// buckets are declared on the stack
155	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
156	// we use jacobian extended formulas here as they are faster than mixed addition
157	// msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel
158	// step 3
159	// reduce the buckets weigthed sums into our result (msmReduceChunk)
160
161	var opt *ecc.CPUSemaphore
162	if len(opts) > 0 {
163		opt = opts[0]
164	} else {
165		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
166	}
167
168	var C uint64
169	nbPoints := len(points)
170
171	// implemented msmC methods (the c we use must be in this slice)
172	implementedCs := []uint64{4, 5, 8, 16}
173
174	// approximate cost (in group operations)
175	// cost = bits/c * (nbPoints + 2^{c})
176	// this needs to be verified empirically.
177	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
178	min := math.MaxFloat64
179	for _, c := range implementedCs {
180		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
181		cost := float64(cc) / float64(c)
182		if cost < min {
183			min = cost
184			C = c
185		}
186	}
187
188	// empirical, needs to be tuned.
189	// if C > 16 && nbPoints < 1 << 23 {
190	// 	C = 16
191	// }
192
193	// take all the cpus to ourselves
194	opt.Lock.Lock()
195
196	// partition the scalars
197	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
198	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
199	scalars = partitionScalars(scalars, C)
200
201	switch C {
202
203	case 4:
204		return p.msmC4(points, scalars, opt)
205
206	case 5:
207		return p.msmC5(points, scalars, opt)
208
209	case 8:
210		return p.msmC8(points, scalars, opt)
211
212	case 16:
213		return p.msmC16(points, scalars, opt)
214
215	default:
216		panic("unimplemented")
217	}
218}
219
220// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
221func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
222	var _p g1JacExtended
223	totalj := <-chChunks[len(chChunks)-1]
224	_p.Set(&totalj)
225	for j := len(chChunks) - 2; j >= 0; j-- {
226		for l := 0; l < c; l++ {
227			_p.double(&_p)
228		}
229		totalj := <-chChunks[j]
230		_p.add(&totalj)
231	}
232
233	return p.unsafeFromJacExtended(&_p)
234}
235
// msmProcessChunkG1Affine accumulates, for a single c-bit chunk, every point
// into one of the provided buckets according to the scalar digit selected for
// that chunk, reduces the buckets into the weighted sum
// bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], sends it on chRes and closes
// the channel.
//
// scalars are expected in the recoded form produced by partitionScalars:
// a digit with the window msb set encodes a negative digit and is handled by
// subtracting the point from the matching bucket.
func msmProcessChunkG1Affine(chunk uint64,
	chRes chan<- g1JacExtended,
	buckets []g1JacExtended,
	c uint64,
	points []G1Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // sign bit of a recoded digit

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	// compute the selector for this chunk (same derivation as partitionScalars)
	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalars, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue // zero digit: nothing to accumulate
		}

		// if msbWindow bit is set, we need to subtract
		if bits&msbWindow == 0 {
			// add: digit d goes to bucket d-1 (bucket 0 holds digit 1)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub: a negative digit -d was encoded as (d-1)|msbWindow, so
			// clearing the msb yields the bucket index directly
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// scanning from the top bucket down: each bucket enters runningSum once,
	// and runningSum enters total once per iteration, which yields the
	// weights 1..n without any scalar multiplication

	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		// skip untouched buckets (ZZ == 0 marks the point at infinity)
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	close(chRes)
}
299
// msmC4 runs the bucket method with a window width of c = 4 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done is called at start on purpose: the wait group only tracks
			// that each goroutine has started; completion is signaled via chRes
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
328
// msmC5 runs the bucket method with a window width of c = 5 bits.
// Since c doesn't divide fr.Limbs*64, there is one extra, smaller top window
// of lastC bits; it is scheduled first with a reduced bucket array.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller we can allocate less buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g1JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
		// Done at start on purpose: wg tracks goroutine starts, not completion
		wg.Done()
		var buckets [1 << (lastC - 1)]g1JacExtended
		// note: still called with c; for this top chunk the shifted c-bit mask
		// runs off the end of the word, leaving only the remaining lastC bits
		// — NOTE(review): confirm against partitionScalars' top-window output
		msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	// schedule the remaining full-width chunks
	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
369
// msmC8 runs the bucket method with a window width of c = 8 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
398
// msmC16 runs the bucket method with a window width of c = 16 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
427
428// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
429// optionally, takes as parameter a ecc.CPUSemaphore struct
430// enabling to set max number of cpus to use
431func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Affine {
432	var _p G2Jac
433	_p.MultiExp(points, scalars, opts...)
434	p.FromJacobian(&_p)
435	return p
436}
437
438// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
439// optionally, takes as parameter a ecc.CPUSemaphore struct
440// enabling to set max number of cpus to use
441func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Jac {
442	// note:
443	// each of the msmCX method is the same, except for the c constant it declares
444	// duplicating (through template generation) these methods allows to declare the buckets on the stack
445	// the choice of c needs to be improved:
446	// there is a theoritical value that gives optimal asymptotics
447	// but in practice, other factors come into play, including:
448	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
449	// * number of CPUs
450	// * cache friendliness (which depends on the host, G1 or G2... )
451	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
452
453	// for each msmCX
454	// step 1
455	// we compute, for each scalars over c-bit wide windows, nbChunk digits
456	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
457	// 2^{c} to the current digit, making it negative.
458	// negative digits will be processed in the next step as adding -G into the bucket instead of G
459	// (computing -G is cheap, and this saves us half of the buckets)
460	// step 2
461	// buckets are declared on the stack
462	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
463	// we use jacobian extended formulas here as they are faster than mixed addition
464	// msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel
465	// step 3
466	// reduce the buckets weigthed sums into our result (msmReduceChunk)
467
468	var opt *ecc.CPUSemaphore
469	if len(opts) > 0 {
470		opt = opts[0]
471	} else {
472		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
473	}
474
475	var C uint64
476	nbPoints := len(points)
477
478	// implemented msmC methods (the c we use must be in this slice)
479	implementedCs := []uint64{4, 5, 8, 16}
480
481	// approximate cost (in group operations)
482	// cost = bits/c * (nbPoints + 2^{c})
483	// this needs to be verified empirically.
484	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
485	min := math.MaxFloat64
486	for _, c := range implementedCs {
487		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
488		cost := float64(cc) / float64(c)
489		if cost < min {
490			min = cost
491			C = c
492		}
493	}
494
495	// empirical, needs to be tuned.
496	// if C > 16 && nbPoints < 1 << 23 {
497	// 	C = 16
498	// }
499
500	// take all the cpus to ourselves
501	opt.Lock.Lock()
502
503	// partition the scalars
504	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
505	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
506	scalars = partitionScalars(scalars, C)
507
508	switch C {
509
510	case 4:
511		return p.msmC4(points, scalars, opt)
512
513	case 5:
514		return p.msmC5(points, scalars, opt)
515
516	case 8:
517		return p.msmC8(points, scalars, opt)
518
519	case 16:
520		return p.msmC16(points, scalars, opt)
521
522	default:
523		panic("unimplemented")
524	}
525}
526
527// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
528func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
529	var _p g2JacExtended
530	totalj := <-chChunks[len(chChunks)-1]
531	_p.Set(&totalj)
532	for j := len(chChunks) - 2; j >= 0; j-- {
533		for l := 0; l < c; l++ {
534			_p.double(&_p)
535		}
536		totalj := <-chChunks[j]
537		_p.add(&totalj)
538	}
539
540	return p.unsafeFromJacExtended(&_p)
541}
542
// msmProcessChunkG2Affine accumulates, for a single c-bit chunk, every point
// into one of the provided buckets according to the scalar digit selected for
// that chunk, reduces the buckets into the weighted sum
// bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], sends it on chRes and closes
// the channel.
//
// scalars are expected in the recoded form produced by partitionScalars:
// a digit with the window msb set encodes a negative digit and is handled by
// subtracting the point from the matching bucket.
func msmProcessChunkG2Affine(chunk uint64,
	chRes chan<- g2JacExtended,
	buckets []g2JacExtended,
	c uint64,
	points []G2Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // sign bit of a recoded digit

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	// compute the selector for this chunk (same derivation as partitionScalars)
	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalars, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue // zero digit: nothing to accumulate
		}

		// if msbWindow bit is set, we need to subtract
		if bits&msbWindow == 0 {
			// add: digit d goes to bucket d-1 (bucket 0 holds digit 1)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub: a negative digit -d was encoded as (d-1)|msbWindow, so
			// clearing the msb yields the bucket index directly
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// scanning from the top bucket down: each bucket enters runningSum once,
	// and runningSum enters total once per iteration, which yields the
	// weights 1..n without any scalar multiplication

	var runningSum, total g2JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		// skip untouched buckets (ZZ == 0 marks the point at infinity)
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	close(chRes)
}
606
// msmC4 runs the bucket method with a window width of c = 4 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
635
// msmC5 runs the bucket method with a window width of c = 5 bits.
// Since c doesn't divide fr.Limbs*64, there is one extra, smaller top window
// of lastC bits; it is scheduled first with a reduced bucket array.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller we can allocate less buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g2JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
		// Done at start on purpose: wg tracks goroutine starts, not completion
		wg.Done()
		var buckets [1 << (lastC - 1)]g2JacExtended
		// note: still called with c; for this top chunk the shifted c-bit mask
		// runs off the end of the word, leaving only the remaining lastC bits
		// — NOTE(review): confirm against partitionScalars' top-window output
		msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	// schedule the remaining full-width chunks
	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
676
// msmC8 runs the bucket method with a window width of c = 8 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
705
// msmC16 runs the bucket method with a window width of c = 16 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
734