1// Copyright 2020 ConsenSys Software Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Code generated by consensys/gnark-crypto DO NOT EDIT
16
17package bw6761
18
19import (
20	"github.com/consensys/gnark-crypto/ecc"
21	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
22	"github.com/consensys/gnark-crypto/internal/parallel"
23	"math"
24	"runtime"
25	"sync"
26)
27
// selector stores the index, mask and shifts needed to select bits from a scalar
// it is used during the multiExp algorithm or the batch scalar multiplication
//
// A scalar is stored as fr.Limbs 64-bit words; the c-bit window at bit offset
// chunk*c lives in word `index`, and when c doesn't divide 64 it may spill
// into word index+1 (the multiWordSelect case).
type selector struct {
	index uint64 // index in the multi-word scalar to select bits from
	mask  uint64 // mask (c-bit wide)
	shift uint64 // shift needed to get our bits on low positions

	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
	maskHigh        uint64 // same as mask, for the bits spilling into word index+1
	shiftHigh       uint64 // same as shift, for the bits spilling into word index+1
}
39
// partitionScalars  compute, for each scalars over c-bit wide windows, nbChunk digits
// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
// 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G
// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
//
// Encoding of the result: each digit is written back at its window position in
// a fresh fr.Element; a negative digit -d is stored as (d-1) with the window's
// msb set, so chunk processors can recover sign and bucket index cheaply.
func partitionScalars(scalars []fr.Element, c uint64) []fr.Element {
	toReturn := make([]fr.Element, len(scalars))

	// number of c-bit radixes in a scalar
	nbChunks := fr.Limbs * 64 / c
	if (fr.Limbs*64)%c != 0 {
		nbChunks++ // ragged last window when c doesn't divide the scalar bit size
	}

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
	max := int(1 << (c - 1))          // max value we want for our digits
	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

	// compute offset and word selector / shift to select the right bits of our windows
	selectors := make([]selector, nbChunks)
	for chunk := uint64(0); chunk < nbChunks; chunk++ {
		jc := uint64(chunk * c) // bit offset of this window within the scalar
		d := selector{}
		d.index = jc / 64
		d.shift = jc - (d.index * 64)
		d.mask = mask << d.shift
		// a window straddles two words when it starts close enough to the top
		// of a word (and a next word exists)
		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
		if d.multiWordSelect {
			nbBitsHigh := d.shift - uint64(64-c)
			d.maskHigh = (1 << nbBitsHigh) - 1
			d.shiftHigh = (c - nbBitsHigh)
		}
		selectors[chunk] = d
	}

	// each iteration only reads scalars[i] and writes toReturn[i], so the
	// parallel split over i is race free
	parallel.Execute(len(scalars), func(start, end int) {
		for i := start; i < end; i++ {
			var carry int

			// for each chunk in the scalar, compute the current digit, and an eventual carry
			for chunk := uint64(0); chunk < nbChunks; chunk++ {
				s := selectors[chunk]

				// init with carry if any
				digit := carry
				carry = 0

				// digit = value of the c-bit window
				digit += int((scalars[i][s.index] & s.mask) >> s.shift)

				if s.multiWordSelect {
					// we are selecting bits over 2 words
					digit += int(scalars[i][s.index+1]&s.maskHigh) << s.shiftHigh
				}

				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
				// 2^{c} from the current digit, making it negative.
				if digit >= max {
					digit -= (1 << c)
					carry = 1
				}

				// encode the digit: non-negative digits are stored as-is,
				// negative digits store |digit|-1 with the window msb set
				var bits uint64
				if digit >= 0 {
					bits = uint64(digit)
				} else {
					bits = uint64(-digit-1) | msbWindow
				}

				toReturn[i][s.index] |= (bits << s.shift)
				if s.multiWordSelect {
					// spill the high part of the window into the next word
					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
				}

			}
		}
	})
	return toReturn
}
120
121// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
122// optionally, takes as parameter a ecc.CPUSemaphore struct
123// enabling to set max number of cpus to use
124func (p *G1Affine) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Affine {
125	var _p G1Jac
126	_p.MultiExp(points, scalars, opts...)
127	p.FromJacobian(&_p)
128	return p
129}
130
131// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
132// optionally, takes as parameter a ecc.CPUSemaphore struct
133// enabling to set max number of cpus to use
134func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G1Jac {
135	// note:
136	// each of the msmCX method is the same, except for the c constant it declares
137	// duplicating (through template generation) these methods allows to declare the buckets on the stack
138	// the choice of c needs to be improved:
139	// there is a theoritical value that gives optimal asymptotics
140	// but in practice, other factors come into play, including:
141	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
142	// * number of CPUs
143	// * cache friendliness (which depends on the host, G1 or G2... )
144	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
145
146	// for each msmCX
147	// step 1
148	// we compute, for each scalars over c-bit wide windows, nbChunk digits
149	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
150	// 2^{c} to the current digit, making it negative.
151	// negative digits will be processed in the next step as adding -G into the bucket instead of G
152	// (computing -G is cheap, and this saves us half of the buckets)
153	// step 2
154	// buckets are declared on the stack
155	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
156	// we use jacobian extended formulas here as they are faster than mixed addition
157	// msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel
158	// step 3
159	// reduce the buckets weigthed sums into our result (msmReduceChunk)
160
161	var opt *ecc.CPUSemaphore
162	if len(opts) > 0 {
163		opt = opts[0]
164	} else {
165		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
166	}
167
168	var C uint64
169	nbPoints := len(points)
170
171	// implemented msmC methods (the c we use must be in this slice)
172	implementedCs := []uint64{4, 5, 8, 16}
173
174	// approximate cost (in group operations)
175	// cost = bits/c * (nbPoints + 2^{c})
176	// this needs to be verified empirically.
177	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
178	min := math.MaxFloat64
179	for _, c := range implementedCs {
180		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
181		cost := float64(cc) / float64(c)
182		if cost < min {
183			min = cost
184			C = c
185		}
186	}
187
188	// empirical, needs to be tuned.
189	// if C > 16 && nbPoints < 1 << 23 {
190	// 	C = 16
191	// }
192
193	// take all the cpus to ourselves
194	opt.Lock.Lock()
195
196	// partition the scalars
197	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
198	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
199	scalars = partitionScalars(scalars, C)
200
201	switch C {
202
203	case 4:
204		return p.msmC4(points, scalars, opt)
205
206	case 5:
207		return p.msmC5(points, scalars, opt)
208
209	case 8:
210		return p.msmC8(points, scalars, opt)
211
212	case 16:
213		return p.msmC16(points, scalars, opt)
214
215	default:
216		panic("unimplemented")
217	}
218}
219
220// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
221func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
222	var _p g1JacExtended
223	totalj := <-chChunks[len(chChunks)-1]
224	_p.Set(&totalj)
225	for j := len(chChunks) - 2; j >= 0; j-- {
226		for l := 0; l < c; l++ {
227			_p.double(&_p)
228		}
229		totalj := <-chChunks[j]
230		_p.add(&totalj)
231	}
232
233	return p.unsafeFromJacExtended(&_p)
234}
235
// msmProcessChunkG1Affine accumulates, for a single c-bit chunk, every point
// into one of the provided buckets according to the scalar digit selected for
// that chunk, reduces the buckets into the weighted sum
// bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], sends it on chRes and closes
// the channel.
//
// scalars are expected in the recoded form produced by partitionScalars:
// a digit with the window msb set encodes a negative digit and is handled by
// subtracting the point from the matching bucket.
func msmProcessChunkG1Affine(chunk uint64,
	chRes chan<- g1JacExtended,
	buckets []g1JacExtended,
	c uint64,
	points []G1Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // sign bit of a recoded digit

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	// compute the selector for this chunk (same derivation as partitionScalars)
	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalars, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue // zero digit: nothing to accumulate
		}

		// if msbWindow bit is set, we need to subtract
		if bits&msbWindow == 0 {
			// add: digit d goes to bucket d-1 (bucket 0 holds digit 1)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub: a negative digit -d was encoded as (d-1)|msbWindow, so
			// clearing the msb yields the bucket index directly
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// scanning from the top bucket down: each bucket enters runningSum once,
	// and runningSum enters total once per iteration, which yields the
	// weights 1..n without any scalar multiplication

	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		// skip untouched buckets (ZZ == 0 marks the point at infinity)
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	close(chRes)
}
299
// msmC4 runs the bucket method with a window width of c = 4 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done is called at start on purpose: the wait group only tracks
			// that each goroutine has started; completion is signaled via chRes
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
328
// msmC5 runs the bucket method with a window width of c = 5 bits.
// Since c doesn't divide fr.Limbs*64, there is one extra, smaller top window
// of lastC bits; it is scheduled first with a reduced bucket array.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller we can allocate less buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g1JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
		// Done at start on purpose: wg tracks goroutine starts, not completion
		wg.Done()
		var buckets [1 << (lastC - 1)]g1JacExtended
		// note: still called with c; for this top chunk the shifted c-bit mask
		// runs off the end of the word, leaving only the remaining lastC bits
		// — NOTE(review): confirm against partitionScalars' top-window output
		msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	// schedule the remaining full-width chunks
	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
369
// msmC8 runs the bucket method with a window width of c = 8 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
398
// msmC16 runs the bucket method with a window width of c = 16 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G1Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g1JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g1JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g1JacExtended, points []G1Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g1JacExtended
			msmProcessChunkG1Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG1Affine(p, c, chChunks[:])
}
427
428// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
429// optionally, takes as parameter a ecc.CPUSemaphore struct
430// enabling to set max number of cpus to use
431func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Affine {
432	var _p G2Jac
433	_p.MultiExp(points, scalars, opts...)
434	p.FromJacobian(&_p)
435	return p
436}
437
438// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
439// optionally, takes as parameter a ecc.CPUSemaphore struct
440// enabling to set max number of cpus to use
441func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, opts ...*ecc.CPUSemaphore) *G2Jac {
442	// note:
443	// each of the msmCX method is the same, except for the c constant it declares
444	// duplicating (through template generation) these methods allows to declare the buckets on the stack
445	// the choice of c needs to be improved:
446	// there is a theoritical value that gives optimal asymptotics
447	// but in practice, other factors come into play, including:
448	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
449	// * number of CPUs
450	// * cache friendliness (which depends on the host, G1 or G2... )
451	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
452
453	// for each msmCX
454	// step 1
455	// we compute, for each scalars over c-bit wide windows, nbChunk digits
456	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
457	// 2^{c} to the current digit, making it negative.
458	// negative digits will be processed in the next step as adding -G into the bucket instead of G
459	// (computing -G is cheap, and this saves us half of the buckets)
460	// step 2
461	// buckets are declared on the stack
462	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
463	// we use jacobian extended formulas here as they are faster than mixed addition
464	// msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel
465	// step 3
466	// reduce the buckets weigthed sums into our result (msmReduceChunk)
467
468	var opt *ecc.CPUSemaphore
469	if len(opts) > 0 {
470		opt = opts[0]
471	} else {
472		opt = ecc.NewCPUSemaphore(runtime.NumCPU())
473	}
474
475	var C uint64
476	nbPoints := len(points)
477
478	// implemented msmC methods (the c we use must be in this slice)
479	implementedCs := []uint64{4, 5, 8, 16}
480
481	// approximate cost (in group operations)
482	// cost = bits/c * (nbPoints + 2^{c})
483	// this needs to be verified empirically.
484	// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
485	min := math.MaxFloat64
486	for _, c := range implementedCs {
487		cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
488		cost := float64(cc) / float64(c)
489		if cost < min {
490			min = cost
491			C = c
492		}
493	}
494
495	// empirical, needs to be tuned.
496	// if C > 16 && nbPoints < 1 << 23 {
497	// 	C = 16
498	// }
499
500	// take all the cpus to ourselves
501	opt.Lock.Lock()
502
503	// partition the scalars
504	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
505	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
506	scalars = partitionScalars(scalars, C)
507
508	switch C {
509
510	case 4:
511		return p.msmC4(points, scalars, opt)
512
513	case 5:
514		return p.msmC5(points, scalars, opt)
515
516	case 8:
517		return p.msmC8(points, scalars, opt)
518
519	case 16:
520		return p.msmC16(points, scalars, opt)
521
522	default:
523		panic("unimplemented")
524	}
525}
526
527// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
528func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
529	var _p g2JacExtended
530	totalj := <-chChunks[len(chChunks)-1]
531	_p.Set(&totalj)
532	for j := len(chChunks) - 2; j >= 0; j-- {
533		for l := 0; l < c; l++ {
534			_p.double(&_p)
535		}
536		totalj := <-chChunks[j]
537		_p.add(&totalj)
538	}
539
540	return p.unsafeFromJacExtended(&_p)
541}
542
// msmProcessChunkG2Affine accumulates, for a single c-bit chunk, every point
// into one of the provided buckets according to the scalar digit selected for
// that chunk, reduces the buckets into the weighted sum
// bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], sends it on chRes and closes
// the channel.
//
// scalars are expected in the recoded form produced by partitionScalars:
// a digit with the window msb set encodes a negative digit and is handled by
// subtracting the point from the matching bucket.
func msmProcessChunkG2Affine(chunk uint64,
	chRes chan<- g2JacExtended,
	buckets []g2JacExtended,
	c uint64,
	points []G2Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1)      // low c bits are 1
	msbWindow := uint64(1 << (c - 1)) // sign bit of a recoded digit

	for i := 0; i < len(buckets); i++ {
		buckets[i].setInfinity()
	}

	// compute the selector for this chunk (same derivation as partitionScalars)
	jc := uint64(chunk * c)
	s := selector{}
	s.index = jc / 64
	s.shift = jc - (s.index * 64)
	s.mask = mask << s.shift
	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
	if s.multiWordSelect {
		nbBitsHigh := s.shift - uint64(64-c)
		s.maskHigh = (1 << nbBitsHigh) - 1
		s.shiftHigh = (c - nbBitsHigh)
	}

	// for each scalars, get the digit corresponding to the chunk we're processing.
	for i := 0; i < len(scalars); i++ {
		bits := (scalars[i][s.index] & s.mask) >> s.shift
		if s.multiWordSelect {
			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
		}

		if bits == 0 {
			continue // zero digit: nothing to accumulate
		}

		// if msbWindow bit is set, we need to subtract
		if bits&msbWindow == 0 {
			// add: digit d goes to bucket d-1 (bucket 0 holds digit 1)
			buckets[bits-1].addMixed(&points[i])
		} else {
			// sub: a negative digit -d was encoded as (d-1)|msbWindow, so
			// clearing the msb yields the bucket index directly
			buckets[bits & ^msbWindow].subMixed(&points[i])
		}
	}

	// reduce buckets into total
	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
	// scanning from the top bucket down: each bucket enters runningSum once,
	// and runningSum enters total once per iteration, which yields the
	// weights 1..n without any scalar multiplication

	var runningSum, total g2JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		// skip untouched buckets (ZZ == 0 marks the point at infinity)
		if !buckets[k].ZZ.IsZero() {
			runningSum.add(&buckets[k])
		}
		total.add(&runningSum)
	}

	chRes <- total
	close(chRes)
}
606
// msmC4 runs the bucket method with a window width of c = 4 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 4                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
635
// msmC5 runs the bucket method with a window width of c = 5 bits.
// Since c doesn't divide fr.Limbs*64, there is one extra, smaller top window
// of lastC bits; it is scheduled first with a reduced bucket array.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 5                              // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) + 1 // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	// c doesn't divide 384, last window is smaller we can allocate less buckets
	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
	chChunks[nbChunks-1] = make(chan g2JacExtended, 1)
	<-opt.ChCPU // wait to have a cpu before scheduling
	wg.Add(1)
	go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
		// Done at start on purpose: wg tracks goroutine starts, not completion
		wg.Done()
		var buckets [1 << (lastC - 1)]g2JacExtended
		// note: still called with c; for this top chunk the shifted c-bit mask
		// runs off the end of the word, leaving only the remaining lastC bits
		// — NOTE(review): confirm against partitionScalars' top-window output
		msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
		opt.ChCPU <- struct{}{} // release token in the semaphore
	}(uint64(nbChunks-1), chChunks[nbChunks-1], points, scalars)

	// schedule the remaining full-width chunks
	for chunk := nbChunks - 2; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			wg.Done()
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
676
// msmC8 runs the bucket method with a window width of c = 8 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 8                          // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
705
// msmC16 runs the bucket method with a window width of c = 16 bits.
// scalars must already be recoded by partitionScalars with the same c, and
// opt.Lock must be held by the caller; it is released here once every chunk
// goroutine has been scheduled.
func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, opt *ecc.CPUSemaphore) *G2Jac {
	const c = 16                         // scalars partitioned into c-bit radixes
	const nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar

	// for each chunk, spawn a go routine that'll loop through all the scalars
	var chChunks [nbChunks]chan g2JacExtended

	// wait group to wait for all the go routines to start
	var wg sync.WaitGroup
	for chunk := nbChunks - 1; chunk >= 0; chunk-- {
		chChunks[chunk] = make(chan g2JacExtended, 1)
		<-opt.ChCPU // wait to have a cpu before scheduling
		wg.Add(1)
		go func(j uint64, chRes chan g2JacExtended, points []G2Affine, scalars []fr.Element) {
			// Done at start on purpose: wg tracks goroutine starts, not completion
			wg.Done()
			// buckets on the stack (c is a compile-time constant)
			var buckets [1 << (c - 1)]g2JacExtended
			msmProcessChunkG2Affine(j, chRes, buckets[:], c, points, scalars)
			opt.ChCPU <- struct{}{} // release token in the semaphore
		}(uint64(chunk), chChunks[chunk], points, scalars)
	}

	// wait for all goRoutines to actually start
	wg.Wait()

	// all my tasks are scheduled, I can let other func use available tokens in the semaphore
	opt.Lock.Unlock()
	return msmReduceChunkG2Affine(p, c, chChunks[:])
}
734