1// Copyright 2020 ConsenSys Software Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package amd64
16
17import (
18	"github.com/consensys/bavard/amd64"
19)
20
21func (fq2 *Fq2Amd64) generateMulByNonResidueE2BLS381() {
22	// // MulByNonResidue multiplies a E2 by (1,1)
23	// func (z *E2) MulByNonResidue(x *E2) *E2 {
24	// 	var a fp.Element
25	// 	a.Sub(&x.A0, &x.A1)
26	// 	z.A1.Add(&x.A0, &x.A1)
27	// 	z.A0.Set(&a)
28	// 	return z
29	// }
30	registers := fq2.FnHeader("mulNonResE2", 0, 16)
31
32	a := registers.PopN(fq2.NbWords)
33	b := registers.PopN(fq2.NbWords)
34	x := registers.Pop()
35	tr := registers.Pop() // zero or r
36	fq2.XORQ(tr, tr)      // set to zero
37
38	fq2.MOVQ("x+8(FP)", x)
39	fq2.Mov(x, a) // a = a0
40
41	// a = x.A0 - x.A1
42	fq2.Sub(x, a, fq2.NbWords)
43	fq2.modReduceAfterSubScratch(tr, a, b)
44	// b = x.A0 + x.A1
45	fq2.Mov(x, b, fq2.NbWords) // b = a1
46	fq2.Add(x, b)
47
48	fq2.MOVQ("res+0(FP)", tr)
49	fq2.Mov(a, tr)
50	fq2.ReduceElement(b, a)
51	fq2.Mov(b, tr, 0, fq2.NbWords)
52
53	fq2.RET()
54}
55
56func (fq2 *Fq2Amd64) generateSquareE2BLS381(forceCheck bool) {
57	// // Square sets z to the E2-product of x,x returns z
58	// func (z *E2) Square(x *E2) *E2 {
59	// 	// algo 22 https://eprint.iacr.org/2010/354.pdf
60	// 	var a, b fp.Element
61	// 	a.Add(&x.A0, &x.A1)
62	// 	b.Sub(&x.A0, &x.A1)
63	// 	a.Mul(&a, &b)
64	// 	b.Mul(&x.A0, &x.A1).Double(&b)
65	// 	z.A0.Set(&a)
66	// 	z.A1.Set(&b)
67	// 	return z
68	// }
69	const argSize = 16
70	minStackSize := 0
71	if forceCheck {
72		minStackSize = argSize
73	}
74	stackSize := fq2.StackSize(fq2.NbWords*3, 2, minStackSize)
75	registers := fq2.FnHeader("squareAdxE2", stackSize, argSize, amd64.DX, amd64.AX)
76	defer fq2.AssertCleanStack(stackSize, minStackSize)
77	fq2.WriteLn("NO_LOCAL_POINTERS")
78
79	fq2.WriteLn(`
80	// z.A0 = (x.A0 + x.A1) * (x.A0 - x.A1)
81	// z.A1 = 2 * x.A0 * x.A1
82	`)
83
84	noAdx := fq2.NewLabel()
85	if forceCheck {
86		// check ADX instruction support
87		fq2.CMPB("·supportAdx(SB)", 1)
88		fq2.JNE(noAdx)
89	}
90
91	// used in the mul operation
92	op1 := registers.PopN(fq2.NbWords)
93	res := registers.PopN(fq2.NbWords)
94
95	xat := func(i int) string {
96		return string(op1[i])
97	}
98
99	ax := amd64.AX
100	dx := amd64.DX
101
102	// b = a0 * a1 * 2
103
104	fq2.Comment("2 * x.A0 * x.A1")
105	fq2.MOVQ("x+8(FP)", ax)
106
107	fq2.LabelRegisters("2 * x.A1", op1...)
108	fq2.Mov(ax, op1, fq2.NbWords)
109	fq2.Add(op1, op1) // op1, no reduce
110
111	fq2.MulADX(&registers, xat, func(i int) string {
112		fq2.MOVQ("x+8(FP)", dx)
113		return dx.At(i)
114	}, res)
115	fq2.ReduceElement(res, op1)
116
117	fq2.MOVQ("x+8(FP)", ax)
118
119	fq2.LabelRegisters("x.A1", op1...)
120	fq2.Mov(ax, op1, fq2.NbWords)
121
122	fq2.MOVQ("res+0(FP)", dx)
123	fq2.Mov(res, dx, 0, fq2.NbWords)
124	fq2.Mov(op1, res)
125
126	// op1 and res both contains x.A1 at this point
127	// res+0(FP) (z.A1) must not be referenced.
128
129	// a = a0 + a1
130	fq2.Comment("Add(&x.A0, &x.A1)")
131	fq2.Add(ax, op1)
132	//--> must save on stack
133	a0a1 := fq2.PopN(&registers, true)
134	fq2.Mov(op1, a0a1)
135
136	zero := amd64.BP
137	fq2.XORQ(zero, zero)
138
139	// b = a0 - a1
140	fq2.Comment("Sub(&x.A0, &x.A1)")
141	fq2.Mov(ax, op1)
142	fq2.Sub(res, op1)
143	fq2.modReduceAfterSubScratch(zero, op1, res) // using res as scratch registers
144
145	// a = a * b
146	fq2.MulADX(&registers, xat, func(i int) string { return string(a0a1[i]) }, res)
147	fq2.ReduceElement(res, op1)
148
149	fq2.MOVQ("res+0(FP)", ax)
150	fq2.Mov(res, ax)
151
152	// result.a0 = a
153	fq2.RET()
154
155	// No adx
156	if forceCheck {
157		fq2.LABEL(noAdx)
158		fq2.MOVQ("res+0(FP)", amd64.AX)
159		fq2.MOVQ(amd64.AX, "(SP)")
160		fq2.MOVQ("x+8(FP)", amd64.AX)
161		fq2.MOVQ(amd64.AX, "8(SP)")
162		fq2.WriteLn("CALL ·squareGenericE2(SB)")
163		fq2.RET()
164	}
165
166	fq2.Push(&registers, a0a1...)
167}
168
169func (fq2 *Fq2Amd64) generateMulE2BLS381(forceCheck bool) {
170	// var a, b, c fp.Element
171	// a.Add(&x.A0, &x.A1)
172	// b.Add(&y.A0, &y.A1)
173	// a.Mul(&a, &b)
174	// b.Mul(&x.A0, &y.A0)
175	// c.Mul(&x.A1, &y.A1)
176	// z.A1.Sub(&a, &b).Sub(&z.A1, &c)
177	// z.A0.Sub(&b, &c)
178
179	// we need a bit of stack space to store the results of the xA0yA0 and xA1yA1 multiplications
180	const argSize = 24
181	minStackSize := 0
182	if forceCheck {
183		minStackSize = argSize
184	}
185	stackSize := fq2.StackSize(fq2.NbWords*4, 2, minStackSize)
186	registers := fq2.FnHeader("mulAdxE2", stackSize, argSize, amd64.DX, amd64.AX)
187	defer fq2.AssertCleanStack(stackSize, minStackSize)
188
189	fq2.WriteLn("NO_LOCAL_POINTERS")
190
191	fq2.WriteLn(`
192	// var a, b, c fp.Element
193	// a.Add(&x.A0, &x.A1)
194	// b.Add(&y.A0, &y.A1)
195	// a.Mul(&a, &b)
196	// b.Mul(&x.A0, &y.A0)
197	// c.Mul(&x.A1, &y.A1)
198	// z.A1.Sub(&a, &b).Sub(&z.A1, &c)
199	// z.A0.Sub(&b, &c)
200	`)
201
202	lblNoAdx := fq2.NewLabel()
203	// check ADX instruction support
204	if forceCheck {
205		fq2.CMPB("·supportAdx(SB)", 1)
206		fq2.JNE(lblNoAdx)
207	}
208
209	// used in the mul operation
210	op1 := registers.PopN(fq2.NbWords)
211	res := registers.PopN(fq2.NbWords)
212
213	xat := func(i int) string {
214		return string(op1[i])
215	}
216
217	ax := amd64.AX
218	dx := amd64.DX
219
220	aStack := fq2.PopN(&registers, true)
221	cStack := fq2.PopN(&registers, true)
222
223	fq2.MOVQ("x+8(FP)", ax)
224
225	// c = x.A1 * y.A1
226	fq2.Mov(ax, op1, fq2.NbWords)
227	fq2.MulADX(&registers, xat, func(i int) string {
228		fq2.MOVQ("y+16(FP)", dx)
229		return dx.At(i + fq2.NbWords)
230	}, res)
231	fq2.ReduceElement(res, op1)
232	// res = x.A1 * y.A1
233	// pushing on stack for later use.
234	fq2.Mov(res, cStack)
235
236	fq2.MOVQ("x+8(FP)", ax)
237	fq2.MOVQ("y+16(FP)", dx)
238
239	// a = x.a0 + x.a1
240	fq2.Mov(ax, op1, fq2.NbWords)
241	fq2.Add(ax, op1)
242	fq2.Mov(op1, aStack)
243
244	// b = y.a0 + y.a1
245	fq2.Mov(dx, op1)
246	fq2.Add(dx, op1, fq2.NbWords)
247	// --> note, we don't reduce, as this is used as input to the mul which accept input of size D-1/2 -1
248
249	// a = 	a * b = (x.a0 + x.a1) *  (y.a0 + y.a1)
250	fq2.MulADX(&registers, xat, func(i int) string {
251		return string(aStack[i])
252	}, res)
253	fq2.ReduceElement(res, op1)
254
255	// moving result to the stack.
256	fq2.Mov(res, aStack)
257
258	// b = x.A0 * y.AO
259	fq2.MOVQ("x+8(FP)", ax)
260
261	fq2.Mov(ax, op1)
262
263	fq2.MulADX(&registers, xat, func(i int) string {
264		fq2.MOVQ("y+16(FP)", dx)
265		return dx.At(i)
266	}, res)
267	fq2.ReduceElement(res, op1)
268
269	zero := dx
270	fq2.XORQ(zero, zero)
271
272	// a = a - b -c
273	fq2.Mov(aStack, op1)
274	fq2.Sub(res, op1) // a -= b
275	fq2.Mov(res, aStack)
276	fq2.modReduceAfterSubScratch(zero, op1, res)
277
278	fq2.Sub(cStack, op1) // a -= c
279	fq2.modReduceAfterSubScratch(zero, op1, res)
280
281	fq2.MOVQ("z+0(FP)", ax)
282	fq2.Mov(op1, ax, 0, fq2.NbWords)
283
284	// b = b - c
285	fq2.Mov(aStack, res)
286	fq2.Sub(cStack, res) // b -= c
287	fq2.modReduceAfterSubScratch(zero, res, op1)
288
289	fq2.Mov(res, ax)
290
291	fq2.RET()
292
293	// No adx
294	if forceCheck {
295		fq2.LABEL(lblNoAdx)
296		fq2.MOVQ("z+0(FP)", amd64.AX)
297		fq2.MOVQ(amd64.AX, "(SP)")
298		fq2.MOVQ("x+8(FP)", amd64.AX)
299		fq2.MOVQ(amd64.AX, "8(SP)")
300		fq2.MOVQ("y+16(FP)", amd64.AX)
301		fq2.MOVQ(amd64.AX, "16(SP)")
302		fq2.WriteLn("CALL ·mulGenericE2(SB)")
303		fq2.RET()
304
305	}
306
307	fq2.Push(&registers, aStack...)
308	fq2.Push(&registers, cStack...)
309
310}
311