// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions of the elementary
// arithmetic operations on vectors that are implemented in arith.go.

// func mulWW(x, y Word) (z1, z0 Word)
//
// mulWW computes the full double-word unsigned product of x and y.
// The high word is returned in z1 and the low word in z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = AX * y
	MOVQ DX, z1+16(FP)	// z1 = high word of the product
	MOVQ AX, z0+24(FP)	// z0 = low word of the product
	RET



// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
// This is faster than using rotate instructions.

// func addVV(z, x, y []Word) (c Word)
//
// addVV stores x[i] + y[i] + carry into z[i] for 0 <= i < len(z),
// propagating the carry from word to word, and returns the final
// carry (0 or 1) in c. Only len(z) is consulted; x and y are read
// over the same index range.
//
// Register use:
//	DI       n, the number of words still to process
//	SI       i, the current word index
//	R8       &x[0]
//	R9       &y[0]
//	R10      &z[0]
//	CX       saved carry, 0 or -1 (SBBQ CX, CX saves CF,
//	         ADDQ CX, CX turns it back into CF)
//	R11-R14  scratch words
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V1			// if n < 0 goto V1

U1:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 0(R9)(SI*8), R11
	ADCQ 8(R9)(SI*8), R12
	ADCQ 16(R9)(SI*8), R13
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	// The index/counter updates below overwrite CF, which is
	// why the carry was parked in CX first.
	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U1			// if n >= 0 goto U1

V1:	ADDQ $4, DI		// n += 4
	JLE E1			// if n <= 0 goto E1

L1:	// n > 0
	// one word per iteration for the remaining (at most 3) words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L1			// if n > 0 goto L1

E1:	NEGQ CX			// saved carry is 0 or -1; return 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SBBQ instead of ADCQ and label names)
//
// subVV stores x[i] - y[i] - borrow into z[i] for 0 <= i < len(z),
// propagating the borrow from word to word, and returns the final
// borrow (0 or 1) in c. Only len(z) is consulted; x and y are read
// over the same index range.
//
// Register use:
//	DI       n, the number of words still to process
//	SI       i, the current word index
//	R8       &x[0]
//	R9       &y[0]
//	R10      &z[0]
//	CX       saved borrow, 0 or -1 (SBBQ CX, CX saves CF,
//	         ADDQ CX, CX turns it back into CF)
//	R11-R14  scratch words
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z+0(FP), R10

	MOVQ $0, CX		// c = 0
	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V2			// if n < 0 goto V2

U2:	// n >= 0
	// regular loop body unrolled 4x
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 0(R9)(SI*8), R11
	SBBQ 8(R9)(SI*8), R12
	SBBQ 16(R9)(SI*8), R13
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	SBBQ CX, CX		// save CF

	// The index/counter updates below overwrite CF, which is
	// why the borrow was parked in CX first.
	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U2			// if n >= 0 goto U2

V2:	ADDQ $4, DI		// n += 4
	JLE E2			// if n <= 0 goto E2

L2:	// n > 0
	// one word per iteration for the remaining (at most 3) words
	ADDQ CX, CX		// restore CF
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L2			// if n > 0 goto L2

E2:	NEGQ CX			// saved borrow is 0 or -1; return 0 or 1
	MOVQ CX, c+72(FP)	// return c
	RET


// func addVW(z, x []Word, y Word) (c Word)
//
// addVW adds the single word y to the vector x, storing the sum in
// z[0:len(z)] and returning the carry out of the top word in c.
// Vectors longer than 32 words are handed to addVWlarge (defined
// outside this file) with a tail jump; this routine handles the
// short case itself.
//
// Register use:
//	DI       n, the number of words still to process
//	SI       i, the current word index
//	R8       &x[0]
//	R10      &z[0]
//	CX       c, the word added to the next x[i]: initially y,
//	         afterwards the carry as 0 or 1
//	R11-R14  scratch words
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if len(z) > 32 use addVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V3			// if n < 4 goto V3

U3:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADDQ CX, R11		// x[i] + c
	ADCQ $0, R12		// ripple the carry through the next three words
	ADCQ $0, R13
	ADCQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U3			// if n >= 0 goto U3

V3:	ADDQ $4, DI		// n += 4
	JLE E3			// if n <= 0 goto E3

L3:	// n > 0
	// one word per iteration for the remaining (at most 3) words
	ADDQ 0(R8)(SI*8), CX	// c += x[i]
	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L3			// if n > 0 goto L3

E3:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·addVWlarge(SB)


// func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
//
// subVW subtracts the single word y from the vector x, storing the
// difference in z[0:len(z)] and returning the borrow out of the top
// word in c. Vectors longer than 32 words are handed to subVWlarge
// (defined outside this file) with a tail jump; this routine handles
// the short case itself.
//
// Register use:
//	DI       n, the number of words still to process
//	SI       i, the current word index
//	R8       &x[0]
//	R10      &z[0]
//	CX       c, the word subtracted from the next x[i]: initially y,
//	         afterwards the borrow as 0 or 1
//	R11-R14  scratch words
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG large		// if len(z) > 32 use subVWlarge
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y
	MOVQ z+0(FP), R10

	MOVQ $0, SI		// i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUBQ $4, DI		// n -= 4
	JL V4			// if n < 4 goto V4

U4:	// n >= 0
	// regular loop body unrolled 4x
	MOVQ 0(R8)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SUBQ CX, R11		// x[i] - c
	SBBQ $0, R12		// ripple the borrow through the next three words
	SBBQ $0, R13
	SBBQ $0, R14
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE U4			// if n >= 0 goto U4

V4:	ADDQ $4, DI		// n += 4
	JLE E4			// if n <= 0 goto E4

L4:	// n > 0
	// one word per iteration for the remaining (at most 3) words
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)
	SBBQ CX, CX		// save CF
	NEGQ CX			// c = 0 or 1

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG L4			// if n > 0 goto L4

E4:	MOVQ CX, c+56(FP)	// return c
	RET
large:
	JMP ·subVWlarge(SB)


// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the vector x left by s bits, storing the result in
// z[0:len(z)], and returns in c the bits shifted out of the top word
// x[len(z)-1]. When len(z) is 0 nothing is stored and c is 0.
// In the comments below ŝ is the complementary shift count, so that
// w>>ŝ is the part of w that a left shift by s pushes into the next
// higher word.
//
// The loop runs from the highest index down to 1; z[0] is written
// last, after the loop.
//
// Register use:
//	BX   i, the current word index (starts at len(z)-1)
//	R8   &x[0]
//	R10  &z[0]
//	CX   s, the shift count
//	AX   w1, the most recently loaded (lower) source word
//	DX   w, the word being assembled for z[i]
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL X8b			// i < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, AX, DX		// w1>>ŝ
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE X8a			// i <= 0

	// i > 0
L8:	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
	SUBQ $1, BX		// i--
	JG L8			// i > 0

	// i <= 0
X8a:	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

X8b:	MOVQ $0, c+56(FP)
	RET


// func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the vector x right by s bits, storing the result in
// z[0:len(z)], and returns in c the bits shifted out of the bottom
// word x[0]. When len(z) is 0 nothing is stored and c is 0.
// In the comments below ŝ is the complementary shift count, so that
// w<<ŝ is the part of w that a right shift by s pushes into the next
// lower word.
//
// The loop runs from index 0 up to len(z)-2; z[len(z)-1] is written
// last, after the loop.
//
// Register use:
//	R11  n-1, the index of the last word
//	BX   i, the current word index
//	R8   &x[0]
//	R10  &z[0]
//	CX   s, the shift count
//	AX   w1, the most recently loaded (higher) source word
//	DX   w, the word being assembled for z[i]
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// n--
	JL X9b			// n < 0	(n <= 0)

	// n > 0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ s+48(FP), CX
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, AX, DX		// w1<<ŝ
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0
	JMP E9

	// i < n-1
L9:	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
	ADDQ $1, BX		// i++

E9:	CMPQ BX, R11
	JL L9			// i < n-1

	// i >= n-1
X9a:	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

X9b:	MOVQ $0, c+56(FP)
	RET


// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// mulAddVWW multiplies the vector x by the single word y and adds a
// running carry word that starts as r. For each i in 0 <= i < len(z)
// the low word of x[i]*y + c is stored in z[i] and the high word
// becomes the next c. The final carry word is returned in c.
//
// Register use:
//	R10     &z[0]
//	R8      &x[0]
//	R9      y
//	CX      c, the carry word (initially r)
//	R11     n = len(z)
//	BX      i, the current word index
//	DX:AX   product of the current step (MULQ result)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0

	CMPQ R11, $4
	JL E5			// fewer than 4 words: skip the unrolled loop

U5:	// i+4 <= n
	// regular loop body unrolled 4x
	MOVQ (0*8)(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// carry into the high word
	MOVQ AX, (0*8)(R10)(BX*8)
	MOVQ DX, CX		// c = high word
	MOVQ (1*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (1*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (2*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (2*8)(R10)(BX*8)
	MOVQ DX, CX
	MOVQ (3*8)(R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (3*8)(R10)(BX*8)
	MOVQ DX, CX
	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX is free here: its value was already copied to CX
	CMPQ DX, R11
	JLE U5			// if i+4 <= n goto U5
	JMP E5

	// one word per iteration for the remaining words
L5:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, (R10)(BX*8)
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E5:	CMPQ BX, R11		// i < n
	JL L5

	MOVQ CX, c+64(FP)
	RET


// func addMulVVW(z, x []Word, y Word) (c Word)
//
// addMulVVW adds the product of the vector x and the single word y
// to z in place. For each i in 0 <= i < len(z) it forms
// z[i] + x[i]*y + c, stores the low word back into z[i] and carries
// the high word into the next step. The final carry word is returned
// in c.
//
// Two implementations are selected at run time by the byte variable
// support_adx (defined outside this file):
//
//   - support_adx != 1: MULQ/ADDQ/ADCQ code, unrolled 2x (A6) with a
//     one-word loop (L6) for a trailing odd word.
//   - support_adx == 1: MULXQ/ADCXQ/ADOXQ code (label adx), unrolled
//     8x (adx_loop) with a one-word loop (adx_short) for the rest.
//
// Register use, generic path:
//	R10 = &z[0], R8 = &x[0], R9 = y, R11 = n = len(z),
//	R12 = n with bit 0 cleared (end of the 2x unrolled part),
//	BX = i, CX = c, DX:AX = product of the current step.
//
// Register use, ADX path:
//	R10 and R8 walk z and x in 64-byte steps inside adx_loop and are
//	reloaded from the arguments afterwards, R11 = n,
//	R13 = n with the low three bits cleared (end of the 8x part),
//	DX = y (implicit MULXQ operand), BX = i, CX = c,
//	R9 = 0, SI/DI and AX/CX alternate as low/high product words.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB ·support_adx(SB), $1
	JEQ adx
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), R11
	MOVQ $0, BX		// i = 0
	MOVQ $0, CX		// c = 0
	MOVQ R11, R12
	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
	CMPQ R11, $2
	JAE A6			// at least 2 words: run the unrolled loop
	JMP E6

A6:
	// loop body unrolled 2x
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ (R10)(BX*8), AX	// low word += z[i]
	ADCQ $0, DX
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX
	MOVQ DX, CX		// c = high word
	MOVQ AX, (R10)(BX*8)

	MOVQ (8)(R8)(BX*8), AX
	MULQ R9
	ADDQ (8)(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, (8)(R10)(BX*8)

	ADDQ $2, BX
	CMPQ BX, R12
	JL A6
	JMP E6

	// one word per iteration for a trailing word
L6:	MOVQ (R8)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	ADDQ AX, (R10)(BX*8)
	ADCQ $0, DX
	MOVQ DX, CX
	ADDQ $1, BX		// i++

E6:	CMPQ BX, R11		// i < n
	JL L6

	MOVQ CX, c+56(FP)
	RET

adx:
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), DX
	MOVQ $0, BX   // i = 0
	MOVQ $0, CX   // carry
	CMPQ R11, $8
	JAE  adx_loop_header
	CMPQ BX, R11
	JL adx_short		// 0 < n < 8: one word at a time
	MOVQ CX, c+56(FP)	// n == 0: return 0
	RET

adx_loop_header:
	MOVQ  R11, R13
	ANDQ  $-8, R13		// R13 = n rounded down to a multiple of 8
adx_loop:
	// Two independent carry chains run through the 8 steps below:
	// ADCXQ uses CF to add the previous high word to the new low word,
	// ADOXQ uses OF to add z[i]. XORQ clears both flags and zeroes R9.
	XORQ  R9, R9  // unset flags
	MULXQ (R8), SI, DI
	ADCXQ CX,SI
	ADOXQ (R10), SI
	MOVQ  SI,(R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ  AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ  SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ  AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ  SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ  AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ  SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ  AX, 56(R10)

	// fold the pending CF and OF carries into the last high word
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX

	CMPQ BX, R13
	JL adx_loop
	// adx_short indexes from the slice bases with BX, so undo the
	// pointer bumps by reloading both bases from the arguments.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	CMPQ BX, R11
	JL adx_short
	MOVQ CX, c+56(FP)
	RET

adx_short:
	// one word per iteration for the remaining (at most 7) words
	MULXQ (R8)(BX*8), SI, DI
	ADDQ CX, SI
	ADCQ $0, DI
	ADDQ SI, (R10)(BX*8)
	ADCQ $0, DI
	MOVQ DI, CX
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short

	MOVQ CX, c+56(FP)
	RET