1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go && (ppc64 || ppc64le)
6// +build !math_big_pure_go
7// +build ppc64 ppc64le
8
9#include "textflag.h"
10
11// This file provides fast assembly versions for the elementary
12// arithmetic operations on vectors implemented in arith.go.
13
// func mulWW(x, y Word) (z1, z0 Word)
// Full-width unsigned multiply of two words: z1 receives the high
// word of x*y and z0 the low word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   y+8(FP), R5       // R5 = y
	MOVD   x+0(FP), R4       // R4 = x
	MULLD  R4, R5, R7        // R7 = low word of x*y
	MULHDU R4, R5, R6        // R6 = high word of x*y (unsigned)
	MOVD   R7, z0+24(FP)
	MOVD   R6, z1+16(FP)
	RET
23
// func addVV(z, x, y []Word) (c Word)
// Multi-word add: z[i] = x[i] + y[i] + carry for every i < len(z).
// Returns the carry out of the top word in c (0 for an empty z).
//
// Register roles:
//	R7       words still to process
//	R8, R9   cursors into x and y
//	R10      cursor into z
//	R4       result c
//	CTR      trip count of the 4-way unrolled loop
//
// The carry is handed from word to word in CA. Everything that sits
// between the opening ADDC and the closing ADDZE (ADD with an immediate,
// CMP, loads, stores, branches) must leave CA untouched.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = &z[0]
	MOVD  x+24(FP), R8      // R8 = &x[0]
	MOVD  y+48(FP), R9      // R9 = &y[0]
	MOVD  z_len+8(FP), R7   // R7 = len(z)

	MOVD  R0, R4            // c = 0; this is also the result for len(z) == 0
	CMP   R0, R7
	BEQ   store_c

	// Word 0 is peeled off: it seeds CA with a plain ADDC and leaves the
	// cursors on element 0, so every later access uses a +8 displacement
	// and MOVDU advances the pointer as a side effect (no separate
	// pointer updates for the three slices).
	MOVD  0(R9), R12        // R12 = y[0]
	MOVD  0(R8), R11        // R11 = x[0]
	ADDC  R12, R11, R15     // R15 = x[0] + y[0], CA = carry out
	MOVD  R15, 0(R10)       // z[0]
	ADD   $-1, R7           // R7 = len(z) - 1
	CMP   R0, R7
	BEQ   carry_out         // len(z) was 1

	SRD   $2, R7, R5        // R5 = (len(z)-1)/4 = number of full groups of four
	MOVD  R5, CTR
	CMP   R0, R5
	BEQ   leftover          // fewer than four words left: skip the unrolled loop

	// Four words per trip. The unrolling is a measured trade-off: short
	// vectors get slower (0.90x in the worst case), long ones get faster
	// (up to 1.45x).
quad:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]; R8 += 32
	MOVDU 32(R9), R19       // R19 = y[i+3]; R9 += 32
	ADD   $-4, R7           // four fewer words to go
	ADDE  R11, R16, R20     // R20 = x[i]   + y[i]   + CA
	ADDE  R12, R17, R21     // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22     // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23     // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]; R10 += 32
	BC    16, 0, quad       // bdnz

	// Anything left over after the groups of four?
	CMP   R0, R7
	BEQ   carry_out

	// One to three words remain; handle them one at a time.
leftover:
	MOVDU 8(R8), R11        // R11 = x[i]; R8 += 8
	MOVDU 8(R9), R16        // R16 = y[i]; R9 += 8
	ADDE  R11, R16, R20     // R20 = x[i] + y[i] + CA
	MOVDU R20, 8(R10)       // z[i]; R10 += 8
	ADD   $-1, R7
	CMP   R0, R7
	BEQ   carry_out         // that was the last word

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	ADDE  R11, R16, R20
	MOVDU R20, 8(R10)
	ADD   $-1, R7
	CMP   R0, R7
	BEQ   carry_out

	// Third and final leftover word: no need to advance the cursors.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

carry_out:
	ADDZE R4                // c = 0 + CA

store_c:
	MOVD  R4, c+72(FP)
	RET
110
// func subVV(z, x, y []Word) (c Word)
// Multi-word subtract: z[i] = x[i] - y[i] - borrow for every i < len(z).
// Returns the borrow out of the top word in c (0 for an empty z).
//
// Register roles:
//	R7       words still to process
//	R8, R9   cursors into x and y
//	R10      cursor into z
//	R4       result c
//	CTR      trip count of the 4-way unrolled loop
//
// Between words CA holds the complement of the borrow (CA = 1 means
// "no borrow"), which is why the result is CA xor 1. Everything placed
// between the opening SUBC and the closing ADDZE must leave CA untouched.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10      // R10 = &z[0]
	MOVD  x+24(FP), R8      // R8 = &x[0]
	MOVD  y+48(FP), R9      // R9 = &y[0]
	MOVD  z_len+8(FP), R7   // R7 = len(z)

	MOVD  R0, R4            // c = 0; this is also the result for len(z) == 0
	CMP   R0, R7
	BEQ   store_c

	// Word 0 is peeled off: it seeds CA with a plain SUBC and leaves the
	// cursors on element 0, so every later access uses a +8 displacement
	// and MOVDU advances the pointer as a side effect.
	MOVD  0(R9), R12        // R12 = y[0]
	MOVD  0(R8), R11        // R11 = x[0]
	SUBC  R12, R11, R15     // R15 = x[0] - y[0], CA = no-borrow flag
	MOVD  R15, 0(R10)       // z[0]
	ADD   $-1, R7           // R7 = len(z) - 1
	CMP   R0, R7
	BEQ   borrow_out        // len(z) was 1

	SRD   $2, R7, R5        // R5 = (len(z)-1)/4 = number of full groups of four
	MOVD  R5, CTR
	CMP   R0, R5
	BEQ   leftover          // fewer than four words left: skip the unrolled loop

	// Four words per trip. The unrolling is a measured trade-off: short
	// vectors get slower (0.92x in the worst case), long ones get faster
	// (up to 1.45x).
quad:
	MOVD  8(R8), R11        // R11 = x[i]
	MOVD  8(R9), R16        // R16 = y[i]
	MOVD  16(R8), R12       // R12 = x[i+1]
	MOVD  16(R9), R17       // R17 = y[i+1]
	MOVD  24(R8), R14       // R14 = x[i+2]
	MOVD  24(R9), R18       // R18 = y[i+2]
	MOVDU 32(R8), R15       // R15 = x[i+3]; R8 += 32
	MOVDU 32(R9), R19       // R19 = y[i+3]; R9 += 32
	ADD   $-4, R7           // four fewer words to go
	SUBE  R16, R11, R20     // R20 = x[i]   - y[i]   - borrow
	SUBE  R17, R12, R21     // R21 = x[i+1] - y[i+1] - borrow
	SUBE  R18, R14, R22     // R22 = x[i+2] - y[i+2] - borrow
	SUBE  R19, R15, R23     // R23 = x[i+3] - y[i+3] - borrow
	MOVD  R20, 8(R10)       // z[i]
	MOVD  R21, 16(R10)      // z[i+1]
	MOVD  R22, 24(R10)      // z[i+2]
	MOVDU R23, 32(R10)      // z[i+3]; R10 += 32
	BC    16, 0, quad       // bdnz

	// Anything left over after the groups of four?
	CMP   R0, R7
	BEQ   borrow_out

	// One to three words remain; handle them one at a time.
leftover:
	MOVDU 8(R8), R11        // R11 = x[i]; R8 += 8
	MOVDU 8(R9), R16        // R16 = y[i]; R9 += 8
	SUBE  R16, R11, R20     // R20 = x[i] - y[i] - borrow
	MOVDU R20, 8(R10)       // z[i]; R10 += 8
	ADD   $-1, R7
	CMP   R0, R7
	BEQ   borrow_out        // that was the last word

	MOVDU 8(R8), R11
	MOVDU 8(R9), R16
	SUBE  R16, R11, R20
	MOVDU R20, 8(R10)
	ADD   $-1, R7
	CMP   R0, R7
	BEQ   borrow_out

	// Third and final leftover word: no need to advance the cursors.
	MOVD  8(R8), R11
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

borrow_out:
	ADDZE R4                // R4 = 0 + CA
	XOR   $1, R4            // c = borrow = CA xor 1

store_c:
	MOVD  R4, c+72(FP)
	RET
198
// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y into the vector x: z[0] = x[0] + y, and every
// following word is z[i] = x[i] + carry. c is the carry out of the top
// word. For an empty z the incoming y is stored back as c.
//
// Register roles:
//	R8, R10  cursors into x and z
//	R4       y on entry, result c on exit
//	R11      words still to process
//	CTR      trip count of the 4-way unrolled loop
//
// After the opening ADDC the carry lives only in CA; the instructions
// between there and the closing ADDZE must leave CA untouched.
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R11	// R11 = len(z)
	MOVD  z+0(FP), R10	// R10 = &z[0]
	MOVD  x+24(FP), R8	// R8 = &x[0]
	MOVD  y+48(FP), R4	// R4 = y, doubling as c

	CMP   R0, R11
	BEQ   store_c		// empty vector: return y unchanged

	// Word 0 is the only one that sees y. It is handled outside the
	// loop so that its ADDC seeds CA; every later word just adds CA.
	MOVD  0(R8), R20	// R20 = x[0]
	ADDC  R20, R4, R6	// R6 = x[0] + y, CA = carry out
	MOVD  R6, 0(R10)	// z[0]
	ADD   $-1, R11		// R11 = len(z) - 1
	CMP   R0, R11
	BEQ   carry_out		// len(z) was 1

	SRD   $2, R11, R9	// R9 = (len(z)-1)/4 = number of full groups of four
	DCBT  (R8)		// cache-touch hint for x
	MOVD  R9, CTR
	CMP   R0, R9
	BEQ   leftover		// fewer than four words left: skip the unrolled loop

quad:
	MOVD  8(R8), R20	// R20 = x[i]
	MOVD  16(R8), R21	// R21 = x[i+1]
	MOVD  24(R8), R22	// R22 = x[i+2]
	MOVDU 32(R8), R23	// R23 = x[i+3]; R8 += 32
	ADD   $-4, R11		// four fewer words to go
	ADDZE R20, R24		// R24 = x[i]   + CA
	ADDZE R21, R25		// R25 = x[i+1] + CA
	ADDZE R22, R26		// R26 = x[i+2] + CA
	ADDZE R23, R27		// R27 = x[i+3] + CA
	MOVD  R24, 8(R10)	// z[i]
	MOVD  R25, 16(R10)	// z[i+1]
	MOVD  R26, 24(R10)	// z[i+2]
	MOVDU R27, 32(R10)	// z[i+3]; R10 += 32
	BC    16, 0, quad	// bdnz

	// Anything left over after the groups of four?
	CMP   R0, R11
	BEQ   carry_out

	// One to three words remain; handle them one at a time.
leftover:
	MOVDU 8(R8), R20	// R20 = x[i]; R8 += 8
	ADDZE R20, R24		// R24 = x[i] + CA
	MOVDU R24, 8(R10)	// z[i]; R10 += 8
	ADD   $-1, R11
	CMP   R0, R11
	BEQ   carry_out

	MOVDU 8(R8), R20
	ADDZE R20, R24
	MOVDU R24, 8(R10)
	ADD   $-1, R11
	CMP   R0, R11
	BEQ   carry_out

	// Third and final leftover word: no need to advance the cursors.
	MOVD  8(R8), R20
	ADDZE R20, R24
	MOVD  R24, 8(R10)

carry_out:
	ADDZE R0, R4		// c = 0 + CA
store_c:
	MOVD  R4, c+56(FP)
	RET
270
// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from the vector x: z[0] = x[0] - y, and
// every following word is z[i] = x[i] - borrow. c is the borrow out of
// the top word. For an empty z the incoming y is stored back as c.
//
// Register roles:
//	R8, R10  cursors into x and z
//	R4       y on entry, result c on exit
//	R11      words still to process
//	CTR      trip count of the 4-way unrolled loop
//
// After the opening SUBC, CA holds the complement of the borrow
// (CA = 1 means "no borrow"); the instructions between there and the
// final capture must leave CA untouched.
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R11	// R11 = len(z)
	MOVD  z+0(FP), R10	// R10 = &z[0]
	MOVD  x+24(FP), R8	// R8 = &x[0]
	MOVD  y+48(FP), R4	// R4 = y, doubling as c

	CMP   R0, R11
	BEQ   store_c		// empty vector: return y unchanged

	// Word 0 is the only one that sees y. It is handled outside the
	// loop so that its SUBC seeds CA; every later word just applies
	// the borrow carried in CA.
	MOVD  0(R8), R20	// R20 = x[0]
	SUBC  R4, R20, R6	// R6 = x[0] - y, CA = no-borrow flag
	MOVD  R6, 0(R10)	// z[0]
	ADD   $-1, R11		// R11 = len(z) - 1
	CMP   R0, R11
	BEQ   borrow_out	// len(z) was 1

	SRD   $2, R11, R9	// R9 = (len(z)-1)/4 = number of full groups of four
	DCBT  (R8)		// cache-touch hint for x
	MOVD  R9, CTR
	CMP   R0, R9
	BEQ   leftover		// fewer than four words left: skip the unrolled loop

	// Per the original note, this loop is nearly the s390x one; the
	// difference is that CA is not re-captured on every trip, because
	// the SUBC above seeded it and each SUBE passes it along.
quad:
	MOVD  8(R8), R20	// R20 = x[i]
	MOVD  16(R8), R21	// R21 = x[i+1]
	MOVD  24(R8), R22	// R22 = x[i+2]
	MOVDU 32(R8), R23	// R23 = x[i+3]; R8 += 32
	ADD   $-4, R11		// four fewer words to go
	SUBE  R0, R20		// R20 = x[i]   - 0 - borrow
	SUBE  R0, R21		// R21 = x[i+1] - 0 - borrow
	SUBE  R0, R22		// R22 = x[i+2] - 0 - borrow
	SUBE  R0, R23		// R23 = x[i+3] - 0 - borrow
	MOVD  R20, 8(R10)	// z[i]
	MOVD  R21, 16(R10)	// z[i+1]
	MOVD  R22, 24(R10)	// z[i+2]
	MOVDU R23, 32(R10)	// z[i+3]; R10 += 32
	BC    16, 0, quad	// bdnz

	// Anything left over after the groups of four?
	CMP   R0, R11
	BEQ   borrow_out

	// One to three words remain; handle them one at a time.
leftover:
	MOVDU 8(R8), R20	// R20 = x[i]; R8 += 8
	SUBE  R0, R20		// R20 = x[i] - borrow
	MOVDU R20, 8(R10)	// z[i]; R10 += 8
	ADD   $-1, R11
	CMP   R0, R11
	BEQ   borrow_out

	MOVDU 8(R8), R20
	SUBE  R0, R20
	MOVDU R20, 8(R10)
	ADD   $-1, R11
	CMP   R0, R11
	BEQ   borrow_out

	// Third and final leftover word: no need to advance the cursors.
	MOVD  8(R8), R20
	SUBE  R0, R20
	MOVD  R20, 8(R10)

borrow_out:
	// Turn CA into the borrow: R4 - R4 - (1 - CA) is 0 when CA = 1 and
	// -1 when CA = 0; negating gives c = 0 or 1.
	SUBE  R4, R4
	NEG   R4, R4

store_c:
	MOVD  R4, c+56(FP)
	RET
348
// shlVU has no ppc64 assembly body of its own: with a zero-size frame it
// jumps straight to ·shlVU_g, which therefore sees the caller's
// argument frame unchanged and returns directly to the caller.
// NOTE(review): ·shlVU_g is presumably the portable Go version from
// arith.go; confirm, along with the Go-side signature.
TEXT ·shlVU(SB), NOSPLIT, $0
	JMP ·shlVU_g(SB)
351
// shrVU has no ppc64 assembly body of its own: with a zero-size frame it
// jumps straight to ·shrVU_g, which therefore sees the caller's
// argument frame unchanged and returns directly to the caller.
// NOTE(review): ·shrVU_g is presumably the portable Go version from
// arith.go; confirm, along with the Go-side signature.
TEXT ·shrVU(SB), NOSPLIT, $0
	JMP ·shrVU_g(SB)
354
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For every i < len(z): z[i] = low word of (x[i]*y + c), after which c
// becomes the high word of that sum. c starts out as r and its final
// value is returned; for an empty z that is r itself.
//
// Register roles:
//	R8, R10  cursors into x and z
//	R9       y
//	R4       running carry word c
//	R11      words still to process
//	CTR      trip count of the 4-way unrolled loop
//
// Unlike the add/sub kernels, nothing is carried in CA across words:
// each ADDC/ADDZE pair folds CA into the high word immediately.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    r+56(FP), R4      // R4 = r = c
	MOVD    z_len+8(FP), R11  // R11 = z_len

	CMP     R0, R11
	BEQ     done              // empty z: return r

	// Word 0 is processed outside the loop so the cursors stay on
	// element 0 and later accesses can use MOVDU with a +8 displacement.
	MOVD    0(R8), R20        // R20 = x[0]
	ADD     $-1, R11          // R11 = z_len - 1
	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[0]*y)
	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[0]*y)
	ADDC    R4, R6            // R6 = z0 + r, CA = carry
	ADDZE   R7                // R7 = z1 + CA
	CMP     R0, R11
	MOVD    R7, R4            // R4 = c
	MOVD    R6, 0(R10)        // z[0]
	BEQ     done              // z_len was 1

	// We will read 4 elements per iteration
	SRD     $2, R11, R14      // R14 = (z_len-1)/4
	DCBT    (R8)              // cache-touch hint for x
	CMP     R0, R14
	MOVD    R14, CTR          // Set up the loop counter
	BEQ     tail              // If R14 = 0, we can't use the loop

	// Four words per trip. The high word of each product is reused as
	// the carry into the next word; only the last one lands in R4.
	// The multiplies, adds and stores are interleaved on purpose; keep
	// the order when editing.
loop:
	MOVD    8(R8), R20        // R20 = x[i]
	MOVD    16(R8), R21       // R21 = x[i+1]
	MOVD    24(R8), R22       // R22 = x[i+2]
	MOVDU   32(R8), R23       // R23 = x[i+3]; R8 += 32
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R20      // R20 = z1[i]
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R20               // R20 = z1[i] + CA
	MULLD   R9, R21, R25      // R25 = z0[i+1]
	MULHDU  R9, R21, R21      // R21 = z1[i+1]
	ADDC    R20, R25          // R25 = z0[i+1] + z1[i]
	ADDZE   R21               // R21 = z1[i+1] + CA
	MULLD   R9, R22, R26      // R26 = z0[i+2]
	MULHDU  R9, R22, R22      // R22 = z1[i+2]
	MULLD   R9, R23, R27      // R27 = z0[i+3]
	MULHDU  R9, R23, R23      // R23 = z1[i+3]
	ADDC    R21, R26          // R26 = z0[i+2] + z1[i+1]
	ADDZE   R22               // R22 = z1[i+2] + CA
	MOVD    R24, 8(R10)       // z[i]
	MOVD    R25, 16(R10)      // z[i+1]
	ADDC    R22, R27          // R27 = z0[i+3] + z1[i+2]
	ADDZE   R23, R4           // c = z1[i+3] + CA
	MOVD    R26, 24(R10)      // z[i+2]
	MOVDU   R27, 32(R10)      // z[i+3]; R10 += 32
	ADD     $-4, R11          // four fewer words to go
	BC      16, 0, loop       // bdnz

	// We may have some elements to read
	CMP     R0, R11
	BEQ     done

	// Process the remaining one to three elements, one at a time
tail:
	MOVDU   8(R8), R20        // R20 = x[i]; R8 += 8
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R25      // R25 = z1[i]
	ADD     $-1, R11          // one fewer word to go
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R25               // R25 = z1[i] + CA
	MOVDU   R24, 8(R10)       // z[i]; R10 += 8
	CMP     R0, R11
	MOVD    R25, R4           // R4 = c
	BEQ     done              // If R11 = 0, we are done

	MOVDU   8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25
	MOVDU   R24, 8(R10)
	CMP     R0, R11
	MOVD    R25, R4
	BEQ     done

	// Third and final leftover word: the cursors and the remaining
	// count are not needed any more, so neither is updated.
	MOVD    8(R8), R20
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADDC    R4, R24
	ADDZE   R25
	MOVD    R24, 8(R10)
	MOVD    R25, R4

done:
	MOVD    R4, c+64(FP)
	RET
452
// func addMulVVW(z, x []Word, y Word) (c Word)
// Multiply-accumulate into z: for every i < len(z),
// z[i] = low word of (z[i] + x[i]*y + c), and c becomes the high word.
// c starts at 0 and its final value is returned.
//
// Register roles:
//	R8, R10  base addresses of x and z
//	R3       byte offset of the current word
//	R9       y
//	R4       running carry word c
//	CTR      words still to process
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R22	// R22 = len(z)
	MOVD z+0(FP), R10	// R10 = &z[0]
	MOVD x+24(FP), R8	// R8 = &x[0]
	MOVD y+48(FP), R9	// R9 = y

	MOVD R0, R4		// c = 0
	MOVD R0, R3		// offset = 0
	MOVD R22, CTR		// one trip per word
	CMP  R0, R22
	BEQ  store_c		// empty vector: return 0

mac:
	MOVD   (R10)(R3), R21	// R21 = z[i]
	MOVD   (R8)(R3), R20	// R20 = x[i]
	MULHDU R9, R20, R7	// R7 = high word of x[i]*y
	MULLD  R9, R20, R6	// R6 = low word of x[i]*y
	ADDC   R21, R6		// R6 = low + z[i], CA = carry
	ADDZE  R7		// R7 = high + CA
	ADDC   R4, R6		// R6 += c, CA = carry
	ADDZE  R7, R4		// c = R7 + CA
	MOVD   R6, (R10)(R3)	// z[i] = R6
	ADD    $8, R3		// step to the next word
	BC     16, 0, mac	// bdnz

store_c:
	MOVD R4, c+56(FP)
	RET
482
483
484