// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

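// func mulWW(x, y Word) (z1, z0 Word)
//
// A minimal pure-Go sketch of the semantics (mirroring the generic code in
// arith.go; assumes math/bits and the package's Word type, Word = uint):
//
//	func mulWW(x, y Word) (z1, z0 Word) {
//		hi, lo := bits.Mul(uint(x), uint(y))
//		return Word(hi), Word(lo)
//	}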
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R3
	MOVD   y+8(FP), R4
	MULHDU R3, R4
	MOVD   R10, z1+16(FP)
	MOVD   R11, z0+24(FP)
	RET


// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func addVV(z, x, y []Word) (c Word)
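//
// A minimal pure-Go sketch of what addVV computes (mirrors the generic
// addVV_g in arith.go; assumes math/bits, Word = uint, and
// len(x) == len(y) == len(z)); the returned carry is 0 or 1:
//
//	func addVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c))
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return
//	}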

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD	$·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD	$·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
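// addvectorfacility acts as a one-time dispatch slot: it starts out pointing
// at addVV_check, which tests ·hasVX and then overwrites the slot with the
// address of addVV_vec or addVV_novec before branching there, so subsequent
// addVV calls jump straight to the chosen implementation. (subvectorfacility
// below works the same way for subVV.) Roughly, in Go terms, a hedged analogy
// with hypothetical names, not code that exists in this package:
//
//	var addVVImpl = addVVCheck
//
//	func addVVCheck(z, x, y []Word) Word {
//		if hasVX {
//			addVVImpl = addVVVec
//		} else {
//			addVVImpl = addVVNovec
//		}
//		return addVVImpl(z, x, y)
//	}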

TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

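// In the loop below, each VACQ adds a 128-bit chunk of x to the matching
// chunk of y plus a carry-in, and the paired VACCCQ computes the carry-out of
// that same addition, so the carries chain through V25..V31 and back into V0
// for the next iteration. The VPDI $0x4 instructions swap the two doublewords
// within each vector register so that the quadword adds see the words in the
// significance order they expect.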
UU1:
	VLM  0(R5), V1, V4    // 64-bytes into V1..V8
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64-bytes into V9..V16
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0  // V0 has carry-over
	VACQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	NEG   R4, R4              // save cf

A1:
	ADD $12, R3 // n += 16

	// s/JL/JMP/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4             // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4             // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n:  // n > 0
	ADDC R4, R4            // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4            // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD	$·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD	$·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
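//
// A minimal pure-Go sketch of what subVV computes (mirrors the generic
// subVV_g in arith.go; assumes math/bits, Word = uint, and equal slice
// lengths); the returned borrow is 0 or 1:
//
//	func subVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return
//	}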
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4          // c = 0
	MOVD $0, R0          // make sure it's zero
	MOVD $0, R10         // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
	VLVGG $1, R4, V0 // put carry into V0
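	// Note: the s390x subtract-with-borrow instructions encode "no borrow" as
	// carry = 1, which is why the running borrow in V0 is seeded with $1 here
	// and converted back after the vector loop (SUB $1 below, NEG at E1) to
	// the 0/1 borrow result the Go code expects.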

UU1:
	VLM  0(R5), V1, V4    // 64-bytes into V1..V8
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12      // 64-bytes into V9..V16
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32-bytes into V1..V8
	VLM 0(R6), V13, V14 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32-bytes into V1..V8
	VLM 0(R6), V15, V16 // 32-bytes into V9..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM  V17, V24, 0(R7)     // 128-bytes into z
	ADD   $128, R7
	ADD   $128, R10           // i += 16
	SUB   $16, R3             // n -= 16
	BGE   UU1                 // if n >= 0 goto UU1
	VLGVG $1, V0, R4          // put cf into R4
	SUB   $1, R4              // save cf

A1:
	ADD $12, R3 // n += 16
	BLT v1      // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/JL/JMP/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1:  // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11            // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4             // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1:  // n > 0
	MOVD R0, R11
	SUBC R4, R11           // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4            // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

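// func addVW(z, x []Word, y Word) (c Word)
//
// A minimal pure-Go sketch of what addVW computes (mirrors the generic code
// in arith.go; assumes math/bits, Word = uint, and len(x) == len(z)):
//
//	func addVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i := range z {
//			zi, cc := bits.Add(uint(x[i]), uint(c), 0)
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return
//	}
//
// The assembly exploits the fact that once the carry becomes 0, the rest of x
// can simply be copied into z, which is what the copy loops below do.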
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally we used the carry flag generated in the previous iteration
	// (i.e. ADDE could be used here to do the addition). However, since we
	// already know the carry is 1 (otherwise we would have gone to the copy
	// section), we can use ADDC here so the current iteration does not depend
	// on the carry flag generated in the previous iteration, which shortens
	// the loop-carried dependency chain when the branch is predicted correctly.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and perform copying.
// Assuming that x and z either do not overlap or point to the same memory region, we can use a
// faster copy here that relies only on MVC. The implementation below has three copy loops,
// copying one word, 4 words, and 32 words at a time respectively. Via benchmarking, this is
// faster than calling runtime·memmove.
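//
// Roughly, in Go terms (a sketch of the copy tiers below, not the MVC-based
// code itself):
//
//	for n >= 32 {
//		copy(z[:32], x[:32])
//		z, x, n = z[32:], x[32:], n-32
//	}
//	for n >= 4 {
//		copy(z[:4], x[:4])
//		z, x, n = z[4:], x[4:], n-4
//	}
//	copy(z[:n], x[:n])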
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:  // copy the remaining words, one at a time (n < 4)
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

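// func subVW(z, x []Word, y Word) (c Word)
//
// A minimal pure-Go sketch of what subVW computes (mirrors the generic code
// in arith.go; assumes math/bits, Word = uint, and len(x) == len(z)):
//
//	func subVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i := range z {
//			zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
//			z[i] = Word(zi)
//			c = Word(cc)
//		}
//		return
//	}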
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7    // The borrow bit passed in
	MOVD z+0(FP), R8
	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.

	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup    // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally we used the borrow flag generated in the previous iteration
	// (i.e. SUBE could be used here to do the subtraction). However, since we
	// already know the borrow is 1 (otherwise we would have gone to the copy
	// section), we can use SUBC here so the current iteration does not depend
	// on the borrow flag generated in the previous iteration, which shortens
	// the loop-carried dependency chain when the branch is predicted correctly.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current borrow value
returnResult:
	SUBE R0, R0
	NEG  R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and perform copying.
// Assuming that x and z either do not overlap or point to the same memory region, we can use a
// faster copy here that relies only on MVC. The implementation below has three copy loops,
// copying one word, 4 words, and 32 words at a time respectively. Via benchmarking, this is
// faster than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop:  // copy the remaining words, one at a time (n < 4)
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop:  // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody:  // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
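//
// A minimal pure-Go sketch of what mulAddVWW computes (z = x*y + r; mirrors
// the generic code in arith.go; assumes math/bits, Word = uint, and
// len(x) == len(z)):
//
//	func mulAddVWW(z, x []Word, y, r Word) (c Word) {
//		c = r
//		for i := range z {
//			hi, lo := bits.Mul(uint(x[i]), uint(y))
//			lo, carry := bits.Add(lo, uint(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + carry)
//		}
//		return
//	}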
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4    // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1          // i*8 = 0
	MOVD $0, R7          // i = 0
	MOVD $0, R0          // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC   R4, R11         // add to low order bits
	ADDE   R0, R6
	MOVD   R11, (R2)(R1*1)
	MOVD   R6, R4
	ADD    $8, R1          // i*8 + 8
	ADD    $1, R7          // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
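//
// A minimal pure-Go sketch of what addMulVVW computes (z += x*y; mirrors the
// generic code in arith.go; assumes math/bits, Word = uint, and
// len(x) == len(z)):
//
//	func addMulVVW(z, x []Word, y Word) (c Word) {
//		for i := range z {
//			hi, lo := bits.Mul(uint(x[i]), uint(y))
//			lo, cc := bits.Add(lo, uint(z[i]), 0)
//			hi += cc
//			lo, cc = bits.Add(lo, uint(c), 0)
//			z[i] = Word(lo)
//			c = Word(hi + cc)
//		}
//		return
//	}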
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12
	CMPBGE R5, $2, A6
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11           // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11        // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 + 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
