1dnl  X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 ?
37C AMD K10	 ?
38C AMD bull	 ?
39C AMD pile	 ?
40C AMD steam	 ?
41C AMD bobcat	 ?
42C AMD jaguar	 ?
43C Intel P4	 ?
44C Intel core	 ?
45C Intel NHM	 ?
46C Intel SBR	 3.24
47C Intel IBR	 3.04
48C Intel HWL	 ?
49C Intel BWL	 ?
50C Intel atom	 ?
51C VIA nano	 ?
52
53C The inner loops of this code are the result of running a code generation and
54C optimisation tool suite written by David Harvey and Torbjörn Granlund.
55
56C TODO
57C  * Micro-optimise, none performed thus far.
58C  * Consider inlining mpn_add_n.
59C  * Single basecases out before the pushes.
60
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C I is given two spellings of one memory operand and expands to the one
C selected here.  The wind-down blocks pass a form without index register
C first and a form indexed by i second.
define(`I',`$1')

C Argument registers.  The trailing comment on each line appears to name the
C location of the same argument under the DOS64 ABI (u0inv is fetched from
C the stack by the IFDOS line at function entry).
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Callee-saved registers used as locals.  They are pushed at function entry
C and popped again at the ret label.
C   i   inner loop index
C   j   outer loop counter
C   mp  modulus pointer, biased beyond the end of the modulus
C   q0  current quotient limb
define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Alignment requested in front of the four inner loops and the n = 3 code.
define(`ALIGNx', `ALIGN(16)')
82
ASM_START()
	TEXT
	ALIGN(32)
C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Reads the 2n limbs at up and the n limbs at mp_param.  n passes are made.
C In each pass a quotient limb q0 = (lowest remaining limb of up) * u0inv is
C formed and q0 times the modulus is added to n limbs of up.  The lowest sum
C limb of a pass is not stored; the carry out of the highest limb is stored in
C its place.  Presumably u0inv is chosen so that the dropped limb is always
C zero (Montgomery style reduction, going by the function name) -- confirm
C against the caller.
C
C For n > 2 the result is produced by mpn_add_n, which adds the n saved carry
C limbs to the high n limbs of up and stores the sum at rp.  Its return value
C is left in rax.  For n = 1 and n = 2 the result limbs are stored to rp
C directly and rax holds the final carry.
C
C n = 0 is not tested for anywhere in this function.
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')	C DOS64 passes u0inv on the stack
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	8(mp_param,n,8), mp	C mp = address of limb n+1 of the modulus
	lea	8(up,n,8), up		C up biased by the same amount
	neg	n			C n negated, so -8(mp,n,8) is the lowest limb
	imul	u0inv, q0		C first iteration q0

C Dispatch on n mod 4, tested on the two low bits of the negated count.
	test	$1, R8(n)
	jz	L(bx0)			C n even

L(bx1):	test	$2, R8(n)
	jz	L(b3)			C n = 3 mod 4

L(b1):	cmp	$-1, R32(n)		C n = 1 mod 4
	jz	L(n1)			C n = 1 has its own straight-line code
111
C Outer loop for n = 1 mod 4, n > 1.  Each pass adds q0 times the modulus to
C the n limbs starting at -8(up,n,8).  The lowest sum limb is not stored, only
C its carry is propagated.  The second sum limb is kept in rbx and multiplied
C by u0inv to give the q0 of the next pass.  The first three products are
C formed here, then the inner loop is entered at L(e1).
L(otp1):lea	1(n), i			C i = n + 1, a negative multiple of 4
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %r10
	mov	%rdx, %r11
	add	%rax, %r10		C lowest sum limb, only its carry is used
	mov	(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	%rdx, %r9
	mov	(up,n,8), %rbx
	add	%rax, %rbx
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbx
	mov	%rbx, -8(up,i,8)	C next low remainder limb
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
C Inner loop, four limbs per pass.  rbp and r10 alternate as sum limbs, r9 and
C r11 alternate as carry limbs.  The loop ends when adding 4 to i produces a
C carry, which happens when i reaches zero.
L(tp1):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
L(e1):	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp1)

C Wind-down: last product and the two highest sum limbs of this pass.  i is
C zero here, so both operand forms given to I address the same limb.
L(ed1):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)
190
C n = 3 mod 4.  n = 3 itself is handled by separate straight-line code.
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Outer loop for n = 3 mod 4, n > 3.  Same scheme as the other outer loops:
C the lowest sum limb of a pass is dropped, the second one is kept in rbx and
C turned into the next q0.  The inner loop is entered at L(e3).
L(otp3):lea	3(n), i			C i = n + 3, a negative multiple of 4
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %r10
	mov	%rdx, %r11
	add	%rax, %r10		C lowest sum limb, only its carry is used
	mov	(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r9
	add	%rax, %rbx
	adc	$0, %r9
	mov	8(mp,n,8), %rax
	mul	q0
	mov	8(up,n,8), %r10
	add	%r11, %rbx
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbx, (up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
C Inner loop, four limbs per pass, ending when adding 4 to i carries, which
C happens when i reaches zero.
L(tp3):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
L(e3):	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp3)

C Wind-down: last product and the two highest sum limbs of this pass.  i is
C zero here, so both operand forms given to I address the same limb.
L(ed3):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C No jump is needed here, execution falls through to the common exit code.
C	jmp	L(cj)
273
C Common exit for n > 2.  The n carry limbs stored by the outer loops occupy
C the low half of the up area.  They are added to the high half with
C mpn_add_n (rp, up + n, up, n).  rax is not modified after the call, so the
C value returned by mpn_add_n is also the return value of this function.
L(cj):
IFSTD(`	lea	-8(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Six registers were pushed on top of the return address, which leaves rsp at
C 8 mod 16.  Under DOS64 the 56(%rsp) load at entry implies two further pushes
C inside FUNC_ENTRY, giving the same residue.  Both ABIs require rsp to be a
C multiple of 16 at a call, and DOS64 also requires 32 bytes of shadow space
C for the callee, hence 8 resp. 40 bytes are reserved around the call.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Restore the callee-saved registers and return.  The n = 1 and n = 2 paths
C jump here directly with the carry already in rax, so the stack adjustment
C above has to be undone before this label.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
295
C n even: the second lowest bit of the negated count tells n = 2 mod 4 from
C n = 0 mod 4.
L(bx0):	test	$2, R8(n)
	jnz	L(b2)

C Outer loop for n = 0 mod 4.  Same scheme as the other outer loops: the
C lowest sum limb of a pass is dropped, the second one is kept in rbx and
C turned into the next q0.  The inner loop is entered at L(e0).
L(b0):
L(otp0):lea	(n), i			C i = n, a negative multiple of 4
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	%rdx, %r9
	mov	-8(up,n,8), %rbp
	add	%rax, %rbp		C lowest sum limb, only its carry is used
	adc	$0, %r9
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,n,8), %rbp
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	%rbx, (up,n,8)
	adc	$0, %r11
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
C Inner loop, four limbs per pass, ending when adding 4 to i carries, which
C happens when i reaches zero.
L(tp0):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
L(e0):	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp0)

C Wind-down: last product and the two highest sum limbs of this pass.  i is
C zero here, so both operand forms given to I address the same limb.
L(ed0):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)
379
C n = 2 mod 4.  n = 2 itself is handled by separate straight-line code.
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Outer loop for n = 2 mod 4, n > 2.  Same scheme as the other outer loops:
C the lowest sum limb of a pass is dropped, the second one is kept in rbx and
C turned into the next q0.  The inner loop is entered at L(e2).
L(otp2):lea	2(n), i			C i = n + 2, a negative multiple of 4
	mov	-8(mp,n,8), %rax
	mul	q0
	mov	-8(up,n,8), %rbp
	mov	%rdx, %r9
	add	%rax, %rbp		C lowest sum limb, only its carry is used
	adc	$0, %r9
	mov	(mp,n,8), %rax
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	8(up,n,8), %rbp
	adc	$0, %r11
	mov	%rbx, (up,n,8)
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
C Inner loop, four limbs per pass, ending when adding 4 to i carries, which
C happens when i reaches zero.
L(tp2):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
L(e2):	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp2)

C Wind-down: last product and the two highest sum limbs of this pass.  i is
C zero here, so both operand forms given to I address the same limb.
L(ed2):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)
462
C n = 1: a single multiply, no call to mpn_add_n.  With up biased by two
C limbs, -16(up) is the low and -8(up) the high input limb.  The high sum
C limb is stored at rp and the carry out of it is returned in rax.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-16(up), %rax		C low sum limb, only its carry is used
	adc	-8(up), %rdx		C high limb + high product limb + carry
	mov	%rdx, (rp)
	mov	$0, R32(%rax)		C mov instead of xor, the carry flag must survive
	adc	R32(%rax), R32(%rax)	C rax = carry out
	jmp	L(ret)
471
C n = 2: both passes written out, no call to mpn_add_n.  Nothing is stored
C back to the up area.  With up and mp biased by three limbs, -24(up) to (up)
C are the four input limbs and -24(mp), -16(mp) the two modulus limbs.  The
C two result limbs are stored at rp and the final carry is returned in rax.
L(n2):	mov	(mp_param), %rax
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp		C lowest sum limb, only its carry is used
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	mov	-16(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11		C r11 = carry limb of the first pass
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-24(mp), %rax
	mul	q0
	add	%rax, %r10		C lowest sum limb of pass two, carry only
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	mov	-8(up), %r14		C r14 is not needed as loop index here
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx		C rdx = carry limb of the second pass
	xor	R32(%rax), R32(%rax)	C cleared before the add, xor would kill its carry
	add	%r11, %r14
	adc	(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry out
	jmp	L(ret)
507
508	ALIGNx
C n = 3: one pass with its three multiplies written out, repeated j = 3 times
C with up advanced by one limb each time, then on to the common exit code.
C With up and mp biased by four limbs, -32(up) is the lowest limb of the
C current window and -32(mp) the lowest modulus limb.
L(n3):	mov	-32(mp), %rax
	mov	-32(up), %r10
	mul	q0
	add	%rax, %r10		C lowest sum limb, only its carry is used
	mov	-24(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax
	add	%r11, %rbp
	mov	-16(up), %r10
	adc	$0, %r9
	mul	q0			C last use of the current q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -24(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -16(up)
	mov	%r11, -32(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)
	jmp	L(cj)
EPILOGUE()
ASM_END()
542