1dnl  X86-64 mpn_redc_1 optimised for Intel Atom.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 ?
37C AMD K10	 ?
38C AMD bull	 ?
39C AMD pile	 ?
40C AMD steam	 ?
41C AMD bobcat	 5.0
42C AMD jaguar	 ?
43C Intel P4	 ?
44C Intel core	 ?
45C Intel NHM	 ?
46C Intel SBR	 ?
47C Intel IBR	 ?
48C Intel HWL	 ?
49C Intel BWL	 ?
50C Intel atom	 ?
51C VIA nano	 ?
52
53C TODO
54C  * Micro-optimise, none performed thus far.
55C  * Consider inlining mpn_add_n.
C  * Single out the basecases before the pushes.
57C  * Make lead-in code for the inner loops be more similar.
58
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C I is given two spellings of the same memory operand.  As defined here it
C expands to the first one, which in the wind-down code is the spelling
C without the index register.
define(`I',`$1')

C Incoming arguments.  The register named in each trailing comment is
C presumably where the DOS64 ABI passes the same argument before FUNC_ENTRY
C remaps it -- confirm against the FUNC_ENTRY definition.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

C Working registers.  i is the inner-loop limb index, j the outer-loop pass
C count, mp the modulus pointer after it has been advanced by n limbs, and q0
C the current quotient limb.  w0-w3 name the four product registers, but the
C code in this file spells those registers out directly.
define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')
define(`w0',          `%rbp')
define(`w1',          `%r9')
define(`w2',          `%r10')
define(`w3',          `%r11')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Alignment placed in front of the inner-loop heads and of L(n3).
define(`ALIGNx', `ALIGN(16)')
84
ASM_START()
	TEXT
	ALIGN(32)
C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Entry code.  Saves the six registers that are restored at L(ret), computes
C the first quotient limb as up[0] * u0inv, and then dispatches on n mod 4 to
C one of four outer loops.  Each outer loop has a straight-line special case
C for the smallest n of its residue class (1, 2, 3 or 4 limbs).
C
C After the setup below:
C   j  = n, the number of outer-loop passes
C   mp = mp_param + 8n and up = up + 8n, both just past their first n limbs
C   n  = -n, so that (mp,n,8) and (up,n,8) address limb 0
C
C The value returned in rax is either the carry computed inline by the small
C cases or whatever mpn_add_n leaves there when the L(cj) tail is used.
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C The fifth argument is fetched from the stack under DOS64.  The offset 56
C presumably accounts for what FUNC_ENTRY pushed -- confirm against its
C definition.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

C Bit 0 of the negated count equals bit 0 of the count itself: even n goes to
C L(bx0), odd n falls through to L(bx1).
	test	$1, R8(n)
	jz	L(bx0)
107
C Odd n.  n holds the negated count, so bit 1 set here means n = 1 mod 4 and
C bit 1 clear means n = 3 mod 4.
L(bx1):	test	$2, R8(n)
	jz	L(b3)

C n = 1 mod 4.  n = 1 itself is handled by the straight-line code at L(n1),
C since the lead-in below reads five modulus limbs.
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Outer loop, one pass per quotient limb, j passes in total.  A pass adds q0
C times the n modulus limbs to the n limbs starting at (up,n,8).  The lowest
C sum limb is not stored, and the carry-out limb is written into its slot at
C the end of the pass.
C
C Lead-in: the first four products are formed here, interleaved with the
C additions for the two lowest limbs, before entering the unrolled inner loop
C at L(e1).
L(otp1):lea	1(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
C Only the carry out of this addition is used; the sum in rbp is overwritten
C by the very next instruction.
	add	(up,n,8), %rbp
	mov	%rax, %rbp
	adc	%r9, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
C The second limb is final for this pass.  It is stored, and rbx is then
C multiplied by u0inv to give the quotient limb for the next pass.
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	32(mp,n,8), %rax
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

C Inner loop, unrolled four limbs per trip.  rbp, r9, r10 and r11 rotate as
C holders of the low and high product halves.  The loop runs while i, which
C starts negative and grows by 4 per trip, is still negative.
	ALIGNx
L(tp1):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
C The mov between the add and the js does not touch the flags, so js tests
C the sign of the updated i.
L(e1):	add	$4, i
	mov	%rdx, %r10
	js	L(tp1)

C Wind-down: the last product is formed and the pending limbs are added into
C the top three limbs of the window.  i is zero here, which is why the
C spelling without the index register chosen by I is equivalent.
L(ed1):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
C The carry-out limb of this pass goes into the slot of the lowest limb of
C the window, then the window moves up by one limb.
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)
188
C n = 3 mod 4.  n = 3 itself is handled by the dedicated loop at L(n3), since
C the lead-in below reads five modulus limbs.
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Outer loop, one pass per quotient limb, j passes in total.  A pass adds q0
C times the n modulus limbs to the n limbs starting at (up,n,8).  The lowest
C sum limb is not stored, and the carry-out limb is written into its slot at
C the end of the pass.
C
C Lead-in: the first four products are formed here, interleaved with the
C additions for the two lowest limbs, before entering the unrolled inner loop
C at L(e3).
L(otp3):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
C Only the carry out of this addition is used; the sum in rbp is overwritten
C by the very next instruction.
	add	(up,n,8), %rbp
	mov	%rax, %rbp
	mov	24(mp,n,8), %rax
	adc	%r9, %rbx
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
C The second limb is final for this pass.  It is stored, and rbx is then
C multiplied by u0inv to give the quotient limb for the next pass.
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	mov	32(mp,n,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

C Inner loop, unrolled four limbs per trip and entered at its midpoint L(e3).
C rbp, r9, r10 and r11 rotate as holders of the low and high product halves.
C The loop runs while i, which starts negative and grows by 4 per trip, is
C still negative.
	ALIGNx
L(tp3):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
L(e3):	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
C The mov between the add and the js does not touch the flags, so js tests
C the sign of the updated i.
	add	$4, i
	mov	%rdx, %r10
	js	L(tp3)

C Wind-down: the last product is formed and the pending limbs are added into
C the top three limbs of the window.  i is zero here, which is why the
C spelling without the index register chosen by I is equivalent.
L(ed3):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
C The carry-out limb of this pass goes into the slot of the lowest limb of
C the window, then the window moves up by one limb.
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C No jump is needed here: execution falls through into L(cj), which follows.
C	jmp	L(cj)
267
C Common tail for the looping paths.  Here up has been advanced by 2n limbs
C from its value on entry (n in the entry code, one more per outer pass) and
C n still holds the negated limb count.  The n carry-out limbs that the passes
C stored in the low half of the operand are added to its high half, and the
C sum is written to rp.  In terms of the entry value of up the call is
C mpn_add_n (rp, up + n, up, n).
C
C Only the low 32 bits of n are negated; that write also clears the upper
C half of the register.
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C Both ABIs require the stack pointer to be 16-byte aligned at a call.  The
C return address plus the six pushes made on entry leave it 8 bytes off, and
C the 16 bytes that FUNC_ENTRY adds under DOS64 do not change that.  STD64
C therefore needs 8 bytes of padding.  DOS64 needs the same padding plus the
C 32 bytes of shadow space a caller must reserve for its callee; without that
C space a callee that spills its register arguments would overwrite the
C registers saved on entry.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	CALL(	mpn_add_n)
C The adjustment is undone with add, which changes flags but not rax, so the
C value returned by mpn_add_n is passed on to our caller.
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared exit, also used by the small cases that compute their result inline.
C Registers are restored in the reverse of the order they were pushed.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
289
C Even n.  n holds the negated count, so bit 1 set here means n = 2 mod 4 and
C bit 1 clear means n = 0 mod 4.
L(bx0):	test	$2, R8(n)
	jnz	L(b2)

C n = 0 mod 4.  n = 4 itself is handled by the fully unrolled loop at L(n4),
C since the lead-in below reads five modulus limbs.
L(b0):	cmp	$-4, R32(n)
	jz	L(n4)

C Outer loop, one pass per quotient limb, j passes in total.  A pass adds q0
C times the n modulus limbs to the n limbs starting at (up,n,8).  The lowest
C sum limb is not stored, and the carry-out limb is written into its slot at
C the end of the pass.
C
C Lead-in: the first four products are formed here, interleaved with the
C additions for the two lowest limbs, before entering the unrolled inner loop
C at L(e0).
L(otp0):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
C Only the carry out of this addition is used; the sum in r11 is overwritten
C by the very next instruction.
	add	(up,n,8), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
C The second limb is final for this pass.  It is stored, and rbx is then
C multiplied by u0inv to give the quotient limb for the next pass.
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

C Inner loop, unrolled four limbs per trip and entered at L(e0), one product
C into the body.  rbp, r9, r10 and r11 rotate as holders of the low and high
C product halves.  The loop runs while i, which starts negative and grows by
C 4 per trip, is still negative.
	ALIGNx
L(tp0):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
L(e0):	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
C The mov between the add and the js does not touch the flags, so js tests
C the sign of the updated i.
	add	$4, i
	mov	%rdx, %r10
	js	L(tp0)

C Wind-down: the last product is formed and the pending limbs are added into
C the top three limbs of the window.  i is zero here, which is why the
C spelling without the index register chosen by I is equivalent.
L(ed0):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
C The carry-out limb of this pass goes into the slot of the lowest limb of
C the window, then the window moves up by one limb.
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)
371
C n = 2 mod 4.  n = 2 itself is handled by the straight-line code at L(n2),
C since the lead-in below reads five modulus limbs.
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Outer loop, one pass per quotient limb, j passes in total.  A pass adds q0
C times the n modulus limbs to the n limbs starting at (up,n,8).  The lowest
C sum limb is not stored, and the carry-out limb is written into its slot at
C the end of the pass.
C
C Lead-in: the first four products are formed here, interleaved with the
C additions for the two lowest limbs, before entering the unrolled inner loop
C at L(e2).
L(otp2):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
C Only the carry out of this addition is used; the sum in r11 is overwritten
C by the very next instruction.
	add	(up,n,8), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
C The second limb is final for this pass.  It is stored, and rbx is then
C multiplied by u0inv to give the quotient limb for the next pass.
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

C Inner loop, unrolled four limbs per trip and entered at L(e2), three
C products into the body.  rbp, r9, r10 and r11 rotate as holders of the low
C and high product halves.  The loop runs while i, which starts negative and
C grows by 4 per trip, is still negative.
	ALIGNx
L(tp2):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
L(e2):	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
C The mov between the add and the js does not touch the flags, so js tests
C the sign of the updated i.
	add	$4, i
	mov	%rdx, %r10
	js	L(tp2)

C Wind-down: the last product is formed and the pending limbs are added into
C the top three limbs of the window.  i is zero here, which is why the
C spelling without the index register chosen by I is equivalent.
L(ed2):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
C The carry-out limb of this pass goes into the slot of the lowest limb of
C the window, then the window moves up by one limb.
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)
450
C n = 1.  The entry code advanced up by one limb, so -8(up) and (up) are the
C first and second limb of the operand.  The single result limb is stored at
C rp and the carry is returned in rax.
L(n1):	mov	(mp_param), %rax
	mul	q0
C The low sum limb is not stored; only its carry feeds the next addition.
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
C rax is cleared with mov rather than xor because mov leaves the carry flag
C from the adc above intact for the adc below.
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
459
C n = 2, done as straight-line code without writing back to the operand.
C The entry code advanced up and mp by two limbs, so -16(up) .. 8(up) are the
C four operand limbs, and (mp_param) or -16(mp) and -8(mp) are the two modulus
C limbs.  r14, otherwise the inner-loop index, is used as scratch.  The two
C result limbs are stored at rp and the carry is returned in rax.
C
C First pass, with the q0 computed in the entry code.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
C The sum left in rbp is never read again; only the carry is used.
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
C Second pass.  Its quotient limb comes from the updated second limb in r10;
C r11 carries the first pass over into the third limb.
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
C Again only the carry out of the low sum is used.
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
C rax is zeroed before the final add/adc chain so that the closing adc can
C capture the carry out of the top limb.
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
495
C n = 3.  A loop of j = 3 passes, each using three multiplications.  A pass
C adds q0 times the three modulus limbs to the limbs at -24(up) .. -8(up).
C The lowest sum limb is not stored, the two upper limbs are written back to
C -16(up) and -8(up), and the carry-out limb is stored at -24(up).
	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
C The sum left in r10 is discarded (r10 is reloaded below); only the carry
C is used.
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
C rbp now holds the updated second limb, from which the quotient limb of the
C next pass is formed.
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

C Final addition, done inline.  up has moved three limbs since the first
C pass, so -48(up) and -40(up) are the carry-out limbs stored by the first
C and second pass; the third is still in r11.  rbp and r10 still hold the two
C limbs written back by the last pass, and -8(up) is the limb above them.
C The three result limbs go to rp and the carry is returned in rax, which is
C zeroed before the add/adc chain so the closing adc can capture it.
	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
539
C n = 4.  A loop of j = 4 passes with the four multiplications of a pass
C fully unrolled.  A pass adds q0 times the four modulus limbs to the limbs
C at -32(up) .. -8(up).  The lowest sum limb is not stored, the upper three
C are updated in place, and the carry-out limb is stored at -32(up).  The
C closing addition is left to the common tail at L(cj).
L(n4):	mov	-32(mp), %rax
	mul	q0
	mov	%rax, %r11
	mov	-24(mp), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	-16(mp), %rax
	mov	%rdx, %r9
	mul	q0
C Only the carry out of this addition is used; the sum in r11 is overwritten
C by the very next instruction.
	add	-32(up), %r11
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	-8(mp), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
C The second limb is final for this pass.  It is stored, and rbx is then
C multiplied by u0inv to give the quotient limb for the next pass.
	add	-24(up), %rbx
	mov	%rbx, -24(up)
	adc	%r9, %r11
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	add	%r11, -16(up)
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, -8(up)
	adc	$0, %rdx
	mov	%rdx, -32(up)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
C lea does not alter the flags, so the jnz below still tests the result of
C the dec.
	dec	j
	lea	8(up), up		C up++
	jnz	L(n4)
	jmp	L(cj)
EPILOGUE()
ASM_END()
575