dnl  AMD64 mpn_add_n, mpn_sub_n

dnl  Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 1.5
C AMD K10	 1.5
C AMD bd1	 1.8
C AMD bobcat	 2.5
C Intel P4
C Intel core2	 4.9
C Intel NHM	 5.5
C Intel SBR	 1.61
C Intel IBR	 1.61
C Intel atom	 4
C VIA nano	 3.25

C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C INPUT PARAMETERS
C
C Symbolic names for the argument registers used throughout this file.  The
C register or stack slot in each trailing comment is presumably where the same
C argument arrives under the DOS64 ABI -- confirm against FUNC_ENTRY.
define(`rp',	`%rdi')	C rcx
define(`up',	`%rsi')	C rdx
define(`vp',	`%rdx')	C r8
define(`n',	`%rcx')	C r9
define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)

C Operation selection.  This one source provides both the add and the subtract
C routines: ADCSBB is the limb instruction (adc or sbb), func is the entry
C point without carry-in and func_nc the entry point with carry-in.  If neither
C OPERATION_add_n nor OPERATION_sub_n is defined, the three names stay
C undefined.
ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

C All four entry points this file can provide.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C func_nc -- mpn_add_nc or mpn_sub_nc, depending on the OPERATION_* define.
C
C Entry point with a caller-supplied carry-in.  It holds no arithmetic of its
C own: it establishes the state below and then jumps to L(lt4) or L(mid),
C which are located inside func.
C
C In:   rp = destination, up and vp = sources, n = limb count,
C       cy = carry-in (only bit 0 is examined)
C State handed over to L(lt4) and L(mid):
C       rcx = n / 4	number of complete 4-limb groups
C       eax = n mod 4	leftover limbs
C       CF  = bit 0 of cy
C       r8, r9		first two limbs of up (L(mid) path only)
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
C On DOS64 the fifth argument is fetched from the stack.  NOTE(review): the
C offset is 56 rather than the rsp+40 noted at the cy define, presumably
C because FUNC_ENTRY has pushed 16 bytes by now -- confirm.
IFDOS(`	mov	56(%rsp), %r8	')
	mov	R32(n), R32(%rax)	C only the low two bits get used
	shr	$2, n			C rcx = number of 4-limb groups
	and	$3, R32(%rax)		C eax = leftover limb count, 0..3
	bt	$0, %r8			C cy flag <- carry parameter
	jrcxz	L(lt4)			C fewer than 4 limbs; CF is not touched

	mov	(up), %r8
	mov	8(up), %r9
	dec	n			C keeps CF; ZF set if this is the only group
	jmp	L(mid)

EPILOGUE()
	ALIGN(16)
C func -- mpn_add_n or mpn_sub_n, depending on the OPERATION_* define.
C
C Combines the n-limb operands at up and vp limb by limb with ADCSBB, stores
C the n result limbs at rp and returns the final carry (or borrow) in rax.
C
C In:   rp = destination, up and vp = sources, n = limb count
C Out:  rax = 0 or 1, the carry flag after the most significant limb
C
C Register use:
C   rcx     n / 4, number of 4-limb groups still to be processed
C   eax     n mod 4, leftover limbs, handled at L(lt4) after the groups
C   r8-r11  limbs loaded from up
C   CF      the carry chain.  Between two ADCSBB runs only mov, lea, inc, dec,
C           jrcxz and jumps are executed, none of which modify CF.
C
C n must be at least 1.  With n = 0 control reaches L(lt4) with eax = 0, both
C dec instructions leave it nonzero, and L(3) reads and writes three limbs.
PROLOGUE(func)
	FUNC_ENTRY(4)
	mov	R32(n), R32(%rax)	C only the low two bits get used
	shr	$2, n			C rcx = number of 4-limb groups
	and	$3, R32(%rax)		C eax = leftover limbs; and also clears CF,
					C which is the zero carry-in
	jrcxz	L(lt4)			C fewer than 4 limbs in total

	mov	(up), %r8
	mov	8(up), %r9
	dec	n			C keeps CF; ZF set if this is the only group
	jmp	L(mid)

C Tail: 1, 2 or 3 limbs, selected by eax.  Entered directly when n < 4, from
C func_nc, or from the end of the group code below with the pointers already
C advanced past the groups.
L(lt4):	dec	R32(%rax)		C ZF set if exactly one limb; CF kept
	mov	(up), %r8
	jnz	L(2)
	ADCSBB	(vp), %r8
	mov	%r8, (rp)
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret

L(2):	dec	R32(%rax)		C ZF set if exactly two limbs; CF kept
	mov	8(up), %r9
	jnz	L(3)
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret

L(3):	mov	16(up), %r10
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	setc	R8(%rax)		C eax is 1 here (3 less two dec), so the
					C upper bits are zero and rax = CF
	FUNC_EXIT()
	ret

C Main loop, four limbs per pass.  The loads of the next group are interleaved
C with the stores of the current one.  Pointers are advanced with lea and the
C counter with dec so that CF survives from one pass to the next.  The ZF
C produced by dec n is still intact at the jnz, since only mov and lea
C instructions lie in between.
	ALIGN(16)
L(top):	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	dec	n
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
C Entry into the loop: r8 and r9 are already loaded and the flags come from
C the dec n executed just before the jump here.
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(top)

C Last complete group.  All three pointers are advanced so that L(lt4) can
C continue with any leftover limbs.
L(end):	lea	32(up), up
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	lea	32(rp), rp

C Test eax for zero without disturbing CF: inc followed by dec restores the
C value and leaves ZF set exactly when eax is 0.
	inc	R32(%rax)
	dec	R32(%rax)
	jnz	L(lt4)			C leftover limbs remain
	adc	R32(%rax), R32(%rax)	C eax is 0 here, so eax = CF
	FUNC_EXIT()
	ret
EPILOGUE()
