dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2
C AMD K10	 2
C AMD bd1	 2.32
C AMD bobcat	 3
C Intel P4	13
C Intel core2	 2.9
C Intel NHM	 2.8
C Intel SBR	 2.4
C Intel atom	 5.33
C VIA nano	 3

C NOTES
C  * It might seem natural to use the cmov insn here, but since this function
C    is supposed to have the exact same execution pattern for cnd true and
C    false, and since cmov's documentation is not clear about whether it
C    actually reads both source operands and writes the register for a false
C    condition, we cannot use it.
C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
C    ADCSBB-to-memory, again saving 1 insn/limb.
C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
C    for any other processor.
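
C  For reference, a plain C sketch of what this file computes.  The name
C  ref_cnd_add_n and the local variable names are illustrative only, not
C  part of GMP's interface; cnd_sub_n is the same except that it subtracts
C  the masked limbs and propagates a borrow instead of a carry.
C
C	mp_limb_t
C	ref_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C		       mp_size_t n)
C	{
C	  mp_limb_t mask = -(mp_limb_t) (cnd != 0);	/* 0 or ~0 */
C	  mp_limb_t cy = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      mp_limb_t v = vp[i] & mask;		/* 0 when cnd is false */
C	      mp_limb_t s = up[i] + v;
C	      mp_limb_t r = s + cy;
C	      cy = (s < v) | (r < s);			/* carry out */
C	      rp[i] = r;
C	    }
C	  return cy;
C	}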

C INPUT PARAMETERS
define(`cnd',	`%rdi')	dnl rcx
define(`rp',	`%rsi')	dnl rdx
define(`up',	`%rdx')	dnl r8
define(`vp',	`%rcx')	dnl r9
define(`n',	`%r8')	dnl rsp+40

ifdef(`OPERATION_cnd_add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func,	      mpn_cnd_add_n)')
ifdef(`OPERATION_cnd_sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func,	      mpn_cnd_sub_n)')

MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), R32(%r8)')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	neg	cnd
	sbb	cnd, cnd		C make cnd mask
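C  neg sets CF iff cnd was nonzero; sbb of cnd with itself then leaves
C  0 (cnd false) or ~0 (cnd true) in cnd, a mask that is ANDed into every
C  vp limb so that both cases execute the identical instruction sequence.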

	lea	(vp,n,8), vp
	lea	(up,n,8), up
	lea	(rp,n,8), rp

	mov	R32(n), R32(%rax)
	neg	n
	and	$3, R32(%rax)
	jz	L(top)			C carry-save reg rax = 0 in this arc
	cmp	$2, R32(%rax)
	jc	L(b1)
	jz	L(b2)

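C  Feed-in blocks: the main loop handles four limbs per iteration, so the
C  n mod 4 leading limbs are processed here, leaving the carry saved in eax.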
L(b3):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	16(vp,n,8), %r14
	and	cnd, %r12
	mov	(up,n,8), %r10
	and	cnd, %r13
	mov	8(up,n,8), %rbx
	and	cnd, %r14
	mov	16(up,n,8), %rbp
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	ADCSBB	%r14, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$3, n
	js	L(top)
	jmp	L(end)

L(b2):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	(up,n,8), %r10
	and	cnd, %r12
	mov	8(up,n,8), %rbx
	and	cnd, %r13
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$2, n
	js	L(top)
	jmp	L(end)

L(b1):	mov	(vp,n,8), %r12
	mov	(up,n,8), %r10
	and	cnd, %r12
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$1, n
	jns	L(end)

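C  Main loop.  CF cannot survive the loop-control add below, so the carry
C  is saved as 0/-1 in eax (sbb %eax,%eax) and regenerated at the top of
C  each iteration (add %eax,%eax sets CF iff the saved value was -1).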
	ALIGN(16)
L(top):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	16(vp,n,8), %r14
	mov	24(vp,n,8), %r11
	and	cnd, %r12
	mov	(up,n,8), %r10
	and	cnd, %r13
	mov	8(up,n,8), %rbx
	and	cnd, %r14
	mov	16(up,n,8), %rbp
	and	cnd, %r11
	mov	24(up,n,8), %r9
	add	R32(%rax), R32(%rax)	C restore carry
	ADCSBB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	ADCSBB	%r14, %rbp
	mov	%rbp, 16(rp,n,8)
	ADCSBB	%r11, %r9
	mov	%r9, 24(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$4, n
	js	L(top)

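C  eax holds the saved carry/borrow as 0 or -1; negate it to return 0 or 1.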
L(end):	neg	R32(%rax)
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()