1*671ea119Smrgdnl  AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
2*671ea119Smrg
3*671ea119Smrgdnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
4*671ea119Smrg
5*671ea119Smrgdnl  This file is part of the GNU MP Library.
6*671ea119Smrgdnl
7*671ea119Smrgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8*671ea119Smrgdnl  it under the terms of either:
9*671ea119Smrgdnl
10*671ea119Smrgdnl    * the GNU Lesser General Public License as published by the Free
11*671ea119Smrgdnl      Software Foundation; either version 3 of the License, or (at your
12*671ea119Smrgdnl      option) any later version.
13*671ea119Smrgdnl
14*671ea119Smrgdnl  or
15*671ea119Smrgdnl
16*671ea119Smrgdnl    * the GNU General Public License as published by the Free Software
17*671ea119Smrgdnl      Foundation; either version 2 of the License, or (at your option) any
18*671ea119Smrgdnl      later version.
19*671ea119Smrgdnl
20*671ea119Smrgdnl  or both in parallel, as here.
21*671ea119Smrgdnl
22*671ea119Smrgdnl  The GNU MP Library is distributed in the hope that it will be useful, but
23*671ea119Smrgdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24*671ea119Smrgdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25*671ea119Smrgdnl  for more details.
26*671ea119Smrgdnl
27*671ea119Smrgdnl  You should have received copies of the GNU General Public License and the
28*671ea119Smrgdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29*671ea119Smrgdnl  see https://www.gnu.org/licenses/.
30*671ea119Smrg
31*671ea119Smrginclude(`../config.m4')
32*671ea119Smrg
33*671ea119SmrgC	    cycles/limb
34*671ea119SmrgC AMD K8,K9	 1.77
35*671ea119SmrgC AMD K10	 1.76\1.82
36*671ea119SmrgC AMD bd1	 1.67\2.12
37*671ea119SmrgC AMD bd2	 1.62\1.82
38*671ea119SmrgC AMD bd3
39*671ea119SmrgC AMD bd4	 1.55\2.2
40*671ea119SmrgC AMD zen
41*671ea119SmrgC AMD bt1	 2.54
42*671ea119SmrgC AMD bt2	 2
43*671ea119SmrgC Intel P4	11
44*671ea119SmrgC Intel PNR	 4.76
45*671ea119SmrgC Intel NHM	 5.27
46*671ea119SmrgC Intel SBR	 2
47*671ea119SmrgC Intel IBR	 1.94
48*671ea119SmrgC Intel HWL	 1.63
49*671ea119SmrgC Intel BWL	 1.51
50*671ea119SmrgC Intel SKL	 1.51
51*671ea119SmrgC Intel atom	 3.56
52*671ea119SmrgC Intel SLM	 4
53*671ea119SmrgC VIA nano
54*671ea119Smrg
55*671ea119SmrgC The loop of this code is the result of running a code generation and
56*671ea119SmrgC optimization tool suite written by David Harvey and Torbjorn Granlund.
57*671ea119Smrg
58*671ea119SmrgC INPUT PARAMETERS
C Argument registers under the SysV AMD64 ABI; the trailing comments give
C the registers (or stack slot) in which the same arguments arrive under
C the Microsoft x64 (DOS64) ABI before FUNC_ENTRY(4) moves them into place.
59*671ea119Smrgdefine(`rp',	`%rdi')	C rcx
60*671ea119Smrgdefine(`up',	`%rsi')	C rdx
61*671ea119Smrgdefine(`vp',	`%rdx')	C r8
62*671ea119Smrgdefine(`n',	`%rcx')	C r9
63*671ea119Smrgdefine(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
64*671ea119Smrg
C Select the carry-propagating instruction (adc vs sbb) and the public
C entry-point names according to which operation this file is being
C assembled as; one source yields both the add and the sub functions.
65*671ea119Smrgifdef(`OPERATION_add_n', `
66*671ea119Smrg	define(ADCSBB,	      adc)
67*671ea119Smrg	define(func,	      mpn_add_n)
68*671ea119Smrg	define(func_nc,	      mpn_add_nc)')
69*671ea119Smrgifdef(`OPERATION_sub_n', `
70*671ea119Smrg	define(ADCSBB,	      sbb)
71*671ea119Smrg	define(func,	      mpn_sub_n)
72*671ea119Smrg	define(func_nc,	      mpn_sub_nc)')
73*671ea119Smrg
C Declare the four entry points this source file can provide.
74*671ea119SmrgMULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
75*671ea119Smrg
76*671ea119SmrgABI_SUPPORT(DOS64)
77*671ea119SmrgABI_SUPPORT(STD64)
78*671ea119Smrg
C mp_limb_t func(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C Computes rp[] = up[] +/- vp[] over n limbs (ADCSBB = adc or sbb) and
C returns the final carry/borrow (0 or 1) in %rax.  The loop is unrolled
C four ways; %rax doubles as the limb index used in all addressing.
79*671ea119SmrgASM_START()
80*671ea119Smrg	TEXT
81*671ea119Smrg	ALIGN(16)
82*671ea119SmrgPROLOGUE(func)
83*671ea119Smrg	FUNC_ENTRY(4)
C Plain entry: no incoming carry.  func_nc enters at L(ent) below with
C its carry argument already in %r8.
84*671ea119Smrg	xor	%r8, %r8
C Dispatch on n mod 4.  Each feed-in path sets %rax to a different index
C bias (0..3) so that all four paths can share one unrolled loop body,
C and converts the carry value in %r8 into CF via neg (CF=1 iff %r8!=0),
C freeing %r8 for reuse as a limb register.
85*671ea119SmrgL(ent):	test	$1, R8(n)
86*671ea119Smrg	jnz	L(bx1)
87*671ea119Smrg
88*671ea119SmrgL(bx0):	test	$2, R8(n)
89*671ea119Smrg	jnz	L(b10)
90*671ea119Smrg
C n ≡ 0 (mod 4): preload two limbs, enter loop at L(lo0).
91*671ea119SmrgL(b00):	shr	$2, n
92*671ea119Smrg	neg	%r8
93*671ea119Smrg	mov	$3, R32(%rax)
94*671ea119Smrg	mov	(up), %r10
95*671ea119Smrg	mov	8(up), %r11
96*671ea119Smrg	jmp	L(lo0)
97*671ea119Smrg
C n ≡ 2 (mod 4): jrcxz tests n==0 (i.e. n was exactly 2) without
C touching flags, so the CF set up by neg survives into the tail.
98*671ea119SmrgL(b10):	shr	$2, n
99*671ea119Smrg	neg	%r8
100*671ea119Smrg	mov	$1, R32(%rax)
101*671ea119Smrg	mov	(up), %r8
102*671ea119Smrg	mov	8(up), %r9
103*671ea119Smrg	jrcxz	L(cj2)
104*671ea119Smrg	jmp	L(top)
105*671ea119Smrg
106*671ea119SmrgL(bx1):	test	$2, R8(n)
107*671ea119Smrg	jnz	L(b11)
108*671ea119Smrg
C n ≡ 1 (mod 4): single-limb case goes straight to the final tail.
109*671ea119SmrgL(b01):	shr	$2, n
110*671ea119Smrg	neg	%r8
111*671ea119Smrg	mov	$0, R32(%rax)
112*671ea119Smrg	mov	(up), %r9
113*671ea119Smrg	jrcxz	L(cj1)
114*671ea119Smrg	mov	8(up), %r10
115*671ea119Smrg	jmp	L(lo1)
116*671ea119Smrg
117*671ea119Smrg	ALIGN(8)
C n ≡ 3 (mod 4): inc before shr rounds the iteration count up so the
C three leading limbs are handled by entering the loop at L(lo3).
118*671ea119SmrgL(b11):	inc	n
119*671ea119Smrg	shr	$2, n
120*671ea119Smrg	neg	%r8
121*671ea119Smrg	mov	$2, R32(%rax)
122*671ea119Smrg	mov	(up), %r11
123*671ea119Smrg	jmp	L(lo3)
124*671ea119Smrg
C Main loop: four limbs per iteration.  Only CF-preserving instructions
C separate the ADCSBBs: mov and lea leave flags alone, and dec updates
C ZF for the jnz but leaves CF intact, keeping the carry chain unbroken.
125*671ea119Smrg	ALIGN(4)
126*671ea119SmrgL(top):	mov	8(up,%rax,8), %r10
127*671ea119Smrg	ADCSBB	-8(vp,%rax,8), %r8
128*671ea119Smrg	mov	%r8, -8(rp,%rax,8)
129*671ea119SmrgL(lo1):	mov	16(up,%rax,8), %r11
130*671ea119Smrg	ADCSBB	(vp,%rax,8), %r9
C Advance the index by 4 limbs without touching flags; displacements
C below are rebased accordingly.
131*671ea119Smrg	lea	4(%rax), %rax
132*671ea119Smrg	mov	%r9, -32(rp,%rax,8)
133*671ea119SmrgL(lo0):	ADCSBB	-24(vp,%rax,8), %r10
134*671ea119Smrg	mov	%r10, -24(rp,%rax,8)
135*671ea119SmrgL(lo3):	ADCSBB	-16(vp,%rax,8), %r11
136*671ea119Smrg	dec	n
137*671ea119Smrg	mov	-8(up,%rax,8), %r8
138*671ea119Smrg	mov	%r11, -16(rp,%rax,8)
C NOTE(review): L(lo2) is not referenced anywhere in this file as shown;
C presumably kept for symmetry with the other feed-in labels — confirm.
139*671ea119SmrgL(lo2):	mov	(up,%rax,8), %r9
140*671ea119Smrg	jnz	L(top)
141*671ea119Smrg
C Tail: finish the last two limbs (%r8, %r9) outside the loop.
142*671ea119SmrgL(cj2):	ADCSBB	-8(vp,%rax,8), %r8
143*671ea119Smrg	mov	%r8, -8(rp,%rax,8)
144*671ea119SmrgL(cj1):	ADCSBB	(vp,%rax,8), %r9
145*671ea119Smrg	mov	%r9, (rp,%rax,8)
146*671ea119Smrg
C Materialise the final carry/borrow as the return value: mov does not
C affect flags, so adc $0 captures CF from the last ADCSBB.
147*671ea119Smrg	mov	$0, R32(%rax)
148*671ea119Smrg	adc	$0, R32(%rax)
149*671ea119Smrg
150*671ea119Smrg	FUNC_EXIT()
151*671ea119Smrg	ret
152*671ea119SmrgEPILOGUE()
153*671ea119Smrg
C mp_limb_t func_nc(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n,
C                   mp_limb_t cy)
C Carry-in variant: identical to func except that the initial carry is
C taken from the fifth argument (in %r8 under SysV) instead of zero,
C then control joins the shared body at L(ent).
154*671ea119Smrg	ALIGN(16)
155*671ea119SmrgPROLOGUE(func_nc)
156*671ea119Smrg	FUNC_ENTRY(4)
C Under DOS64 the fifth argument lives on the stack; 56(%rsp) is
C presumably the documented rsp+40 slot plus the 16 bytes FUNC_ENTRY(4)
C pushes — NOTE(review): confirm against x86/x86_64-defs.m4.
157*671ea119SmrgIFDOS(`	mov	56(%rsp), %r8	')
158*671ea119Smrg	jmp	L(ent)
159*671ea119SmrgEPILOGUE()
160