dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
dnl  C is 1, 2, 3.  Optimized for Pentium 4.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C	     cycles/limb
C AMD K8,K9	 3.8
C AMD K10	 3.8
C Intel P4	 5.8
C Intel core2	 4.75
C Intel corei	 4.75
C Intel atom	 ?
C VIA nano	 4.75

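C  Rough C-level sketch of the operation, assuming the including file
C  defines LSH (the shift count C), RSH = 32-LSH, ADDSUB (add or sub)
C  and func (the exported function name):
C
C	cy = 0;
C	for (i = 0; i < n; i++)
C	  {
C	    s = (vp[i] << LSH) | (i == 0 ? 0 : vp[i-1] >> (64-LSH));
C	    rp[i] = up[i] +- s +- cy;   with cy set to the carry/borrow out
C	  }
C	return cy + (vp[n-1] >> (64-LSH));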
C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

define(M, eval(m4_lshift(1,LSH)))
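C  M = 2^LSH is used as the lea scale factor below: lea (%rbp,%r9,M) forms
C  (v limb << LSH) plus the bits shifted out of the previous v limb (kept
C  in %rbp after the shr by RSH), i.e. the next shifted v limb.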

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbx
	push	%r12
	push	%rbp

	mov	(vp), %r9
	shl	$LSH, %r9
	mov	4(vp), R32(%rbp)

	xor	R32(%rbx), R32(%rbx)

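C  The main loop is 4-way unrolled; compute n mod 4 and enter the loop at
C  the matching point (L00, L01, L10 or L11) after processing the first limb.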
	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jne	L(n00)		C n = 0, 4, 8, ...

	mov	(up), %r8
	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	-16(rp), rp
	jmp	L(L00)

L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)		C n = 1, 5, 9, ...
	mov	(up), %r11
	lea	-8(rp), rp
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	dec	n
	jz	L(1)		C jump for n = 1
	mov	8(up), %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	mov	12(vp), R32(%rbp)
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

L(n01):	jne	L(n10)		C n = 2, 6, 10, ...
	mov	(up), %r12
	mov	8(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	mov	(up), %r10
	mov	8(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	12(vp), R32(%rbp)
	lea	-24(rp), rp
	lea	-8(up), up
	lea	-8(vp), vp
	jmp	L(L11)

L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

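C  Main loop.  Carry propagation avoids adc/sbb (slow on Pentium 4): setc
C  saves each carry/borrow in %rbx or %rax, the next quarter-iteration
C  applies it with a plain ADDSUB, and the rare ripple case branches out to
C  L(c0)..L(c3) to set the saved carry to 1.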
	ALIGN(16)
L(top):	mov	(up), %r8	C not on critical path
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11	C not on critical path
	mov	(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)	C save carry out
	mov	4(vp), R32(%rbp)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11	C apply previous carry out
	jc	L(c0)		C jump if ripple
L(rc0):
L(L01):	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	mov	%r11, 8(rp)
	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):
L(L00):	mov	16(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	20(vp), R32(%rbp)
	mov	%r8, 16(rp)
	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):
L(L11):	mov	24(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	24(vp), %r9
	lea	(%rbp,%r9,M), %r9
	lea	32(up), up
	lea	32(vp), vp
	setc	R8(%rax)
	mov	-4(vp), R32(%rbp)
	mov	%r10, 24(rp)
	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	lea	32(rp), rp
L(L10):	sub	$4, n
	ja	L(top)

L(end):
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11
	jnc	L(1)
	mov	$1, R8(%rbx)
L(1):	mov	%r11, 8(rp)
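C  Return value: the final carry/borrow plus the bits shifted out of the
C  top v limb (still in %rbp).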
	lea	(%rbx,%rbp), R32(%rax)
	pop	%rbp
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()
ASM_END()