1dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C This is an attempt at a sublsh1_n for x86-32, not relying on SSE2 insns.  The
36C inner loop is 2*4-way unrolled, which is the best we can do with the available
37C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
38C cannot feed carry between operations there.
39
40C			    cycles/limb
41C P5
42C P6 model 0-8,10-12
43C P6 model 9  (Banias)
44C P6 model 13 (Dothan)
45C P4 model 0  (Willamette)
46C P4 model 1  (?)
47C P4 model 2  (Northwood)
48C P4 model 3  (Prescott)
49C P4 model 4  (Nocona)
50C Intel Atom			 6.75
51C AMD K6
52C AMD K7
53C AMD K8
54
55C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
56C processors.  It uses 2*4-way unrolling, for good reasons.
57C
58C Breaking the carry recurrence might be a good idea.  We would then need
59C separate registers for the shift carry and the add/subtract carry, which in
60C turn would force us to 2*2-way unrolling.
61
dnl  Incoming stack arguments: limb count, source pointer, destination pointer.
dnl  The second defframe argument is the byte offset of the argument.
dnl  NOTE(review): presumably relative to the stack pointer at function entry
dnl  and corrected through FRAME after each push -- confirm against the
dnl  defframe definition.
62defframe(PARAM_SIZE,	12)
63defframe(PARAM_SRC,	 8)
64defframe(PARAM_DST,	 4)
65
66dnl  Re-use parameter space: each argument is copied into a register before
dnl  its stack slot is overwritten, so the slots double as scratch storage.
dnl    VAR_COUNT  counter of the unrolled loop   (slot of PARAM_SIZE)
dnl    SAVE_EBX   saved ebx                      (slot of PARAM_SRC)
dnl    SAVE_EBP   saved ebp                      (slot of PARAM_DST)
67define(VAR_COUNT,`PARAM_SIZE')
68define(SAVE_EBX,`PARAM_SRC')
69define(SAVE_EBP,`PARAM_DST')
70
C ---------------------------------------------------------------------------
C mpn_sublsh1_n_ip1 -- in place rp[] = rp[] - (up[] << 1), working on 32-bit
C limbs from the lowest address upwards.
C
C Stack arguments: PARAM_DST (destination), PARAM_SRC (source), PARAM_SIZE
C (limb count).  The result is returned in eax: the sum of the two carry bits
C still pending after the last limb, i.e. 0, 1 or 2.
C
C Register use:
C   esi (up)   source pointer
C   edi (rp)   destination pointer
C   eax        negated count of 8-limb blocks, then scratch, then return value
C   ebx        size % 8 counter for L(oop), then scratch
C   ecx, ebp   scratch
C   edx        parking place for one carry bit while CF holds the other
C esi and edi are saved with push/pop.  ebx and ebp are saved in argument
C slots that have already been read (SAVE_EBX, SAVE_EBP).
C
C Carry handling: two carries are live between limbs, the bit shifted out by
C the doubling (adc r,r) and the borrow out of the subtraction (sbb).  Both
C have the same weight, so it does not matter which one enters the adc chain
C and which one enters the sbb chain.  One of them lives in CF, the other is
C parked in edx; rcr and adc edx,edx exchange them.  The loop control
C instructions used here (dec, inc, incl, lea, mov) do not modify CF, so in
C the default build CF simply survives from one iteration to the next.
C
C With CPU_P6 defined, both carries are folded into edx at the end of every
C iteration and CF is rebuilt with shr at the top.
C NOTE(review): presumably this avoids depending on CF after inc/dec on P6
C (partial flags) -- confirm.
C ---------------------------------------------------------------------------
71ASM_START()
72	TEXT
73	ALIGN(8)
74PROLOGUE(mpn_sublsh1_n_ip1)
75deflit(`FRAME',0)
76
77define(`rp',  `%edi')
78define(`up',  `%esi')
79
80	mov	PARAM_SIZE, %eax	C eax = size (limb count)
81	push	up			FRAME_pushl()
82	push	rp			FRAME_pushl()
83	xor	%edx, %edx		C no carry parked in edx yet
84	mov	PARAM_SRC, up		C up = source pointer
85	mov	PARAM_DST, rp		C rp = destination pointer
86	mov	%ebx, SAVE_EBX		C slot of PARAM_SRC is free now, keep ebx there
87	mov	%eax, %ebx
88	shr	$3, %eax		C eax = size / 8 = number of 8-limb blocks
89
90	not	%eax			C eax = -(size/8) - 1, the inc at L(exact) completes the negation
91	and	$7, %ebx		C ebx = size % 8, and leaves CF = 0
92	jz	L(exact)		C no leftover limbs, go straight to the blocks
93
C Leftover loop: handles the first size % 8 limbs one at a time.
C On entry to each iteration (default build): CF = one pending carry,
C edx bit 0 = the other pending carry.
94L(oop):
95ifdef(`CPU_P6',`
96	shr	%edx ')			C CPU_P6 only: CF = carry saved in edx bit 0
97	mov	(up), %ecx
98	adc	%ecx, %ecx		C ecx = 2*limb + CF, CF = bit shifted out
99	rcr	%edx			C CF = carry parked in edx bit 0, shifted-out bit goes to bit 31
100	lea	4(up), up
101	sbb	%ecx, (rp)		C (rp) -= ecx + CF, CF = borrow
102	lea	4(rp), rp
103	adc	%edx, %edx		C edx bit 0 = borrow, CF = bit that was parked in bit 31
104ifdef(`CPU_P6',`
105	adc	%edx, %edx ')		C CPU_P6 only: fold CF into edx as well (two bits saved)
106	dec	%ebx			C dec does not modify CF
107	jnz	L(oop)
108L(exact):
109	inc	%eax			C eax = -(size/8), inc does not modify CF
110	jz	L(end)			C no full 8-limb block to process
111	mov	%eax, VAR_COUNT		C counter moves to memory, eax becomes scratch
112	mov	%ebp, SAVE_EBP		C ebp becomes scratch as well
113
114	ALIGN(16)
C Main loop: 8 limbs per iteration, as two groups of four.
115L(top):
116ifdef(`CPU_P6',`
117	shr	%edx ')			C CPU_P6 only: CF = carry saved in edx bit 0
C Group 1, doubling: each adc r,r gives 2*limb + carry-in and passes the
C shifted-out top bit on through CF.
118	mov	(up), %eax
119	adc	%eax, %eax
120	mov	4(up), %ebx
121	adc	%ebx, %ebx
122	mov	8(up), %ecx
123	adc	%ecx, %ecx
124	mov	12(up), %ebp
125	adc	%ebp, %ebp
126
127	rcr	%edx			C CF = carry parked in edx bit 0, group 1 shift-out goes to bit 31
128
C Group 1, subtraction: CF runs through the four sbb as the borrow.
129	sbb	%eax, (rp)
130	sbb	%ebx, 4(rp)
131	sbb	%ecx, 8(rp)
132	sbb	%ebp, 12(rp)
133
C Group 2, doubling: the carry-in of the first adc is the borrow left by
C the sbb chain above (the two carries are interchangeable).
134	mov	16(up), %eax
135	adc	%eax, %eax
136	mov	20(up), %ebx
137	adc	%ebx, %ebx
138	mov	24(up), %ecx
139	adc	%ecx, %ecx
140	mov	28(up), %ebp
141	adc	%ebp, %ebp
142
143	lea	32(up), up		C lea does not touch the flags
144	adc	%edx, %edx		C edx bit 0 = group 2 shift-out, CF = group 1 shift-out from bit 31
145
C Group 2, subtraction: starts with the group 1 shift-out as borrow-in.
146	sbb	%eax, 16(rp)
147	sbb	%ebx, 20(rp)
148	sbb	%ecx, 24(rp)
149	sbb	%ebp, 28(rp)
150
151ifdef(`CPU_P6',`
152	adc	%edx, %edx ')		C CPU_P6 only: fold CF into edx as well (two bits saved)
153	incl	VAR_COUNT		C counts up towards zero, does not modify CF
154	lea	32(rp), rp		C flags from incl are still intact for jne
155	jne	L(top)
156
157	mov	SAVE_EBP, %ebp		C ebp was only saved when the main loop ran
158L(end):
159	mov	SAVE_EBX, %ebx
160
C Return value = sum of the two pending carries.
C CPU_P6: both are in edx, so shift one out into CF and add it to the other.
C Otherwise: one is in edx and one is still in CF (mov does not alter flags).
161ifdef(`CPU_P6',`
162	xor	%eax, %eax
163	shr	$1, %edx
164	adc	%edx, %eax
165',`
166	adc	$0, %edx
167	mov	%edx, %eax
168')
169	pop	rp			FRAME_popl()
170	pop	up			FRAME_popl()
171	ret
172EPILOGUE()
173ASM_END()
174