1dnl  x86 mpn_gcd_1 optimised for AMD K7.
2
3dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
4dnl  Granlund.
5
6dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36
37C	     cycles/bit (approx)
38C AMD K7	 5.31
39C AMD K8,K9	 5.33
40C AMD K10	 5.30
41C AMD bd1	 ?
42C AMD bobcat	 7.02
43C Intel P4-2	10.1
44C Intel P4-3/4	10.0
45C Intel P6/13	 5.88
46C Intel core2	 6.26
47C Intel NHM	 6.83
48C Intel SBR	 8.50
49C Intel atom	 8.90
50C VIA nano	 ?
51C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
52
53C TODO
54C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
55C  * Stream things better through registers, avoiding some copying.
56
57C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C
C The table is indexed with the low MAXSHIFT bits of a value (value AND MASK),
C so it holds MASK+1 one-byte entries: entry 0 followed by entries 1..MASK.
C Entry 0 is MAXSHIFT because, when all MAXSHIFT low bits are zero, the value
C can be shifted right by MAXSHIFT bits and looked up again.
58
59deflit(MAXSHIFT, 6)
C MASK = 2^MAXSHIFT - 1, i.e. a mask selecting the low MAXSHIFT bits.
60deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
61
C NOTE(review): the second DEF_OBJECT argument is presumably an alignment
C rather than a byte count; confirm against the DEF_OBJECT macro definition.
62DEF_OBJECT(ctz_table,64)
63	.byte	MAXSHIFT
64forloop(i,1,MASK,
65`	.byte	m4_count_trailing_zeros(i)
66')
67END_OBJECT(ctz_table)
68
69C Threshold of when to call bmod when U is one limb.  Should be about
70C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
C
C Usage in mpn_gcd_1: in the single-limb case the U limb is shifted right by
C DIV_THRES_LOG2 bits and compared (unsigned) with the odd-stripped V.  The
C reduction U mod V is skipped when V is greater than that shifted value.
C NOTE(review): the single-limb reduction is an inline div instruction, not a
C bmod call, so the wording above looks stale; confirm before retuning.
71define(`DIV_THRES_LOG2', 7)
72
73
C Register aliases for the three stack arguments of mpn_gcd_1.  They are
C loaded at function entry from 12(%esp), 16(%esp) and 20(%esp) respectively.
C %edi and %esi are pushed on entry and popped before returning.  %edx is
C reused later in the function as the y operand of the reduction loop.
74define(`up',    `%edi')
75define(`n',     `%esi')
76define(`v0',    `%edx')
77
78
C mpn_gcd_1 -- greatest common divisor of the n-limb operand at up and the
C single limb v0, computed with a table-driven binary GCD.
C
C In:    three 32-bit stack arguments, read after the two register saves at
C        12(%esp) = up, 16(%esp) = n, 20(%esp) = v0.
C Out:   %eax = result.
C Saved: %edi and %esi are pushed on entry and popped before ret; %ebx is
C        pushed and popped around the call in PIC_WITH_EBX builds.
C Clobb: %ecx, %edx, flags, plus whatever the called routines clobber.
C
C Outline:
C   1. Count the low zero bits common to up[0] and v0 (the common twos).
C   2. Strip the remaining low zero bits from v0 so that it is odd.
C   3. Reduce U to a single limb x: for n == 1 with an inline div (skipped
C      when v0 > up[0] >> DIV_THRES_LOG2), otherwise by calling mpn_mod_1 or
C      mpn_modexact_1_odd depending on BMOD_1_TO_MOD_1_THRESHOLD.
C   4. Run the binary GCD loop on x and the odd v0.
C   5. Shift the loop result left by the common twos count.
C
C NOTE(review): both bit-stripping loops spin until a 1 bit is shifted out,
C so they never terminate if (up[0] | v0) == 0 or if v0 == 0.  Presumably
C callers guarantee non-zero operands; confirm against the mpn_gcd_1 contract.
79ASM_START()
80	TEXT
81	ALIGN(16)
82PROLOGUE(mpn_gcd_1)
C Save the registers that are about to hold up and n.
83	push	%edi
84	push	%esi
85
C Two words are now on the stack above the return address, hence 12/16/20.
86	mov	12(%esp), up
87	mov	16(%esp), n
88	mov	20(%esp), v0
89
90	mov	(up), %eax		C U low limb
91	or	v0, %eax		C x | y
C %ecx starts at -1 so that the first inc below makes it 0.
92	mov	$-1, %ecx
93
C Count trailing zeros of (up[0] | v0): shift right until a 1 bit falls into
C the carry flag.  On exit %ecx is the number of low zero bits shared by both.
94L(twos):
95	inc	%ecx
96	shr	%eax
97	jnc	L(twos)
98
C Remove the shared zero bits from v0 and keep the count in %eax.
99	shr	%cl, v0
100	mov	%ecx, %eax		C common twos
101
C Strip any further low zero bits from v0.  The loop exits once a 1 bit has
C been shifted out into carry; adc v0,v0 then computes 2*v0 + carry, putting
C that bit back so v0 ends up odd.
102L(divide_strip_y):
103	shr	v0
104	jnc	L(divide_strip_y)
105	adc	v0, v0
106
C Stash the common twos count and the odd v0 on the stack.  They are popped
C again at L(end) and L(reduced) respectively.
107	push	%eax
108	push	v0
109
110	cmp	$1, n
111	jnz	L(reduce_nby1)
112
113C Both U and V are single limbs, reduce with bmod if u0 >> v0.
C NOTE(review): the reduction below is a plain div (remainder), not a bmod
C routine.  It is skipped when v0 > (u0 >> DIV_THRES_LOG2), unsigned compare,
C in which case %eax simply keeps u0.
114	mov	(up), %ecx
115	mov	%ecx, %eax
116	shr	$DIV_THRES_LOG2, %ecx
117	cmp	%ecx, v0
118	ja	L(reduced)
119
C div needs the divisor outside %edx:%eax, so move v0 (held in %edx) to %esi
C and zero the high dividend word.  n, previously in %esi, is dead on this
C path.  The remainder from %edx becomes x in %eax.
120	mov	v0, %esi
121	xor	%edx, %edx
122	div	%esi
123	mov	%edx, %eax
124	jmp	L(reduced)
125
C Multi-limb U: reduce it against the odd v0 with a library call.  In
C PIC_WITH_EBX builds %ebx is saved and loaded with the GOT address first.
126L(reduce_nby1):
127ifdef(`PIC_WITH_EBX',`
128	push	%ebx
129	call	L(movl_eip_to_ebx)
130	add	$_GLOBAL_OFFSET_TABLE_, %ebx
131')
C Arguments are pushed last-to-first, giving the call (up, n, v0).
132	push	v0			C param 3
133	push	n			C param 2
134	push	up			C param 1
C Signed compare: sizes below BMOD_1_TO_MOD_1_THRESHOLD use
C mpn_modexact_1_odd, all others use mpn_mod_1.
135	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
136	jl	L(bmod)
137	CALL(	mpn_mod_1)
138	jmp	L(called)
139L(bmod):
140	CALL(	mpn_modexact_1_odd)
141
C Either way the value left in %eax by the callee is used as x below.
142L(called):
143	add	$12, %esp		C deallocate params
144ifdef(`PIC_WITH_EBX',`
145	pop	%ebx
146')
C Common continuation: %eax = x (reduced U).  Reload the odd v0 pushed
C earlier into %edx as y; %edx may have been overwritten by div or a call.
147L(reduced):
148	pop	%edx
149
C %esi = address of ctz_table.  If x == 0 the answer is y itself, so skip the
C loop; otherwise enter at L(mid) with %ecx = x for the table lookup.
150	LEA(	ctz_table, %esi)
151	test	%eax, %eax
152	mov	%eax, %ecx
153	jnz	L(mid)
154	jmp	L(end)
155
C Binary GCD loop.
C At L(mid): %eax = x (non-zero), %edx = y (odd), %esi = ctz_table, and %ecx
C is either x or -x.  A value and its negation have the same number of
C trailing zeros, so looking up %ecx AND MASK is valid in both cases.
C The jz after movzbl tests the flags set by the and; movzbl does not modify
C flags.  Zero means the low MAXSHIFT bits are all clear, see L(shift_alot).
C Otherwise x is shifted right to make it odd, then both y-x (in %ecx) and
C x-y (in %eax) are formed.  If x == y the loop ends with the result in %edx.
C At L(top) the borrow from x-y selects %eax = y-x and %edx = old x when
C x < y, so that %eax = |x-y| and %edx = min(x,y) for the next pass.
156	ALIGN(16)			C               K8    BC    P4    NHM   SBR
157L(top):	cmovc(	%ecx, %eax)		C if x-y < 0	0
158	cmovc(	%edi, %edx)		C use x,y-x	0
159L(mid):	and	$MASK, %ecx		C		0
160	movzbl	(%esi,%ecx), %ecx	C		1
161	jz	L(shift_alot)		C		1
162	shr	%cl, %eax		C		3
163	mov	%eax, %edi		C		4
164	mov	%edx, %ecx		C		3
165	sub	%eax, %ecx		C		4
166	sub	%edx, %eax		C		4
167	jnz	L(top)			C		5
168
C Done: %edx holds the odd part of the result.  Pop the common twos count
C pushed earlier, shift the result left by it, restore %esi and %edi, return.
169L(end):	pop	%ecx
170	mov	%edx, %eax
171	shl	%cl, %eax
172	pop	%esi
173	pop	%edi
174	ret
175
C Out-of-line path for when the low MAXSHIFT bits of x are all zero: shift x
C right by MAXSHIFT and retry the table lookup with the shifted value.
176L(shift_alot):
177	shr	$MAXSHIFT, %eax
178	mov	%eax, %ecx
179	jmp	L(mid)
180
C PIC helper: copies the word at the top of the stack, which is the return
C address of the call that reached here, into %ebx.
181ifdef(`PIC_WITH_EBX',`
182L(movl_eip_to_ebx):
183	mov	(%esp), %ebx
184	ret
185')
186EPILOGUE()
187