1dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P4: 19.0 cycles/limb
35
36
37C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
38C                               mp_limb_t divisor);
39C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40C                                mp_limb_t divisor, mp_limb_t carry);
41C
42
43defframe(PARAM_CARRY,  16)
44defframe(PARAM_DIVISOR,12)
45defframe(PARAM_SIZE,   8)
46defframe(PARAM_SRC,    4)
47
48	TEXT
49
50	ALIGN(16)
51PROLOGUE(mpn_modexact_1c_odd)
52deflit(`FRAME',0)
53
54	movd	PARAM_CARRY, %mm1
55	jmp	L(start_1c)
56
57EPILOGUE()
58
59
60	ALIGN(16)
61PROLOGUE(mpn_modexact_1_odd)
62deflit(`FRAME',0)
63
64	pxor	%mm1, %mm1		C carry limb
65L(start_1c):
66	movl	PARAM_DIVISOR, %eax
67
68	movd	PARAM_DIVISOR, %mm7
69
70	shrl	%eax
71
72	andl	$127, %eax		C d/2, 7 bits
73
74ifdef(`PIC',`
75	LEA(	binvert_limb_table, %edx)
76	movzbl	(%eax,%edx), %eax		C inv 8 bits
77',`
78	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
79')
80
81	C
82
83	movd	%eax, %mm6		C inv
84
85	movd	%eax, %mm0		C inv
86
87	pmuludq	%mm6, %mm6		C inv*inv
88
89	C
90
91	pmuludq	%mm7, %mm6		C inv*inv*d
92	paddd	%mm0, %mm0		C 2*inv
93
94	C
95
96	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
97	pxor	%mm6, %mm6
98
99	paddd	%mm0, %mm6
100	pmuludq	%mm0, %mm0		C inv*inv
101
102	C
103
104	pmuludq	%mm7, %mm0		C inv*inv*d
105	paddd	%mm6, %mm6		C 2*inv
106
107
108	movl	PARAM_SRC, %eax
109	movl	PARAM_SIZE, %ecx
110
111	C
112
113	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
114
115	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
116	pushl	%eax	FRAME_pushl()
117	movd	%mm6, %eax
118	imul	PARAM_DIVISOR, %eax
119	cmpl	$1, %eax
120	popl	%eax	FRAME_popl()')
121
122	pxor	%mm0, %mm0		C carry bit
123
124
125C The dependent chain here is as follows.
126C
127C					latency
128C	psubq	 s = (src-cbit) - climb	   2
129C	pmuludq	 q = s*inverse		   8
130C	pmuludq	 prod = q*divisor	   8
131C	psrlq	 climb = high(prod)	   2
132C					  --
133C					  20
134C
135C Yet the loop measures 19.0 c/l, so obviously there's something gained
136C there over a straight reading of the chip documentation.
137
138L(top):
139	C eax	src, incrementing
140	C ebx
141	C ecx	counter, limbs
142	C edx
143	C
144	C mm0	carry bit
145	C mm1	carry limb
146	C mm6	inverse
147	C mm7	divisor
148
149	movd	(%eax), %mm2
150	addl	$4, %eax
151
152	psubq	%mm0, %mm2		C src - cbit
153
154	psubq	%mm1, %mm2		C src - cbit - climb
155	movq	%mm2, %mm0
156	psrlq	$63, %mm0		C new cbit
157
158	pmuludq	%mm6, %mm2		C s*inverse
159
160	movq	%mm7, %mm1
161	pmuludq	%mm2, %mm1		C q*divisor
162	psrlq	$32, %mm1		C new climb
163
164	subl	$1, %ecx
165	jnz	L(top)
166
167
168L(done):
169	paddq	%mm1, %mm0
170	movd	%mm0, %eax
171	emms
172	ret
173
174EPILOGUE()
175ASM_END()
176