dnl  X86_64 mpn_divrem_hensel_qr_1_2

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	(rdi,rdx) = (rsi,rdx) / rcx,  rdx >= 1, rcx odd
C	rax = Hensel remainder from the division

C	This is divrem_hensel_1 processing two limbs at a time; it is the
C	same as the shifting version but with no shifting applied to the
C	quotient. On K8/K10 the shifting comes for free, so no separate
C	function is needed there, and this code runs at 10c/l, which is
C	optimal. This function "replaces" divexact_1 and modexact_1_odd.
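
C	As a rough guide, a C sketch of the one-limb-at-a-time recurrence
C	follows; this file does two limbs per pass with a 128-bit inverse.
C	The function name is illustrative (not MPIR API), __int128 support
C	is assumed, and the sign convention of the returned remainder may
C	differ from the asm below.
C
C	#include <stdint.h>     /* and <stddef.h> for size_t */
C
C	uint64_t divrem_hensel_1(uint64_t *q, const uint64_t *a,
C	                         size_t n, uint64_t d)   /* d odd */
C	{
C	    uint64_t inv = d;                /* d*d == 1 mod 8: 3 bits  */
C	    for (int i = 0; i < 5; i++)      /* 3->6->12->24->48->96    */
C	        inv *= 2 - inv * d;          /* Newton, doubles bits    */
C	    uint64_t c = 0;                  /* running borrow          */
C	    for (size_t i = 0; i < n; i++) {
C	        uint64_t s = a[i] - c;       /* adjust limb by borrow   */
C	        uint64_t b = a[i] < c;       /* borrow out              */
C	        q[i] = s * inv;              /* q[i]*d == s mod 2^64    */
C	        c = (uint64_t)(((unsigned __int128)q[i] * d) >> 64) + b;
C	    }
C	    return c;                        /* Hensel remainder        */
C	}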

ASM_START()
PROLOGUE(mpn_divrem_hensel_qr_1_2)
mov $1,%r9
sub %rdx,%r9
lea -8(%rdi,%rdx,8),%rdi
lea -8(%rsi,%rdx,8),%rsi
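C	rdi and rsi now point at the last limb of dst and src, and r9 = 1-n
C	is a negative index stepped up by 2 each pass, so the current limbs
C	are addressed as (%rsi,%r9,8) and the loop ends when r9 wraps.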

push %r12
push %r13
push %r14

mov %rcx,%rdx
C	rdx is a 3-bit inverse of the divisor
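C	For odd d, d*d == 1 mod 8, so d is its own inverse mod 8. Each
C	Newton step x <- x*(2 - x*d) doubles the number of correct low
C	bits, so the counts noted below are safe lower bounds.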

mov %rdx,%rax
imul %ecx,%edx
mov $2,%r11
sub %rdx,%r11
imul %eax,%r11d
C	r11 has 4 bits

mov %r11,%rax
imul %ecx,%r11d
mov $2,%rdx
sub %r11,%rdx
imul %eax,%edx
C	rdx has 8 bits

mov %rdx,%rax
imul %ecx,%edx
mov $2,%r11
sub %rdx,%r11
imul %eax,%r11d
C	r11 has 16 bits

mov %r11,%rax
imul %ecx,%r11d
mov $2,%rdx
sub %r11,%rdx
imul %eax,%edx
C	rdx has 32 bits

mov %rdx,%rax
imul %rcx,%rdx
mov $2,%r11
sub %rdx,%r11
imul %rax,%r11
C	r11 has 64 bits

mov %r11,%rax
mov %r11,%r12
mul %rcx
neg %rdx
imul %rdx,%r12
C	r12:r11 has 128 bits
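C	That is, I = 2^64*r12 + r11 satisfies I*d == 1 mod 2^128, where
C	r12 = -r11*hi(r11*d) mod 2^64: one more Newton-style step done
C	with a single widening multiply.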

mov %r11,%r13
mov %r12,%r14

mov (%rsi,%r9,8),%r11
mov 8(%rsi,%r9,8),%r12
mov $0,%r10
add $2,%r9
jc skiplp
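
C	Main loop, two limbs per pass: r13:r14 is the 128-bit inverse,
C	r11:r12 the current borrow-adjusted source limbs, r10 the saved
C	borrow (0 or -1). Three multiplies form the two quotient limbs
C	as (r12:r11)*I mod 2^128; the borrow into the next pair is the
C	high half of d times the top quotient limb, plus the carry from
C	comparing its low half against the saved top source limb.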
ALIGN(16)
lp:
	mov %r12,%r8
	mov %r13,%rax
	mul %r11
	mov %rax,-16(%rdi,%r9,8)
	imul %r14,%r11
	imul %r13,%r12
	add %r11,%rdx
	add %r12,%rdx
		mov (%rsi,%r9,8),%r11
		mov 8(%rsi,%r9,8),%r12
	mov %rdx,-8(%rdi,%r9,8)
	mov %rcx,%rax
	mul %rdx
		add %r10,%r10
		sbb $0,%r11
		sbb $0,%r12
		sbb %r10,%r10
	cmp %rax,%r8
		sbb %rdx,%r11
		sbb $0,%r12
		sbb $0,%r10
	add $2,%r9
	jnc lp
skiplp:
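C	On exit r9 is 0 or 1. The code below finishes the two limbs that
C	are already loaded; if n was odd (r9 == 0) one last source limb
C	remains and case1 handles it, otherwise case0 just forms the
C	remainder.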
mov %r12,%r8
mov %r13,%rax
mul %r11
mov %rax,-16(%rdi,%r9,8)
imul %r14,%r11
imul %r13,%r12
add %r11,%rdx
add %r12,%rdx
cmp $0,%r9
jne case0
case1:
		mov (%rsi,%r9,8),%r11
	mov %rdx,-8(%rdi,%r9,8)
	mov %rcx,%rax
	mul %rdx
		add %r10,%r10
		sbb $0,%r11
		sbb %r10,%r10
	cmp %rax,%r8
		sbb %rdx,%r11
		sbb $0,%r10
	mov %r11,%rax
	imul %r13,%rax
	mov %rax,(%rdi,%r9,8)
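C	Form the remainder: rax = hi(d*q_last) plus the pending borrow
C	(r10 is 0 or -1, so the adc below adds 0 or 1).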
	mul %rcx
	add %r10,%r10
	mov $0,%rax
	adc %rdx,%rax
	pop %r14
	pop %r13
	pop %r12
	ret
case0:
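C	Form the remainder: hi(d*q_top), plus the carry from comparing
C	the saved top limb against the low half of d*q_top, plus the
C	pending borrow (r10 is 0 or -1, so subtracting it adds 0 or 1).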
	mov %rdx,-8(%rdi,%r9,8)
	mov %rcx,%rax
	mul %rdx
	cmp %rax,%r8
	mov $0,%rax
	adc %rdx,%rax
	sub %r10,%rax
	pop %r14
	pop %r13
	pop %r12
	ret
EPILOGUE()