dnl  AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core	 4.0		4.18-4.25
C Intel NHM	 3.75		4.06-4.2
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C   * Implement proper cor2, replacing current cor0.
C   * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
C   * Micro-optimise.
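
C For reference, the operation implemented here is {rp,n} = {up,n} * {vp,n}
C mod B^n, where B = 2^64, i.e. only the n least significant limbs of the
C product are produced.  A minimal C sketch of that semantic (an illustration
C only, not GMP's actual code; the name ref_mullo, uint64_t limbs and
C unsigned __int128 products are assumptions of the sketch):
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	static void
C	ref_mullo (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
C		   size_t n)
C	{
C	  size_t i, j;
C	  for (i = 0; i < n; i++)
C	    rp[i] = 0;
C	  for (i = 0; i < n; i++)
C	    {
C	      uint64_t cy = 0;
C	      for (j = 0; i + j < n; j++)  /* only limb positions < n;
C					      the final carry out is dropped */
C	        {
C	          unsigned __int128 t = (unsigned __int128) up[i] * vp[j]
C	                                + rp[i + j] + cy;
C	          rp[i + j] = (uint64_t) t;
C	          cy = (uint64_t) (t >> 64);
C	        }
C	    }
C	}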

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
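C As defined, `I(-8(rp),-8(rp,i,8))' expands to the plain `-8(rp)' form;
C redefining `I' to `$2' instead selects the conservative indexed form
C `-8(rp,i,8)', which remains correct while experimenting with the pointer
C updates in the wind-down code.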

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n_param',  `%rcx')
define(`n_param8', `%cl')

define(`v0',       `%r10')
define(`v1',       `%r11')
define(`w0',       `%rbx')
define(`w032',     `%ebx')
define(`w1',       `%rcx')
define(`w132',     `%ecx')
define(`w2',       `%rbp')
define(`w232',     `%ebp')
define(`w3',       `%r12')
define(`w332',     `%r12d')
define(`n',        `%r9')
define(`n32',      `%r9d')
define(`n8',       `%r9b')
define(`i',        `%r13')
define(`vp',       `%r8')

define(`X0',       `%r14')
define(`X1',       `%r15')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
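C `N' is a bit mask that picks, per call site, between two equivalent
C encodings of a register copy.  For example, with `N' = 85 (binary
C 1010101), `MOV(	%rdx, w3, 1)' expands to `lea	(%rdx), w3' since
C 85 & 1 is nonzero, while `MOV(	%rdx, X0, 8)' expands to
C `mov	%rdx, X0' since 85 & 8 is 0.  Flipping bits of `N' thus toggles
C individual sites between mov and lea, presumably so each site can be
C tuned to whichever encoding schedules better.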

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)

	mov	(up), %rax
	mov	vp_param, vp

	cmp	$4, n_param
	jb	L(small)

	mov	(vp_param), v0
	push	%rbx
	lea	(rp,n_param,8), rp	C point rp at R[un]
	push	%rbp
	lea	(up,n_param,8), up	C point up right after U's end
	push	%r12
	mov	$0, n32		C FIXME
	sub	n_param, n
	push	%r13
	mul	v0
	mov	8(vp), v1

	test	$1, n_param8
	jnz	L(m2x1)

L(m2x0):test	$2, n_param8
	jnz	L(m2b2)

L(m2b0):lea	(n), i
	mov	%rax, (rp,n,8)
	mov	%rdx, w1
	mov	(up,n,8), %rax
	xor	w232, w232
	jmp	L(m2e0)

L(m2b2):lea	-2(n), i
	mov	%rax, w2
	mov	(up,n,8), %rax
	mov	%rdx, w3
	xor	w032, w032
	jmp	L(m2e2)

L(m2x1):test	$2, n_param8
	jnz	L(m2b3)

L(m2b1):lea	1(n), i
	mov	%rax, (rp,n,8)
	mov	(up,n,8), %rax
	mov	%rdx, w0
	xor	w132, w132
	jmp	L(m2e1)

L(m2b3):lea	-1(n), i
	xor	w332, w332
	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax
	jmp	L(m2e3)

	ALIGNx
L(m2tp):mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, w132
L(m2e1):mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, w232
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, w232
L(m2e0):mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, w332
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, w332
	mov	8(up,i,8), %rax
L(m2e3):mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, w032
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, w032
L(m2e2):mul	v1
	mov	$0, w132		C FIXME: dead in last iteration
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0		C FIXME: dead in last iteration
	add	$4, i
	js	L(m2tp)

L(m2ed):imul	v0, %rax
	add	w3, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jge	L(cor1)

	push	%r14
	push	%r15

L(outer):
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,n,8), %rax
	mul	v0
	test	$1, n8
	jnz	L(a1x1)

L(a1x0):mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, n8
	jnz	L(a110)

L(a100):lea	(n), i
	mov	(rp,n,8), w3
	mov	%rax, w0
	MOV(	%rdx, w1, 16)
	jmp	L(lo0)

L(a110):lea	2(n), i
	mov	(rp,n,8), w1
	mov	%rax, w2
	mov	8(up,n,8), %rax
	MOV(	%rdx, w3, 1)
	jmp	L(lo2)

L(a1x1):mov	%rax, X0
	MOV(	%rdx, X1, 2)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, n8
	jz	L(a111)

L(a101):lea	1(n), i
	MOV(	%rdx, w0, 4)
	mov	(rp,n,8), w2
	mov	%rax, w3
	jmp	L(lo1)

L(a111):lea	-1(n), i
	MOV(	%rdx, w2, 64)
	mov	%rax, w1
	mov	(rp,n,8), w0
	mov	8(up,n,8), %rax
	jmp	L(lo3)

	ALIGNx
L(top):	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	-8(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo2):	mul	v0
	add	w1, X1
	mov	X1, -16(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	-8(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	-8(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo1):	mov	(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, -8(rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	(up,i,8), %rax
	mov	(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, (rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	8(rp,i,8), w3
	adc	$0, X1
	mov	8(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	16(up,i,8), %rax
	adc	$0, w2
L(lo3):	mul	v0
	add	w0, X0
	mov	X0, 8(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	16(up,i,8), %rax
	mov	16(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(top)

L(end):	imul	v1, %rax
	add	w0, w1
	adc	%rax, w2
	mov	I(-8(up),-8(up,i,8)), %rax
	imul	v0, %rax
	add	w1, X1
	mov	X1, I(-16(rp),-16(rp,i,8))
	adc	X0, %rax
	mov	I(-8(rp),-8(rp,i,8)), w1
	add	w1, w2
	add	w2, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jl	L(outer)

	pop	%r15
	pop	%r14

	jnz	L(cor0)

L(cor1):mov	(vp), v0
	mov	8(vp), v1
	mov	-16(up), %rax
	mul	v0			C u0 x v2
	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
	mov	-8(up), %rbx
	imul	v0, %rbx
	mov	-16(up), %rcx
	imul	v1, %rcx
	mov	%rax, -16(rp)
	add	%rbx, %rcx
	add	%rdx, %rcx
	mov	%rcx, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

L(cor0):mov	(vp), %r11
	imul	-8(up), %r11
	add	%rax, %r11
	mov	%r11, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

	ALIGN(16)
L(small):
	cmp	$2, n_param
	jae	L(gt1)
L(n1):	imul	(vp_param), %rax
	mov	%rax, (rp)
	ret
L(gt1):	ja	L(gt2)
L(n2):	mov	(vp_param), %r9
	mul	%r9
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	ret
L(gt2):
L(n3):	mov	(vp_param), %r9
	mul	%r9		C u0 x v0
	mov	%rax, (rp)
	mov	%rdx, %r10
	mov	8(up), %rax
	mul	%r9		C u1 x v0
	imul	16(up), %r9	C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rax
	mul	%r11		C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11	C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10	C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	ret
EPILOGUE()