1dnl  AMD64 mpn_mulmid_basecase
2
3dnl  Based on mul_basecase.asm from GMP 4.3.1, modifications are copyright
4dnl  (C) 2009, David Harvey. The original mul_basecase.asm was released under
5dnl  LGPLv3+, license terms reproduced below. These modifications are hereby
6dnl  released under the same terms.
7
8dnl  ========= Original license terms:
9
10dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.
11
12dnl  Copyright 2008 Free Software Foundation, Inc.
13
14dnl  This file is part of the GNU MP Library.
15
16dnl  The GNU MP Library is free software; you can redistribute it and/or modify
17dnl  it under the terms of the GNU Lesser General Public License as published
18dnl  by the Free Software Foundation; either version 3 of the License, or (at
19dnl  your option) any later version.
20
21dnl  The GNU MP Library is distributed in the hope that it will be useful, but
22dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
23dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
24dnl  License for more details.
25
26dnl  You should have received a copy of the GNU Lesser General Public License
27dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
28
29dnl  ========= end license terms
30
31
32include(`../config.m4')
33
34C	     cycles/limb
35C K8,K9:	 2.375  (2.5 when un - vn is "small")
36C K10:		 ?
37C P4:		 ?
38C P6-15:	 ?
39
40C INPUT PARAMETERS
C The five arguments are named here by the registers they arrive in.
C vp_param shares rcx with the accumulator w1 defined below, so the routine
C copies it into vp before any accumulator is written.  vn32 is the low
C 32 bits of vn.
41define(`rp',      `%rdi')
42define(`up',      `%rsi')
43define(`un_param',`%rdx')
44define(`vp_param',`%rcx')
45define(`vn',      `%r8')
46define(`vn32',    `%r8d')
47
C v0 and v1 hold the multiplier limbs of the current pass; they are loaded
C from (vp) and 8(vp).
48define(`v0', `%r12')
49define(`v1', `%r9')
50
C w0..w3 are the accumulator registers that rotate through the unrolled
C loops.  The w*32 names are their 32-bit halves, used where the code
C zeroes an accumulator or adds a carry into one.
51define(`w0', `%rbx')
52define(`w1', `%rcx')
53define(`w2', `%rbp')
54define(`w3', `%r10')
55define(`w032', `%ebx')
56define(`w132', `%ecx')
57define(`w232', `%ebp')
58define(`w332', `%r10d')
59
C n          inner loop index
C outer_addr target of the indirect jump that starts the next outer pass
C un         row length (un_param - vn + 1), negated once a path is chosen
C vp         working copy of vp_param
60define(`n',  `%r11')
61define(`outer_addr', `%r14')
62define(`un',  `%r13')
63define(`un32',`%r13d')
64define(`vp',  `%r15')
65
C vp_inner shares r10 with w3.  It is referenced only in the diagonal code,
C which accumulates in w0, w1 and w2 and never touches w3.
66define(`vp_inner', `%r10')
67
68
69ASM_START()
70	TEXT
71	ALIGN(16)
C mpn_mulmid_basecase
C
C Inputs: rp, up, un_param, vp_param, vn in the registers defined above.
C NOTE(review): presumably matches the C prototype
C mpn_mulmid_basecase (rp, up, un, vp, vn) -- confirm against the header.
C
C Strategy visible below:
C   - row length = un_param - vn + 1
C   - row length < 4: accumulate along diagonals, L(diagonal)
C   - otherwise a first pass stores a row, L(mul_1) when vn is odd or
C     L(mul_2) when vn is even, and the remaining limbs of vp are consumed
C     two at a time by the addmul passes, reached through outer_addr
72PROLOGUE(mpn_mulmid_basecase)
	C save the callee-saved registers used below; L(ret) pops them in
	C reverse order
73	push	%rbx
74	push	%rbp
75	push	%r12
76	push	%r13
77	push	%r14
78	push	%r15
79
	C vp_param lives in rcx, which doubles as w1, so move it out first
80	mov	vp_param, vp
81
82	C use un for row length (= un_param - vn + 1)
83	lea	1(un_param), un
84	sub	vn, un
85
	C advance rp by the row length; every path below addresses the output
	C with zero or negative indices relative to this point, plus 8(rp)
86	lea	(rp,un,8), rp
87
	C unsigned compare: row length below 4 takes the diagonal path
88	cmp	$4, un		C FIXME: needs tuning
89	jc	L(diagonal)
90
	C row paths only: point up just past the last limb of the u operand
91	lea	(up,un_param,8), up
92
	C even vn starts with mul_2, odd vn falls through to mul_1
93	test	$1, vn
94	jz	L(mul_2)
95
96C ===========================================================
97C     mul_1 for vp[0] if vn is odd
98
C First pass for odd vn: multiplies one row of up by the single limb vp[0]
C and stores the result (no accumulation) at negative offsets from rp.
C The loop is unrolled four ways.  The row length mod 4 selects the
C prologue and loop entry point, and the matching addmul prologue is
C recorded in outer_addr for the passes that follow.
99L(mul_1):
	C copy the row length so its low two bits can be tested below
100	mov	un32, w032
101
	C un becomes the negated row length.  The first product is formed
	C here, before un is rounded, and is left in rdx:rax for the prologue.
102	neg	un
103	mov	(up,un,8), %rax
104	mov	(vp), v0
105	mul	v0
106
107	and	$-4, un		C round down to multiple of 4
108	mov	un, n
109
	C dispatch on row length mod 4: zero, below 2 (one), equal to 2,
	C otherwise fall through for three
110	and	$3, w032
111	jz	L(mul_1_prologue_0)
112	cmp	$2, w032
113	jc	L(mul_1_prologue_1)
114	jz	L(mul_1_prologue_2)
115
	C Each prologue moves the first product into the accumulator pair
	C expected at its loop entry point and sets outer_addr.
116L(mul_1_prologue_3):
117	mov	%rax, w3
118	mov	%rdx, w0
119	lea	L(addmul_prologue_3)(%rip), outer_addr
120	jmp	L(mul_1_entry_3)
121
122	ALIGN(16)
123L(mul_1_prologue_0):
	C w0 is zero on this path: it is the result of the and with 3 above
124	mov	%rax, w2
125	mov	%rdx, w3		C note already w0 == 0
126	lea	L(addmul_prologue_0)(%rip), outer_addr
127	jmp	L(mul_1_entry_0)
128
129	ALIGN(16)
130L(mul_1_prologue_1):
	C this entry point is reached just after the index would have been
	C bumped, so advance n by 4 and preload the next up limb
131	add	$4, n
132	mov	%rax, w1
133	mov	%rdx, w2
134	mov	$0, w332
135	mov	(up,n,8), %rax
136	lea	L(addmul_prologue_1)(%rip), outer_addr
137	jmp	L(mul_1_entry_1)
138
139	ALIGN(16)
140L(mul_1_prologue_2):
	C preload the up limb that the mul at L(mul_1_entry_2) consumes
141	mov	%rax, w0
142	mov	%rdx, w1
143	mov	24(up,n,8), %rax
144	mov	$0, w232
145	mov	$0, w332
146	lea	L(addmul_prologue_2)(%rip), outer_addr
147	jmp	L(mul_1_entry_2)
148
149
150	C this loop is 10 c/loop = 2.5 c/l on K8
151
	C Four products per iteration.  Each stage stores one finished limb,
	C adds the low half of the new product into the next accumulator and
	C carries the high half into the one after.  n is a negative multiple
	C of 4 counting up; the js at the bottom exits when it reaches zero.
152	ALIGN(16)
153L(mul_1_top):
154	mov	w0, -16(rp,n,8)
155	add	%rax, w1
156	mov	(up,n,8), %rax
157	adc	%rdx, w2
158L(mul_1_entry_1):
159	mov	$0, w032
160	mul	v0
161	mov	w1, -8(rp,n,8)
162	add	%rax, w2
163	adc	%rdx, w3
164L(mul_1_entry_0):
165	mov	8(up,n,8), %rax
166	mul	v0
167	mov	w2, (rp,n,8)
168	add	%rax, w3
169	adc	%rdx, w0
170L(mul_1_entry_3):
171	mov	16(up,n,8), %rax
172	mul	v0
173	mov	w3, 8(rp,n,8)
174	mov	$0, w232		C zero
175	mov	w2, w3			C zero
176	add	%rax, w0
177	mov	24(up,n,8), %rax
178	mov	w2, w1			C zero
179	adc	%rdx, w1
180L(mul_1_entry_2):
181	mul	v0
182	add	$4, n
183	js	L(mul_1_top)
184
	C wind down: store the last three limbs of the row.  w2 is still zero
	C when it is written to 8(rp); the mov instructions leave the carry
	C from the add intact for the adc that follows.
185	mov	w0, -16(rp)
186	add	%rax, w1
187	mov	w1, -8(rp)
188	mov	w2, 8(rp)		C zero last limb of output
189	adc	%rdx, w2
190	mov	w2, (rp)
191
	C one limb of vp consumed; finished if it was the only one
192	dec	vn
193	jz	L(ret)
194
	C set up the next pass: up moves back one limb, vp forward one limb
195	lea	-8(up), up
196	lea	8(vp), vp
197
	C reload the rounded index and the next two multiplier limbs, then
	C enter the addmul prologue chosen above
198	mov	un, n
199	mov	(vp), v0
200	mov	8(vp), v1
201
202	jmp	*outer_addr
203
204C ===========================================================
205C     mul_2 for vp[0], vp[1] if vn is even
206
C First pass for even vn: combines two rows, one multiplied by v0 = vp[0]
C and one by v1 = vp[1], and stores the sum (no accumulation) relative to
C rp.  The v1 products read up one limb below the v0 products.  As in
C mul_1 the loop is unrolled four ways and the row length mod 4 selects the
C prologue, the loop entry point and the addmul prologue kept in outer_addr.
207	ALIGN(16)
208L(mul_2):
	C copy the row length so its low two bits can be tested below
209	mov	un32, w032
210
	C un becomes the negated row length.  The first product, a v1 term,
	C is formed before un is rounded and is left in rdx:rax.
211	neg	un
212	mov	-8(up,un,8), %rax
213	mov	(vp), v0
214	mov	8(vp), v1
215	mul	v1
216
217	and	$-4, un		C round down to multiple of 4
218	mov	un, n
219
	C dispatch on row length mod 4: zero, below 2 (one), equal to 2,
	C otherwise fall through for three
220	and	$3, w032
221	jz	L(mul_2_prologue_0)
222	cmp	$2, w032
223	jc	L(mul_2_prologue_1)
224	jz	L(mul_2_prologue_2)
225
	C Each prologue moves the first product into the accumulator pair
	C expected at its loop entry point, zeroes any accumulator that the
	C entry point does not clear itself, and sets outer_addr.
226L(mul_2_prologue_3):
227	mov	%rax, w1
228	mov	%rdx, w2
229	lea	L(addmul_prologue_3)(%rip), outer_addr
230	jmp	L(mul_2_entry_3)
231
232	ALIGN(16)
233L(mul_2_prologue_0):
234	mov	%rax, w0
235	mov	%rdx, w1
236	lea	L(addmul_prologue_0)(%rip), outer_addr
237	jmp	L(mul_2_entry_0)
238
239	ALIGN(16)
240L(mul_2_prologue_1):
241	mov	%rax, w3
242	mov	%rdx, w0
243	mov	$0, w132
244	lea	L(addmul_prologue_1)(%rip), outer_addr
245	jmp	L(mul_2_entry_1)
246
247	ALIGN(16)
248L(mul_2_prologue_2):
	C preload the up limb that the mul at L(mul_2_entry_2) consumes
249	mov	%rax, w2
250	mov	%rdx, w3
251	mov	$0, w032
252	mov	16(up,n,8), %rax
253	lea	L(addmul_prologue_2)(%rip), outer_addr
254	jmp	L(mul_2_entry_2)
255
256
257	C this loop is 18 c/loop = 2.25 c/l on K8
258
	C Four output limbs and eight products per iteration.  Each stage adds
	C a v0 product and a v1 product into a three-register window of the
	C accumulators, then stores the finished low limb.  n is a negative
	C multiple of 4 counting up; the jnz at the bottom exits at zero.
259	ALIGN(16)
260L(mul_2_top):
261	mov     -8(up,n,8), %rax
262	mul     v1
263	add     %rax, w0
264	adc     %rdx, w1
265L(mul_2_entry_0):
266	mov     $0, w232
267	mov     (up,n,8), %rax
268	mul     v0
269	add     %rax, w0
270	mov     (up,n,8), %rax
271	adc     %rdx, w1
272	adc     $0, w232
273	mul     v1
274	add     %rax, w1
275	mov     w0, (rp,n,8)
276	adc     %rdx, w2
277L(mul_2_entry_3):
278	mov     8(up,n,8), %rax
279	mul     v0
280	mov     $0, w332
281	add     %rax, w1
282	adc     %rdx, w2
283	mov     $0, w032
284	adc     $0, w332
285	mov     8(up,n,8), %rax
286	mov     w1, 8(rp,n,8)
287	mul     v1
288	add     %rax, w2
289	mov     16(up,n,8), %rax
290	adc     %rdx, w3
291L(mul_2_entry_2):
292	mov     $0, w132
293	mul     v0
294	add     %rax, w2
295	mov     16(up,n,8), %rax
296	adc     %rdx, w3
297	adc     $0, w032
298	mul     v1
299	add     %rax, w3
300	mov     w2, 16(rp,n,8)
301	adc     %rdx, w0
302L(mul_2_entry_1):
303	mov     24(up,n,8), %rax
304	mul     v0
305	add     %rax, w3
306	adc     %rdx, w0
307	adc     $0, w132
308	add     $4, n
309	mov     w3, -8(rp,n,8)
310	jnz     L(mul_2_top)
311
	C wind down: the two limbs still held in the accumulators become the
	C top two limbs of the output
312	mov	w0, (rp)
313	mov	w1, 8(rp)
314
	C two limbs of vp consumed; finished if none remain
315	sub	$2, vn
316	jz	L(ret)
317
	C set up the next pass: vp moves forward two limbs, up back two limbs
318	lea	16(vp), vp
319	lea	-16(up), up
320
	C reload the rounded index and the next two multiplier limbs, then
	C enter the addmul prologue chosen above
321	mov	un, n
322	mov	(vp), v0
323	mov	8(vp), v1
324
325	jmp	*outer_addr
326
327C ===========================================================
328C     addmul_2 for remaining vp's
329
C Every pass after the first enters here through jmp *outer_addr, with v0
C and v1 holding the next two limbs of vp and n reloaded from the rounded
C un.  Products are formed as in mul_2 but are added into the limbs already
C stored at rp (add and adc with a memory destination) instead of
C overwriting them.  One prologue exists per row length mod 4; the first
C pass chose which one outer_addr points to.
C
C Each prologue forms the first v1 product, places it in the accumulator
C pair expected at its loop entry point and zeroes the accumulators that
C entry point relies on being clear.
330	ALIGN(16)
331L(addmul_prologue_0):
332	mov	-8(up,n,8), %rax
333	mul	v1
334	mov	%rax, w1
335	mov	%rdx, w2
336	mov	$0, w332
337	jmp	L(addmul_entry_0)
338
339	ALIGN(16)
340L(addmul_prologue_1):
	C also preloads the up limb that the mul at L(addmul_entry_1) consumes
341	mov	16(up,n,8), %rax
342	mul	v1
343	mov	%rax, w0
344	mov	%rdx, w1
345	mov	$0, w232
346	mov	24(up,n,8), %rax
347	jmp	L(addmul_entry_1)
348
349	ALIGN(16)
350L(addmul_prologue_2):
351	mov	8(up,n,8), %rax
352	mul	v1
353	mov	%rax, w3
354	mov	%rdx, w0
355	mov	$0, w132
356	jmp	L(addmul_entry_2)
357
358	ALIGN(16)
359L(addmul_prologue_3):
360	mov	(up,n,8), %rax
361	mul	v1
362	mov	%rax, w2
363	mov	%rdx, w3
364	mov	$0, w032
365	mov	$0, w132
366	jmp	L(addmul_entry_3)
367
368	C this loop is 19 c/loop = 2.375 c/l on K8
369
	C Four output limbs and eight products per iteration.  Each stage adds
	C a v0 product into the accumulator window, adds the finished low limb
	C into memory, and continues the carry chain from that memory add
	C straight into the following v1 product.  n is a negative multiple of
	C 4 counting up; the jnz at the bottom exits at zero.
370	ALIGN(16)
371L(addmul_top):
372	mov	$0, w332
373	add	%rax, w0
374	mov	-8(up,n,8), %rax
375	adc	%rdx, w1
376	adc	$0, w232
377	mul	v1
378	add	w0, -8(rp,n,8)
379	adc	%rax, w1
380	adc	%rdx, w2
381L(addmul_entry_0):
382	mov	(up,n,8), %rax
383	mul	v0
384	add	%rax, w1
385	mov	(up,n,8), %rax
386	adc	%rdx, w2
387	adc	$0, w332
388	mul	v1
389	add	w1, (rp,n,8)
390	mov	$0, w132
391	adc	%rax, w2
392	mov	$0, w032
393	adc	%rdx, w3
394L(addmul_entry_3):
395	mov	8(up,n,8), %rax
396	mul	v0
397	add	%rax, w2
398	mov	8(up,n,8), %rax
399	adc	%rdx, w3
400	adc	$0, w032
401	mul	v1
402	add	w2, 8(rp,n,8)
403	adc	%rax, w3
404	adc	%rdx, w0
405L(addmul_entry_2):
406	mov	16(up,n,8), %rax
407	mul	v0
408	add	%rax, w3
409	mov	16(up,n,8), %rax
410	adc	%rdx, w0
411	adc	$0, w132
412	mul	v1
413	add	w3, 16(rp,n,8)
	C NOTE(review): the nop is presumably a scheduling or alignment filler
	C for the K8 timing quoted above -- confirm before removing it
414	nop			C don't ask...
415	adc	%rax, w0
416	mov	$0, w232
417	mov	24(up,n,8), %rax
418	adc	%rdx, w1
419L(addmul_entry_1):
420	mul	v0
421	add	$4, n
422	jnz	L(addmul_top)
423
	C wind down: fold the last v0 product into the accumulators
424	add	%rax, w0
425	adc	%rdx, w1
426	adc	$0, w232
427
	C and add the three remaining limbs into the top of the output,
	C propagating the carry through memory
428	add	w0, -8(rp)
429	adc	w1, (rp)
430	adc	w2, 8(rp)
431
	C two limbs of vp consumed; finished if none remain
432	sub	$2, vn
433	jz	L(ret)
434
	C set up the next pass: vp moves forward two limbs, up back two limbs
435	lea	16(vp), vp
436	lea	-16(up), up
437
	C reload the rounded index and the next two multiplier limbs, then
	C re-enter the same addmul prologue
438	mov	un, n
439	mov	(vp), v0
440	mov	8(vp), v1
441
442	jmp	*outer_addr
443
444C ===========================================================
445C     accumulate along diagonals if un - vn is small
446
C Taken when the row length is below 4.  Here up is still the original
C pointer (it is only advanced on the row paths).  Each outer iteration
C produces one output limb: the inner loop sums products of a descending
C up limb with an ascending vp limb into the three-limb accumulator
C w0, w1, w2.  The inner loop is unrolled four ways, so vn is rounded up to
C a multiple of 4 and vp is biased so that the chosen entry point addresses
C the original vp[0].
447	ALIGN(16)
448L(diagonal):
	C clear the accumulator
449	xor	w032, w032
450	xor	w132, w132
451	xor	w232, w232
452
	C un becomes the negated row length; it counts up to zero as output
	C limbs are stored at (rp,un,8)
453	neg	un
454
	C dispatch on vn mod 4: zero, below 2 (one), equal to 2, otherwise
	C fall through for three
455	mov	vn32, %eax
456	and	$3, %eax
457	jz	L(diag_prologue_0)
458	cmp	$2, %eax
459	jc	L(diag_prologue_1)
460	jz	L(diag_prologue_2)
461
462L(diag_prologue_3):
	C bias vp down one limb so 8(vp_inner) at L(diag_entry_3) is the
	C original vp[0].  That entry point loads rax itself, so later outer
	C iterations can re-enter at the label directly.
463	lea	-8(vp), vp
464	mov	vp, vp_inner
465	add	$1, vn
466	mov	vn, n
467	lea	L(diag_entry_3)(%rip), outer_addr
468	jmp	L(diag_entry_3)
469
470L(diag_prologue_0):
	C lea 0(%rip) yields the address of the instruction that follows it,
	C so jmp *outer_addr re-executes the preload of rax and the jump into
	C the loop on every later outer iteration
471	mov	vp, vp_inner
472	mov	vn, n
473	lea	0(%rip), outer_addr
474	mov     -8(up,n,8), %rax
475	jmp	L(diag_entry_0)
476
477L(diag_prologue_1):
	C bias vp up one limb so -8(vp_inner) is the original vp[0]; same
	C lea 0(%rip) re-entry scheme as in L(diag_prologue_0)
478	lea	8(vp), vp
479	mov	vp, vp_inner
480	add	$3, vn
481	mov	vn, n
482	lea	0(%rip), outer_addr
483	mov     -8(vp_inner), %rax
484	jmp	L(diag_entry_1)
485
486L(diag_prologue_2):
	C bias vp down two limbs so 16(vp_inner) is the original vp[0]; same
	C lea 0(%rip) re-entry scheme as in L(diag_prologue_0)
487	lea	-16(vp), vp
488	mov	vp, vp_inner
489	add	$2, vn
490	mov	vn, n
491	lea	0(%rip), outer_addr
492	mov	16(vp_inner), %rax
493	jmp	L(diag_entry_2)
494
495
496	C this loop is 10 c/loop = 2.5 c/l on K8
497
	C Four products per iteration, each added into w0, w1, w2 with carry.
	C Two stages load the up limb and multiply by a vp limb in memory; the
	C other two load the vp limb and multiply by an up limb in memory.
	C n counts down by 4 to zero and vp_inner advances four limbs per
	C iteration.
498	ALIGN(16)
499L(diag_top):
500	add     %rax, w0
501	adc     %rdx, w1
502	mov     -8(up,n,8), %rax
503	adc     $0, w2
504L(diag_entry_0):
505	mulq    (vp_inner)
506	add     %rax, w0
507	adc     %rdx, w1
508	adc     $0, w2
509L(diag_entry_3):
510	mov     -16(up,n,8), %rax
511	mulq    8(vp_inner)
512	add     %rax, w0
513	mov     16(vp_inner), %rax
514	adc     %rdx, w1
515	adc     $0, w2
516L(diag_entry_2):
517	mulq    -24(up,n,8)
518	add     %rax, w0
519	mov     24(vp_inner), %rax
520	adc     %rdx, w1
521	lea     32(vp_inner), vp_inner
522	adc     $0, w2
523L(diag_entry_1):
524	mulq    -32(up,n,8)
525	sub     $4, n
526	jnz	L(diag_top)
527
	C fold in the last product of this diagonal
528	add	%rax, w0
529	adc	%rdx, w1
530	adc	$0, w2
531
	C the low accumulator limb is a finished output limb
532	mov	w0, (rp,un,8)
533
	C un reaches zero after the last diagonal
534	inc	un
535	jz	L(diag_end)
536
	C next diagonal: restart the inner counters from the rounded vn and
	C the biased vp, slide up forward one limb and shift the accumulator
	C down one limb
537	mov	vn, n
538	mov	vp, vp_inner
539
540	lea	8(up), up
541	mov	w1, w0
542	mov	w2, w1
543	xor	w232, w232
544
545	jmp	*outer_addr
546
547L(diag_end):
	C the two limbs left in the accumulator are the top of the output
548	mov	w1, (rp)
549	mov	w2, 8(rp)
550
	C common exit for every path: restore the registers pushed at entry,
	C in reverse order
551L(ret):	pop	%r15
552	pop	%r14
553	pop	%r13
554	pop	%r12
555	pop	%rbp
556	pop	%rbx
557	ret
558
559EPILOGUE()
560