dnl  AMD64 mpn_sqr_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.5
C AMD K10	 4.5
C AMD bd1	 4.75
C AMD bobcat	 5
C Intel P4	17.7
C Intel core2	 5.5
C Intel NHM	 5.43
C Intel SBR	 3.92
C Intel atom	23
C VIA nano	 5.63

C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.
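C
C For orientation only, the scheme corresponds roughly to the C sketch below.
C It is an illustration, not what this file assembles to: it relies on the
C documented mpn_mul_1/mpn_addmul_1/mpn_lshift/mpn_add semantics, the helper
C name is made up, and the un < 4 cases as well as the fused final pass are
C handled differently in the code that follows.
C
C   #include <gmp.h>
C
C   static void
C   sqr_basecase_sketch (mp_ptr rp, mp_srcptr up, mp_size_t n)   /* n >= 2 */
C   {
C     mp_size_t i;
C     /* Triangle of cross products up[i]*up[j], i < j, at limb positions i+j. */
C     rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
C     for (i = 1; i < n - 1; i++)
C       rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
C     /* Square = 2 * triangle + diagonal squares up[i]^2 at positions 2*i.
C        The sqr_diag_addlsh1 block at the end of this file fuses these steps. */
C     rp[2*n - 1] = mpn_lshift (rp + 1, rp + 1, 2*n - 2, 1);
C     rp[0] = 0;
C     for (i = 0; i < n; i++)
C       {
C         mp_limb_t sq[2];
C         sq[1] = mpn_mul_1 (sq, up + i, 1, up[i]);   /* up[i]^2, two limbs */
C         mpn_add (rp + 2*i, rp + 2*i, 2*n - 2*i, sq, 2);
C       }
C   }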

C TODO
C  * Tune un < 4 code.
C  * Perhaps implement a larger final corner (it is now 2 x 1).
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 wind-down code, perhaps with no added overhead.
C  * Are the ALIGN(16) directives really necessary?  They add about 25 bytes
C    of padding.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',              `%rdi')
define(`up',              `%rsi')
define(`un_param',        `%rdx')
C Standard allocations
define(`un',              `%rbx')
define(`w0',              `%r8')
define(`w1',              `%r9')
define(`w2',              `%r10')
define(`w3',              `%r11')
define(`n',               `%rbp')
define(`v0',              `%rcx')

C Temp macro allowing control over indexing.
C Define it to return $1 for more conservative pointer handling.
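C For example, X(-8(rp,n,8),(rp)) selects the plain (rp) operand under the
C default definition below, and the indexed -8(rp,n,8) form under the
C commented-out alternative.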
define(`X',`$2')
dnl define(`X',`$1')


ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	mov	(up), %rax

	cmp	$2, R32(un_param)
	jae	L(ge2)

	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(ge2):	mov	(up), v0
	jnz	L(g2)

	mul	%rax
	mov	%rax, (rp)
	mov	8(up), %rax
	mov	%rdx, w0
	mul	v0
	add	%rax, w0
	mov	%rdx, w1
	adc	$0, w1
	mov	8(up), v0
	mov	(up), %rax
	mul	v0
	add	%rax, w0
	mov	w0, 8(rp)
	mov	%rdx, w0		C CAUTION: r8 realloc
	adc	$0, w0
	mov	8(up), %rax
	mul	v0
	add	w1, w0
	adc	$0, %rdx
	add	w0, %rax
	adc	$0, %rdx
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

L(g2):	cmp	$3, R32(un_param)
	ja	L(g3)
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	(up), v0
	mov	8(up), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	16(up), %rax
	mul	v0
	xor	R32(w2), R32(w2)
	add	%rax, w1
	adc	%rdx, w2

	mov	8(up), v0
	mov	16(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, w2
	adc	%rdx, w3
	add	w0, w0
	adc	w1, w1
	adc	w2, w2
	adc	w3, w3
	mov	$0, R32(v0)
	adc	v0, v0
	add	w0, 8(rp)
	adc	w1, 16(rp)
	adc	w2, 24(rp)
	adc	w3, 32(rp)
	adc	v0, 40(rp)
	FUNC_EXIT()
	ret

L(g3):	push	%rbx
	push	%rbp

	mov	8(up), %rax
	lea	-24(rp,un_param,8), rp
	lea	-24(up,un_param,8), up
	neg	un_param
	push	un_param		C for sqr_diag_addlsh1
	lea	(un_param), un
	lea	3(un_param), n

	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(top):	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, (rp,n,8)
	add	w1, w2
	adc	$0, w3
L(L3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, 8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(top)

	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3

	test	n, n
	jz	L(r2)
	cmp	$2, R32(n)
	ja	L(r3)
	jz	L(r0)


L(r1):	mov	X((up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),8(rp))
	add	w1, w2
	adc	$0, w3
	mov	w2, X(8(rp,n,8),16(rp))
	mov	w3, X(16(rp,n,8),24(rp))
	add	$5, un
	jmp	L(to0)

L(r2):	mov	X((up,n,8),(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),-8(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),(rp))
	add	w1, w2
	adc	$0, w3
	mov	X(16(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X(16(rp,n,8),16(rp))
	adc	$0, w3
	mov	w1, X(24(rp,n,8),24(rp))
	add	$6, un
	jmp	L(to1)

L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
	mov	w3, X((rp,n,8),24(rp))
	add	$3, un
	jmp	L(to2)

L(r0):	mov	X((up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X((rp,n,8),16(rp))
	mov	w1, X(8(rp,n,8),24(rp))
	add	$4, un
C	jmp	L(to3)
C fall through into main loop


L(outer):
	mov	un, n
	mov	(up,un,8), v0
	mov	8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al3)

	ALIGN(16)
L(ta3):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta3)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to2):	mov	un, n
	cmp	$-4, R32(un)
	jnc	L(end)
	add	$4, un
	mov	8(up,n,8), v0
	mov	16(up,n,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al2)

	ALIGN(16)
L(ta2):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al2):	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta2)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to1):	mov	un, n
	mov	-16(up,un,8), v0
	mov	-8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al1)

	ALIGN(16)
L(ta1):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al1):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta1)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to0):	mov	un, n
	mov	-8(up,un,8), v0
	mov	(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al0)

	ALIGN(16)
L(ta0):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al0):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta0)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(outer)


L(end):	mov	X(8(up,un,8),(up)), v0
	mov	X(16(up,un,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, X(24(rp,un,8),16(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(32(rp,un,8),24(rp))
	adc	$0, w3
	mov	X(16(up,un,8),8(up)), v0
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	add	%rax, w3
	mov	w3, X(40(rp,un,8),32(rp))
	adc	$0, %rdx
	mov	%rdx, X(48(rp,un,8),40(rp))


C sqr_diag_addlsh1
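C
C What this final pass computes, as a sketch: with B the limb base (2^64 here)
C and T the cross-product triangle accumulated above in rp[1 .. 2n-2],
C
C   {rp, 2n} = 2*T + sum(i = 0 .. n-1) up[i]^2 * B^(2*i)
C
C i.e. it doubles the triangle and adds the diagonal squares in one pass,
C completing the square of {up, n}.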

	lea	16(up), up
	lea	40(rp), rp
	pop	n
	lea	2(n,n), n

	mov	(up,n,4), %rax
	mul	%rax
	xor	R32(w2), R32(w2)

	mov	8(rp,n,8), w0
	mov	%rax, (rp,n,8)
	jmp	L(lm)

	ALIGN(8)
L(tsd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, -8(rp,n,8)
	mov	8(rp,n,8), w0
	mov	w1, (rp,n,8)
L(lm):	mov	16(rp,n,8), w1
	adc	w0, w0
	adc	w1, w1
	lea	(%rdx,w2), %rbx
	mov	8(up,n,4), %rax
	setc	R8(w2)
	mul	%rax
	add	$2, n
	js	L(tsd)

L(esd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, X(-8(rp,n,8),-8(rp))
	mov	w1, X((rp,n,8),(rp))
	adc	w2, %rdx
	mov	%rdx, X(8(rp,n,8),8(rp))

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
