14a1767b4Smrgdnl  Alpha ev6 nails mpn_addmul_4.
24a1767b4Smrg
34a1767b4Smrgdnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4*f81b1c5bSmrg
54a1767b4Smrgdnl  This file is part of the GNU MP Library.
64a1767b4Smrgdnl
7*f81b1c5bSmrgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8*f81b1c5bSmrgdnl  it under the terms of either:
94a1767b4Smrgdnl
10*f81b1c5bSmrgdnl    * the GNU Lesser General Public License as published by the Free
11*f81b1c5bSmrgdnl      Software Foundation; either version 3 of the License, or (at your
12*f81b1c5bSmrgdnl      option) any later version.
134a1767b4Smrgdnl
14*f81b1c5bSmrgdnl  or
15*f81b1c5bSmrgdnl
16*f81b1c5bSmrgdnl    * the GNU General Public License as published by the Free Software
17*f81b1c5bSmrgdnl      Foundation; either version 2 of the License, or (at your option) any
18*f81b1c5bSmrgdnl      later version.
19*f81b1c5bSmrgdnl
20*f81b1c5bSmrgdnl  or both in parallel, as here.
21*f81b1c5bSmrgdnl
22*f81b1c5bSmrgdnl  The GNU MP Library is distributed in the hope that it will be useful, but
23*f81b1c5bSmrgdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24*f81b1c5bSmrgdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25*f81b1c5bSmrgdnl  for more details.
26*f81b1c5bSmrgdnl
27*f81b1c5bSmrgdnl  You should have received copies of the GNU General Public License and the
28*f81b1c5bSmrgdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29*f81b1c5bSmrgdnl  see https://www.gnu.org/licenses/.
304a1767b4Smrg
314a1767b4Smrginclude(`../config.m4')
324a1767b4Smrg
334a1767b4SmrgC Runs at 2.5 cycles/limb.
344a1767b4Smrg
354a1767b4SmrgC We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
364a1767b4SmrgC to 3.24 insn/cycle.
374a1767b4Smrg
384a1767b4Smrg
394a1767b4SmrgC  INPUT PARAMETERS
C    rp  limbs that are loaded, added to and stored back; three further
C        limbs are stored past the n that are read
C    up  multiplicand limbs, one loaded per limb processed
C    n   limb count, decremented once per up limb consumed
C    vp  the four multiplier limbs, loaded once at entry
404a1767b4Smrgdefine(`rp',`r16')
414a1767b4Smrgdefine(`up',`r17')
424a1767b4Smrgdefine(`n',`r18')
434a1767b4Smrgdefine(`vp',`r19')
444a1767b4Smrg
454a1767b4SmrgC  Useful register aliases
C    numb_mask  all ones shifted right by NAIL_BITS; used to extract the
C               numb part of a sum
C    ulimb      the up limb currently being multiplied
C    rlimb      the rp limb currently being added in
464a1767b4Smrgdefine(`numb_mask',`r24')
474a1767b4Smrgdefine(`ulimb',`r25')
484a1767b4Smrgdefine(`rlimb',`r27')
494a1767b4Smrg
C  Product registers: mNa is the mulq destination and mNb the umulh
C  destination for vN times ulimb.
504a1767b4Smrgdefine(`m0a',`r0')
514a1767b4Smrgdefine(`m0b',`r1')
524a1767b4Smrgdefine(`m1a',`r2')
534a1767b4Smrgdefine(`m1b',`r3')
544a1767b4Smrgdefine(`m2a',`r20')
554a1767b4Smrgdefine(`m2b',`r21')
564a1767b4Smrgdefine(`m3a',`r12')
574a1767b4Smrgdefine(`m3b',`r13')
584a1767b4Smrg
C  Partial sums carried from one limb position to the next; acc0 is the
C  one that feeds the limb being stored.
594a1767b4Smrgdefine(`acc0',`r4')
604a1767b4Smrgdefine(`acc1',`r5')
614a1767b4Smrgdefine(`acc2',`r22')
624a1767b4Smrgdefine(`acc3',`r14')
634a1767b4Smrg
C  The four multiplier limbs, held shifted left by NAIL_BITS.
644a1767b4Smrgdefine(`v0',`r6')
654a1767b4Smrgdefine(`v1',`r7')
664a1767b4Smrgdefine(`v2',`r23')
674a1767b4Smrgdefine(`v3',`r15')
684a1767b4Smrg
694a1767b4SmrgC Used for temps: r8 r19 r28
C r19 is also vp; it is cleared and reused as the nail carry once the four
C multiplier limbs have been loaded.  r8 holds the shifted low product half
C and r28 the masked limb about to be stored.
704a1767b4Smrg
714a1767b4Smrgdefine(`NAIL_BITS',`GMP_NAIL_BITS')
724a1767b4Smrgdefine(`NUMB_BITS',`GMP_NUMB_BITS')
734a1767b4Smrg
744a1767b4SmrgC  This declaration is munged by configure
754a1767b4SmrgNAILS_SUPPORT(4-63)
764a1767b4Smrg
774a1767b4SmrgASM_START()
C mpn_addmul_4, nails build.
C
C For each of the n limbs at up, forms its product with the four limbs at vp
C and adds it, together with the existing limb at rp, into a running sum.
C n limbs at rp are read and rewritten, three further limbs are stored at
C rp[n], rp[n+1] and rp[n+2], and the remaining top part is left in r0
C (presumably the return value register; confirm against the calling
C convention in use).
C
C n is assumed to be at least 1: the count is decremented before it is first
C tested, so n = 0 would not take the early exit to L(end).
C
C The instruction order is significant.  Each srl/addq that consumes mNa/mNb
C is placed ahead of the mulq/umulh that overwrites the same register with
C the product for the next up limb.
784a1767b4SmrgPROLOGUE(mpn_addmul_4)
C Move r30 down by 240 bytes and spill r12..r15 (aliased as m3a, m3b, acc3
C and v3); they are reloaded just before the ret.  Only offsets 32..56 of
C the 240 bytes are touched in this routine.
794a1767b4Smrg	lda	r30,	-240(r30)
804a1767b4Smrg	stq	r12,	32(r30)
814a1767b4Smrg	stq	r13,	40(r30)
824a1767b4Smrg	stq	r14,	48(r30)
834a1767b4Smrg	stq	r15,	56(r30)
844a1767b4Smrg
C numb_mask = all ones with the top NAIL_BITS bits cleared.
854a1767b4Smrg	lda	numb_mask,-1(r31)
864a1767b4Smrg	srl	numb_mask,NAIL_BITS,numb_mask
874a1767b4Smrg
C Load the four multiplier limbs.
884a1767b4Smrg	ldq	v0,	0(vp)
894a1767b4Smrg	ldq	v1,	8(vp)
904a1767b4Smrg	ldq	v2,	16(vp)
914a1767b4Smrg	ldq	v3,	24(vp)
924a1767b4Smrg
C Zero the accumulators and pre-shift each vN left by NAIL_BITS.  The mulq
C results are shifted right by NAIL_BITS again before use; assuming
C NAIL_BITS + NUMB_BITS = 64, that leaves the low NUMB_BITS of the product
C in r8 and makes umulh deliver the product bits above them.
934a1767b4Smrg	bis	r31,	r31,	acc0		C	zero acc0
944a1767b4Smrg	sll	v0,NAIL_BITS,	v0
954a1767b4Smrg	bis	r31,	r31,	acc1		C	zero acc1
964a1767b4Smrg	sll	v1,NAIL_BITS,	v1
974a1767b4Smrg	bis	r31,	r31,	acc2		C	zero acc2
984a1767b4Smrg	sll	v2,NAIL_BITS,	v2
994a1767b4Smrg	bis	r31,	r31,	acc3		C	zero acc3
1004a1767b4Smrg	sll	v3,NAIL_BITS,	v3
C vp (r19) is no longer needed; r19 becomes the nail carry, initially zero.
1014a1767b4Smrg	bis	r31,	r31,	r19
1024a1767b4Smrg
C Start the eight multiplies for the first up limb ahead of the loop, count
C that limb, and go straight to L(end) if it was the only one.
1034a1767b4Smrg	ldq	ulimb,	0(up)
1044a1767b4Smrg	lda	up,	8(up)
1054a1767b4Smrg	mulq	v0,	ulimb,	m0a		C U1
1064a1767b4Smrg	umulh	v0,	ulimb,	m0b		C U1
1074a1767b4Smrg	mulq	v1,	ulimb,	m1a		C U1
1084a1767b4Smrg	umulh	v1,	ulimb,	m1b		C U1
1094a1767b4Smrg	lda	n,	-1(n)
1104a1767b4Smrg	mulq	v2,	ulimb,	m2a		C U1
1114a1767b4Smrg	umulh	v2,	ulimb,	m2b		C U1
1124a1767b4Smrg	mulq	v3,	ulimb,	m3a		C U1
1134a1767b4Smrg	umulh	v3,	ulimb,	m3b		C U1
1144a1767b4Smrg	beq	n,	L(end)			C U0
1154a1767b4Smrg
C On entry to L(top): m0a..m3b hold the products for the up limb fetched
C previously, acc0..acc3 hold the partial sums carried over, and r19 holds
C the nail carry from the limb last stored (zero the first time round).
C Each pass consumes those products while issuing the multiplies for the
C newly loaded ulimb.
C
C The U0/U1/L0/L1 tags presumably name the ev6 issue slot intended for each
C instruction, with the bis r31,r31,r31 nops filling out groups of four;
C confirm against the ev6 scheduling documentation before reordering.
1164a1767b4Smrg	ALIGN(16)
1174a1767b4SmrgL(top):	bis	r31,	r31,	r31		C U1	nop
1184a1767b4Smrg	ldq	rlimb,	0(rp)			C L0
1194a1767b4Smrg	ldq	ulimb,	0(up)			C L1
1204a1767b4Smrg	addq	r19,	acc0,	acc0		C U0	propagate nail
1214a1767b4Smrg
1224a1767b4Smrg	bis	r31,	r31,	r31		C L0	nop
1234a1767b4Smrg	bis	r31,	r31,	r31		C U1	nop
1244a1767b4Smrg	bis	r31,	r31,	r31		C L1	nop
1254a1767b4Smrg	bis	r31,	r31,	r31		C U0	nop
1264a1767b4Smrg
C rp is advanced here, so the store of this limb further down uses -8(rp).
C r8 takes the old m0a before the mulq below overwrites it.
1274a1767b4Smrg	lda	rp,	8(rp)			C L0
1284a1767b4Smrg	srl	m0a,NAIL_BITS,	r8		C U0
1294a1767b4Smrg	lda	up,	8(up)			C L1
1304a1767b4Smrg	mulq	v0,	ulimb,	m0a		C U1
1314a1767b4Smrg
C r19 = low part of v0 product + acc0 (rlimb is added next); acc0 is then
C rebuilt from the old m0b and acc1 before umulh overwrites m0b.
1324a1767b4Smrg	addq	r8,	acc0,	r19		C U0
1334a1767b4Smrg	addq	m0b,	acc1,	acc0		C L0
1344a1767b4Smrg	umulh	v0,	ulimb,	m0b		C U1
1354a1767b4Smrg	bis	r31,	r31,	r31		C L1	nop
1364a1767b4Smrg
1374a1767b4Smrg	addq	rlimb,	r19,	r19		C L0
1384a1767b4Smrg	srl	m1a,NAIL_BITS,	r8		C U0
1394a1767b4Smrg	bis	r31,	r31,	r31		C L1	nop
1404a1767b4Smrg	mulq	v1,	ulimb,	m1a		C U1
1414a1767b4Smrg
C acc0 += low part of v1 product; acc1 = old m1b + acc2.
1424a1767b4Smrg	addq	r8,	acc0,	acc0		C U0
1434a1767b4Smrg	addq	m1b,	acc2,	acc1		C L0
1444a1767b4Smrg	umulh	v1,	ulimb,	m1b		C U1
1454a1767b4Smrg	and	r19,numb_mask,	r28		C L1	extract numb part
1464a1767b4Smrg
1474a1767b4Smrg	bis	r31,	r31,	r31		C L0	nop
1484a1767b4Smrg	srl	m2a,NAIL_BITS,	r8		C U0
1494a1767b4Smrg	lda	n,	-1(n)			C L1
1504a1767b4Smrg	mulq	v2,	ulimb,	m2a		C U1
1514a1767b4Smrg
C acc1 += low part of v2 product; acc2 = old m2b + acc3.
1524a1767b4Smrg	addq	r8,	acc1,	acc1		C L1
1534a1767b4Smrg	addq	m2b,	acc3,	acc2		C L0
1544a1767b4Smrg	umulh	v2,	ulimb,	m2b		C U1
1554a1767b4Smrg	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
1564a1767b4Smrg
1574a1767b4Smrg	bis	r31,	r31,	r31		C L0	nop
1584a1767b4Smrg	srl	m3a,NAIL_BITS,	r8		C U0
1594a1767b4Smrg	stq	r28,	-8(rp)			C L1
1604a1767b4Smrg	mulq	v3,	ulimb,	m3a		C U1
1614a1767b4Smrg
C acc2 += low part of v3 product; acc3 = old m3b (copied, nothing above it).
1624a1767b4Smrg	addq	r8,	acc2,	acc2		C L0
1634a1767b4Smrg	bis	r31,	m3b,	acc3		C L1
1644a1767b4Smrg	umulh	v3,	ulimb,	m3b		C U1
1654a1767b4Smrg	bne	n,	L(top)			C U0
1664a1767b4Smrg
C Wind-down for the last up limb: the same arithmetic as one loop pass, but
C without loading another up limb or issuing further multiplies.
1674a1767b4SmrgL(end):	ldq	rlimb,	0(rp)
1684a1767b4Smrg	addq	r19,	acc0,	acc0		C	propagate nail
1694a1767b4Smrg	lda	rp,	8(rp)			C FIXME: DELETE
C Note on the FIXME above: the four stores below use offsets -8, 0, 8 and 16
C relative to the advanced rp, so removing the lda means lowering each of
C those offsets by 8.
1704a1767b4Smrg	srl	m0a,NAIL_BITS,	r8		C U0
1714a1767b4Smrg	addq	r8,	acc0,	r19
1724a1767b4Smrg	addq	m0b,	acc1,	acc0
1734a1767b4Smrg	addq	rlimb,	r19,	r19
1744a1767b4Smrg	srl	m1a,NAIL_BITS,	r8		C U0
1754a1767b4Smrg	addq	r8,	acc0,	acc0
1764a1767b4Smrg	addq	m1b,	acc2,	acc1
1774a1767b4Smrg	and	r19,numb_mask,	r28		C extract limb
1784a1767b4Smrg	srl	m2a,NAIL_BITS,	r8		C U0
1794a1767b4Smrg	addq	r8,	acc1,	acc1
1804a1767b4Smrg	addq	m2b,	acc3,	acc2
1814a1767b4Smrg	srl	r19,NUMB_BITS,	r19		C extract nail
1824a1767b4Smrg	srl	m3a,NAIL_BITS,	r8		C U0
1834a1767b4Smrg	stq	r28,	-8(rp)
1844a1767b4Smrg	addq	r8,	acc2,	acc2
1854a1767b4Smrg	bis	r31,	m3b,	acc3
1864a1767b4Smrg
C Flush the remaining accumulators.  Each of acc0..acc2 is combined with the
C nail carry from the limb below, its numb part is stored, and its nail part
C is passed upwards.  These three limbs are stored without first reading rp.
1874a1767b4Smrg	addq	r19,	acc0,	acc0		C propagate nail
1884a1767b4Smrg	and	acc0,numb_mask,	r28
1894a1767b4Smrg	stq	r28,	0(rp)
1904a1767b4Smrg	srl	acc0,NUMB_BITS,	r19
1914a1767b4Smrg	addq	r19,	acc1,	acc1
1924a1767b4Smrg
1934a1767b4Smrg	and	acc1,numb_mask,	r28
1944a1767b4Smrg	stq	r28,	8(rp)
1954a1767b4Smrg	srl	acc1,NUMB_BITS,	r19
1964a1767b4Smrg	addq	r19,	acc2,	acc2
1974a1767b4Smrg
1984a1767b4Smrg	and	acc2,numb_mask,	r28
1994a1767b4Smrg	stq	r28,	16(rp)
2004a1767b4Smrg	srl	acc2,NUMB_BITS,	r19
C Last carry plus acc3 goes to r0; it is not masked with numb_mask.
2014a1767b4Smrg	addq	r19,	acc3,	r0
2024a1767b4Smrg
C Reload r12..r15, put r30 back where it was on entry and return via r26.
2034a1767b4Smrg	ldq	r12,	32(r30)
2044a1767b4Smrg	ldq	r13,	40(r30)
2054a1767b4Smrg	ldq	r14,	48(r30)
2064a1767b4Smrg	ldq	r15,	56(r30)
2074a1767b4Smrg	lda	r30,	240(r30)
2084a1767b4Smrg	ret	r31,	(r26),	1
2094a1767b4SmrgEPILOGUE()
2104a1767b4SmrgASM_END()
211