dnl  Alpha ev6 nails mpn_addmul_4.

dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Runs at 2.5 cycles/limb.

C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
C to 3.24 insn/cycle.
C INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n',`r18')
define(`vp',`r19')

C Useful register aliases
define(`numb_mask',`r24')
define(`ulimb',`r25')
define(`rlimb',`r27')

C m<k>a receives the mulq (low) half and m<k>b the umulh (high) half of the
C product of the current u limb with v<k>.
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r12')
define(`m3b',`r13')

C Running column sums; acc0 is the column that is completed next.
define(`acc0',`r4')
define(`acc1',`r5')
define(`acc2',`r22')
define(`acc3',`r14')

C The four multiplier limbs, kept pre-shifted left by NAIL_BITS.
define(`v0',`r6')
define(`v1',`r7')
define(`v2',`r23')
define(`v3',`r15')

C Used for temps: r8 r19 r28
C Note that r19 is also vp.  It is only reused (as the nail carry between
C columns) after the four v limbs have been loaded through vp.

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

C This declaration is munged by configure
NAILS_SUPPORT(4-63)

C mpn_addmul_4 -- nails variant.
C
C On entry:
C   rp (r16)  limb pointer that is both read and written
C   up (r17)  pointer to the multiplicand limbs
C   n  (r18)  number of limbs read from up
C   vp (r19)  pointer to four multiplier limbs, read at offsets 0, 8, 16, 24
C
C For every u limb the four products with v0..v3 are folded into
C acc0..acc3.  The completed lowest column is added to the limb loaded from
C rp, masked with numb_mask and stored back to the same rp slot; the bits
C above NUMB_BITS are carried into the next column through r19.  After the
C last u limb the three remaining columns are stored (masked, without
C reading rp) to the next three rp slots, and the last column plus carry is
C returned in r0.
C
C The v limbs are shifted left by NAIL_BITS once, up front.  Assuming
C NUMB_BITS + NAIL_BITS equals the 64-bit register width, umulh then gives
C the product bits above NUMB_BITS directly, and mulq followed by a right
C shift of NAIL_BITS gives the low NUMB_BITS bits.
C
C The first u limb is loaded and multiplied before n is examined, and n is
C decremented before the first test, so callers must pass n >= 1.
C
C r12-r15 (m3a, m3b, acc3, v3) are saved on entry and restored on exit.
C A 240-byte frame is allocated; this routine itself only touches offsets
C 32..56 of it.
C
C The U0/U1/L0/L1 tags on instructions are the issue-slot annotations from
C the original hand scheduling; instruction order is significant for speed
C and must not be rearranged casually.

ASM_START()
PROLOGUE(mpn_addmul_4)
	lda	r30,	-240(r30)
	stq	r12,	32(r30)
	stq	r13,	40(r30)
	stq	r14,	48(r30)
	stq	r15,	56(r30)

C numb_mask = all ones shifted right by NAIL_BITS, i.e. the low NUMB_BITS
C bits set (under the width assumption stated in the header).
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

	ldq	v0,	0(vp)
	ldq	v1,	8(vp)
	ldq	v2,	16(vp)
	ldq	v3,	24(vp)

C Clear the accumulators and pre-shift the multiplier limbs.  The last bis
C clears r19; vp is dead from here on and r19 becomes the nail carry.
	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	acc2		C	zero acc2
	sll	v2,NAIL_BITS,	v2
	bis	r31,	r31,	acc3		C	zero acc3
	sll	v3,NAIL_BITS,	v3
	bis	r31,	r31,	r19

C Start the products for the first u limb.  If that was the only limb,
C skip the loop and go straight to the wind-down code.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)
	mulq	v2,	ulimb,	m2a		C U1
	umulh	v2,	ulimb,	m2b		C U1
	mulq	v3,	ulimb,	m3a		C U1
	umulh	v3,	ulimb,	m3b		C U1
	beq	n,	L(end)			C U0

C Main loop.  Each pass consumes the products computed for the previous u
C limb while issuing the multiplies for the u limb loaded at the top, then
C stores one finished limb at -8(rp) (rp has already been advanced).
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1
	addq	r19,	acc0,	acc0		C U0	propagate nail

	bis	r31,	r31,	r31		C L0	nop
	bis	r31,	r31,	r31		C U1	nop
	bis	r31,	r31,	r31		C L1	nop
	bis	r31,	r31,	r31		C U0	nop

	lda	rp,	8(rp)			C L0
	srl	m0a,NAIL_BITS,	r8		C U0
	lda	up,	8(up)			C L1
	mulq	v0,	ulimb,	m0a		C U1

	addq	r8,	acc0,	r19		C U0
	addq	m0b,	acc1,	acc0		C L0
	umulh	v0,	ulimb,	m0b		C U1
	bis	r31,	r31,	r31		C L1	nop

	addq	rlimb,	r19,	r19		C L0
	srl	m1a,NAIL_BITS,	r8		C U0
	bis	r31,	r31,	r31		C L1	nop
	mulq	v1,	ulimb,	m1a		C U1

	addq	r8,	acc0,	acc0		C U0
	addq	m1b,	acc2,	acc1		C L0
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L1	extract numb part

	bis	r31,	r31,	r31		C L0	nop
	srl	m2a,NAIL_BITS,	r8		C U0
	lda	n,	-1(n)			C L1
	mulq	v2,	ulimb,	m2a		C U1

	addq	r8,	acc1,	acc1		C L1
	addq	m2b,	acc3,	acc2		C L0
	umulh	v2,	ulimb,	m2b		C U1
	srl	r19,NUMB_BITS,	r19		C U0	extract nail part

	bis	r31,	r31,	r31		C L0	nop
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)			C L1
	mulq	v3,	ulimb,	m3a		C U1

	addq	r8,	acc2,	acc2		C L0
	bis	r31,	m3b,	acc3		C L1
	umulh	v3,	ulimb,	m3b		C U1
	bne	n,	L(top)			C U0

C Wind-down: same column arithmetic as one loop pass, for the products of
C the last u limb, but with no further loads from up and no new multiplies.
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	lda	rp,	8(rp)			C FIXME: DELETE
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	addq	m1b,	acc2,	acc1
	and	r19,numb_mask,	r28		C	extract limb
	srl	m2a,NAIL_BITS,	r8		C U0
	addq	r8,	acc1,	acc1
	addq	m2b,	acc3,	acc2
	srl	r19,NUMB_BITS,	r19		C	extract nail
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)
	addq	r8,	acc2,	acc2
	bis	r31,	m3b,	acc3

C Flush the three pending columns to 0(rp), 8(rp) and 16(rp), rippling the
C nail carry upwards; these slots are written without being read first.
	addq	r19,	acc0,	acc0		C	propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	0(rp)
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	acc1

	and	acc1,numb_mask,	r28
	stq	r28,	8(rp)
	srl	acc1,NUMB_BITS,	r19
	addq	r19,	acc2,	acc2

	and	acc2,numb_mask,	r28
	stq	r28,	16(rp)
	srl	acc2,NUMB_BITS,	r19
C Return value in r0: last column plus the carry out of acc2 (not masked).
	addq	r19,	acc3,	r0

	ldq	r12,	32(r30)
	ldq	r13,	40(r30)
	ldq	r14,	48(r30)
	ldq	r15,	56(r30)
	lda	r30,	240(r30)
	ret	r31,	(r26),	1
EPILOGUE()
ASM_END()