dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
dnl  C is 1, 2, 3.  Optimized for Pentium 4.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C	     cycles/limb
C AMD K8,K9	 3.8
C AMD K10	 3.8
C Intel P4	 5.8
C Intel core2	 4.75
C Intel corei	 4.75
C Intel atom	 ?
C VIA nano	 4.75


C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

define(M, eval(m4_lshift(1,LSH)))

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbx
	push	%r12
	push	%rbp

	mov	(vp), %r9
	shl	$LSH, %r9
	mov	4(vp), R32(%rbp)

	xor	R32(%rbx), R32(%rbx)

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jne	L(n00)		C n = 0, 4, 8, ...

	mov	(up), %r8
	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	-16(rp), rp
	jmp	L(L00)

L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)		C n = 1, 5, 9, ...
	mov	(up), %r11
	lea	-8(rp), rp
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	dec	n
	jz	L(1)		C jump for n = 1
	mov	8(up), %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	mov	12(vp), R32(%rbp)
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

L(n01):	jne	L(n10)		C n = 2, 6, 10, ...
	mov	(up), %r12
	mov	8(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	mov	(up), %r10
	mov	8(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	12(vp), R32(%rbp)
	lea	-24(rp), rp
	lea	-8(up), up
	lea	-8(vp), vp
	jmp	L(L11)

L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

	ALIGN(16)
L(top):	mov	(up), %r8	C not on critical path
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11	C not on critical path
	mov	(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)	C save carry out
	mov	4(vp), R32(%rbp)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11	C apply previous carry out
	jc	L(c0)		C jump if ripple
L(rc0):
L(L01):	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	mov	%r11, 8(rp)
	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):
L(L00):	mov	16(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	20(vp), R32(%rbp)
	mov	%r8, 16(rp)
	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):
L(L11):	mov	24(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	24(vp), %r9
	lea	(%rbp,%r9,M), %r9
	lea	32(up), up
	lea	32(vp), vp
	setc	R8(%rax)
	mov	-4(vp), R32(%rbp)
	mov	%r10, 24(rp)
	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	lea	32(rp), rp
L(L10):	sub	$4, n
	ja	L(top)

L(end):
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11
	jnc	L(1)
	mov	$1, R8(%rbx)
L(1):	mov	%r11, 8(rp)
	lea	(%rbx,%rbp), R32(%rax)
	pop	%rbp
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()
ASM_END()
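dnl  For reference, a minimal C sketch of the value this routine computes in
dnl  its "add" configuration (the subtract variant is analogous, with borrows
dnl  and a borrow-out return).  The mpn types are GMP's public ones, but the
dnl  helper name ref_addlshC_n and its body are illustrative assumptions only
dnl  and take no part in the build (dnl lines are discarded by m4):
dnl
dnl	mp_limb_t
dnl	ref_addlshC_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, int c)
dnl	{
dnl	  mp_limb_t cy = 0, sh = 0;	/* running carry; bits shifted out of vp */
dnl	  for (mp_size_t i = 0; i < n; i++)
dnl	    {
dnl	      mp_limb_t v = (vp[i] << c) | sh;		/* vp[i] << C plus bits from below */
dnl	      sh = vp[i] >> (GMP_NUMB_BITS - c);	/* bits moving up one limb */
dnl	      mp_limb_t s = up[i] + v;
dnl	      mp_limb_t c1 = s < up[i];			/* carry from up[i] + v */
dnl	      rp[i] = s + cy;
dnl	      cy = c1 + (rp[i] < s);			/* carry into next limb */
dnl	    }
dnl	  return cy + sh;		/* carry limb, in the range 0 .. 2^C */
dnl	}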