dnl  AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
30*671ea119Smrg 31*671ea119Smrginclude(`../config.m4') 32*671ea119Smrg 33*671ea119SmrgC cycles/limb 34*671ea119SmrgC AMD K8,K9 1.77 35*671ea119SmrgC AMD K10 1.76\1.82 36*671ea119SmrgC AMD bd1 1.67\2.12 37*671ea119SmrgC AMD bd2 1.62\1.82 38*671ea119SmrgC AMD bd3 39*671ea119SmrgC AMD bd4 1.55\2.2 40*671ea119SmrgC AMD zen 41*671ea119SmrgC AMD bt1 2.54 42*671ea119SmrgC AMD bt2 2 43*671ea119SmrgC Intel P4 11 44*671ea119SmrgC Intel PNR 4.76 45*671ea119SmrgC Intel NHM 5.27 46*671ea119SmrgC Intel SBR 2 47*671ea119SmrgC Intel IBR 1.94 48*671ea119SmrgC Intel HWL 1.63 49*671ea119SmrgC Intel BWL 1.51 50*671ea119SmrgC Intel SKL 1.51 51*671ea119SmrgC Intel atom 3.56 52*671ea119SmrgC Intel SLM 4 53*671ea119SmrgC VIA nano 54*671ea119Smrg 55*671ea119SmrgC The loop of this code is the result of running a code generation and 56*671ea119SmrgC optimization tool suite written by David Harvey and Torbjorn Granlund. 57*671ea119Smrg 58*671ea119SmrgC INPUT PARAMETERS 59*671ea119Smrgdefine(`rp', `%rdi') C rcx 60*671ea119Smrgdefine(`up', `%rsi') C rdx 61*671ea119Smrgdefine(`vp', `%rdx') C r8 62*671ea119Smrgdefine(`n', `%rcx') C r9 63*671ea119Smrgdefine(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) 64*671ea119Smrg 65*671ea119Smrgifdef(`OPERATION_add_n', ` 66*671ea119Smrg define(ADCSBB, adc) 67*671ea119Smrg define(func, mpn_add_n) 68*671ea119Smrg define(func_nc, mpn_add_nc)') 69*671ea119Smrgifdef(`OPERATION_sub_n', ` 70*671ea119Smrg define(ADCSBB, sbb) 71*671ea119Smrg define(func, mpn_sub_n) 72*671ea119Smrg define(func_nc, mpn_sub_nc)') 73*671ea119Smrg 74*671ea119SmrgMULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) 75*671ea119Smrg 76*671ea119SmrgABI_SUPPORT(DOS64) 77*671ea119SmrgABI_SUPPORT(STD64) 78*671ea119Smrg 79*671ea119SmrgASM_START() 80*671ea119Smrg TEXT 81*671ea119Smrg ALIGN(16) 82*671ea119SmrgPROLOGUE(func) 83*671ea119Smrg FUNC_ENTRY(4) 84*671ea119Smrg xor %r8, %r8 85*671ea119SmrgL(ent): test $1, R8(n) 86*671ea119Smrg jnz L(bx1) 87*671ea119Smrg 88*671ea119SmrgL(bx0): test 
$2, R8(n) 89*671ea119Smrg jnz L(b10) 90*671ea119Smrg 91*671ea119SmrgL(b00): shr $2, n 92*671ea119Smrg neg %r8 93*671ea119Smrg mov $3, R32(%rax) 94*671ea119Smrg mov (up), %r10 95*671ea119Smrg mov 8(up), %r11 96*671ea119Smrg jmp L(lo0) 97*671ea119Smrg 98*671ea119SmrgL(b10): shr $2, n 99*671ea119Smrg neg %r8 100*671ea119Smrg mov $1, R32(%rax) 101*671ea119Smrg mov (up), %r8 102*671ea119Smrg mov 8(up), %r9 103*671ea119Smrg jrcxz L(cj2) 104*671ea119Smrg jmp L(top) 105*671ea119Smrg 106*671ea119SmrgL(bx1): test $2, R8(n) 107*671ea119Smrg jnz L(b11) 108*671ea119Smrg 109*671ea119SmrgL(b01): shr $2, n 110*671ea119Smrg neg %r8 111*671ea119Smrg mov $0, R32(%rax) 112*671ea119Smrg mov (up), %r9 113*671ea119Smrg jrcxz L(cj1) 114*671ea119Smrg mov 8(up), %r10 115*671ea119Smrg jmp L(lo1) 116*671ea119Smrg 117*671ea119Smrg ALIGN(8) 118*671ea119SmrgL(b11): inc n 119*671ea119Smrg shr $2, n 120*671ea119Smrg neg %r8 121*671ea119Smrg mov $2, R32(%rax) 122*671ea119Smrg mov (up), %r11 123*671ea119Smrg jmp L(lo3) 124*671ea119Smrg 125*671ea119Smrg ALIGN(4) 126*671ea119SmrgL(top): mov 8(up,%rax,8), %r10 127*671ea119Smrg ADCSBB -8(vp,%rax,8), %r8 128*671ea119Smrg mov %r8, -8(rp,%rax,8) 129*671ea119SmrgL(lo1): mov 16(up,%rax,8), %r11 130*671ea119Smrg ADCSBB (vp,%rax,8), %r9 131*671ea119Smrg lea 4(%rax), %rax 132*671ea119Smrg mov %r9, -32(rp,%rax,8) 133*671ea119SmrgL(lo0): ADCSBB -24(vp,%rax,8), %r10 134*671ea119Smrg mov %r10, -24(rp,%rax,8) 135*671ea119SmrgL(lo3): ADCSBB -16(vp,%rax,8), %r11 136*671ea119Smrg dec n 137*671ea119Smrg mov -8(up,%rax,8), %r8 138*671ea119Smrg mov %r11, -16(rp,%rax,8) 139*671ea119SmrgL(lo2): mov (up,%rax,8), %r9 140*671ea119Smrg jnz L(top) 141*671ea119Smrg 142*671ea119SmrgL(cj2): ADCSBB -8(vp,%rax,8), %r8 143*671ea119Smrg mov %r8, -8(rp,%rax,8) 144*671ea119SmrgL(cj1): ADCSBB (vp,%rax,8), %r9 145*671ea119Smrg mov %r9, (rp,%rax,8) 146*671ea119Smrg 147*671ea119Smrg mov $0, R32(%rax) 148*671ea119Smrg adc $0, R32(%rax) 149*671ea119Smrg 150*671ea119Smrg FUNC_EXIT() 
151*671ea119Smrg ret 152*671ea119SmrgEPILOGUE() 153*671ea119Smrg 154*671ea119Smrg ALIGN(16) 155*671ea119SmrgPROLOGUE(func_nc) 156*671ea119Smrg FUNC_ENTRY(4) 157*671ea119SmrgIFDOS(` mov 56(%rsp), %r8 ') 158*671ea119Smrg jmp L(ent) 159*671ea119SmrgEPILOGUE() 160