dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
30*f81b1c5bSmrg 31*f81b1c5bSmrginclude(`../config.m4') 32*f81b1c5bSmrg 33*f81b1c5bSmrgC cycles/limb 34*f81b1c5bSmrgC AMD K8,K9 2 35*f81b1c5bSmrgC AMD K10 2 36*f81b1c5bSmrgC AMD bd1 2.32 37*f81b1c5bSmrgC AMD bobcat 3 38*f81b1c5bSmrgC Intel P4 13 39*f81b1c5bSmrgC Intel core2 2.9 40*f81b1c5bSmrgC Intel NHM 2.8 41*f81b1c5bSmrgC Intel SBR 2.4 42*f81b1c5bSmrgC Intel atom 5.33 43*f81b1c5bSmrgC VIA nano 3 44*f81b1c5bSmrg 45*f81b1c5bSmrgC NOTES 46*f81b1c5bSmrgC * It might seem natural to use the cmov insn here, but since this function 47*f81b1c5bSmrgC is supposed to have the exact same execution pattern for cnd true and 48*f81b1c5bSmrgC false, and since cmov's documentation is not clear about whether it 49*f81b1c5bSmrgC actually reads both source operands and writes the register for a false 50*f81b1c5bSmrgC condition, we cannot use it. 51*f81b1c5bSmrgC * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory 52*f81b1c5bSmrgC to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use 53*f81b1c5bSmrgC ADCSBB-to-memory, again saving 1 insn/limb. 54*f81b1c5bSmrgC * This runs optimally at decoder bandwidth on K10. It has not been tuned 55*f81b1c5bSmrgC for any other processor. 
56*f81b1c5bSmrg 57*f81b1c5bSmrgC INPUT PARAMETERS 58*f81b1c5bSmrgdefine(`cnd', `%rdi') dnl rcx 59*f81b1c5bSmrgdefine(`rp', `%rsi') dnl rdx 60*f81b1c5bSmrgdefine(`up', `%rdx') dnl r8 61*f81b1c5bSmrgdefine(`vp', `%rcx') dnl r9 62*f81b1c5bSmrgdefine(`n', `%r8') dnl rsp+40 63*f81b1c5bSmrg 64*f81b1c5bSmrgifdef(`OPERATION_cnd_add_n', ` 65*f81b1c5bSmrg define(ADDSUB, add) 66*f81b1c5bSmrg define(ADCSBB, adc) 67*f81b1c5bSmrg define(func, mpn_cnd_add_n)') 68*f81b1c5bSmrgifdef(`OPERATION_cnd_sub_n', ` 69*f81b1c5bSmrg define(ADDSUB, sub) 70*f81b1c5bSmrg define(ADCSBB, sbb) 71*f81b1c5bSmrg define(func, mpn_cnd_sub_n)') 72*f81b1c5bSmrg 73*f81b1c5bSmrgMULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) 74*f81b1c5bSmrg 75*f81b1c5bSmrgABI_SUPPORT(DOS64) 76*f81b1c5bSmrgABI_SUPPORT(STD64) 77*f81b1c5bSmrg 78*f81b1c5bSmrgASM_START() 79*f81b1c5bSmrg TEXT 80*f81b1c5bSmrg ALIGN(16) 81*f81b1c5bSmrgPROLOGUE(func) 82*f81b1c5bSmrg FUNC_ENTRY(4) 83*f81b1c5bSmrgIFDOS(` mov 56(%rsp), R32(%r8)') 84*f81b1c5bSmrg push %rbx 85*f81b1c5bSmrg push %rbp 86*f81b1c5bSmrg push %r12 87*f81b1c5bSmrg push %r13 88*f81b1c5bSmrg push %r14 89*f81b1c5bSmrg 90*f81b1c5bSmrg neg cnd 91*f81b1c5bSmrg sbb cnd, cnd C make cnd mask 92*f81b1c5bSmrg 93*f81b1c5bSmrg lea (vp,n,8), vp 94*f81b1c5bSmrg lea (up,n,8), up 95*f81b1c5bSmrg lea (rp,n,8), rp 96*f81b1c5bSmrg 97*f81b1c5bSmrg mov R32(n), R32(%rax) 98*f81b1c5bSmrg neg n 99*f81b1c5bSmrg and $3, R32(%rax) 100*f81b1c5bSmrg jz L(top) C carry-save reg rax = 0 in this arc 101*f81b1c5bSmrg cmp $2, R32(%rax) 102*f81b1c5bSmrg jc L(b1) 103*f81b1c5bSmrg jz L(b2) 104*f81b1c5bSmrg 105*f81b1c5bSmrgL(b3): mov (vp,n,8), %r12 106*f81b1c5bSmrg mov 8(vp,n,8), %r13 107*f81b1c5bSmrg mov 16(vp,n,8), %r14 108*f81b1c5bSmrg and cnd, %r12 109*f81b1c5bSmrg mov (up,n,8), %r10 110*f81b1c5bSmrg and cnd, %r13 111*f81b1c5bSmrg mov 8(up,n,8), %rbx 112*f81b1c5bSmrg and cnd, %r14 113*f81b1c5bSmrg mov 16(up,n,8), %rbp 114*f81b1c5bSmrg ADDSUB %r12, %r10 115*f81b1c5bSmrg mov %r10, (rp,n,8) 116*f81b1c5bSmrg ADCSBB 
%r13, %rbx 117*f81b1c5bSmrg mov %rbx, 8(rp,n,8) 118*f81b1c5bSmrg ADCSBB %r14, %rbp 119*f81b1c5bSmrg mov %rbp, 16(rp,n,8) 120*f81b1c5bSmrg sbb R32(%rax), R32(%rax) C save carry 121*f81b1c5bSmrg add $3, n 122*f81b1c5bSmrg js L(top) 123*f81b1c5bSmrg jmp L(end) 124*f81b1c5bSmrg 125*f81b1c5bSmrgL(b2): mov (vp,n,8), %r12 126*f81b1c5bSmrg mov 8(vp,n,8), %r13 127*f81b1c5bSmrg mov (up,n,8), %r10 128*f81b1c5bSmrg and cnd, %r12 129*f81b1c5bSmrg mov 8(up,n,8), %rbx 130*f81b1c5bSmrg and cnd, %r13 131*f81b1c5bSmrg ADDSUB %r12, %r10 132*f81b1c5bSmrg mov %r10, (rp,n,8) 133*f81b1c5bSmrg ADCSBB %r13, %rbx 134*f81b1c5bSmrg mov %rbx, 8(rp,n,8) 135*f81b1c5bSmrg sbb R32(%rax), R32(%rax) C save carry 136*f81b1c5bSmrg add $2, n 137*f81b1c5bSmrg js L(top) 138*f81b1c5bSmrg jmp L(end) 139*f81b1c5bSmrg 140*f81b1c5bSmrgL(b1): mov (vp,n,8), %r12 141*f81b1c5bSmrg mov (up,n,8), %r10 142*f81b1c5bSmrg and cnd, %r12 143*f81b1c5bSmrg ADDSUB %r12, %r10 144*f81b1c5bSmrg mov %r10, (rp,n,8) 145*f81b1c5bSmrg sbb R32(%rax), R32(%rax) C save carry 146*f81b1c5bSmrg add $1, n 147*f81b1c5bSmrg jns L(end) 148*f81b1c5bSmrg 149*f81b1c5bSmrg ALIGN(16) 150*f81b1c5bSmrgL(top): mov (vp,n,8), %r12 151*f81b1c5bSmrg mov 8(vp,n,8), %r13 152*f81b1c5bSmrg mov 16(vp,n,8), %r14 153*f81b1c5bSmrg mov 24(vp,n,8), %r11 154*f81b1c5bSmrg and cnd, %r12 155*f81b1c5bSmrg mov (up,n,8), %r10 156*f81b1c5bSmrg and cnd, %r13 157*f81b1c5bSmrg mov 8(up,n,8), %rbx 158*f81b1c5bSmrg and cnd, %r14 159*f81b1c5bSmrg mov 16(up,n,8), %rbp 160*f81b1c5bSmrg and cnd, %r11 161*f81b1c5bSmrg mov 24(up,n,8), %r9 162*f81b1c5bSmrg add R32(%rax), R32(%rax) C restore carry 163*f81b1c5bSmrg ADCSBB %r12, %r10 164*f81b1c5bSmrg mov %r10, (rp,n,8) 165*f81b1c5bSmrg ADCSBB %r13, %rbx 166*f81b1c5bSmrg mov %rbx, 8(rp,n,8) 167*f81b1c5bSmrg ADCSBB %r14, %rbp 168*f81b1c5bSmrg mov %rbp, 16(rp,n,8) 169*f81b1c5bSmrg ADCSBB %r11, %r9 170*f81b1c5bSmrg mov %r9, 24(rp,n,8) 171*f81b1c5bSmrg sbb R32(%rax), R32(%rax) C save carry 172*f81b1c5bSmrg add $4, n 173*f81b1c5bSmrg js 
L(top) 174*f81b1c5bSmrg 175*f81b1c5bSmrgL(end): neg R32(%rax) 176*f81b1c5bSmrg pop %r14 177*f81b1c5bSmrg pop %r13 178*f81b1c5bSmrg pop %r12 179*f81b1c5bSmrg pop %rbp 180*f81b1c5bSmrg pop %rbx 181*f81b1c5bSmrg FUNC_EXIT() 182*f81b1c5bSmrg ret 183*f81b1c5bSmrgEPILOGUE() 184