1dnl AMD64 mpn_add_n, mpn_sub_n 2 3dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C AMD K8,K9 1.5 35C AMD K10 1.5 36C AMD bd1 1.8 37C AMD bobcat 2.5 38C Intel P4 39C Intel core2 4.9 40C Intel NHM 5.5 41C Intel SBR 1.61 42C Intel IBR 1.61 43C Intel atom 4 44C VIA nano 3.25 45 46C The loop of this code is the result of running a code generation and 47C optimization tool suite written by David Harvey and Torbjorn Granlund. 
C INPUT PARAMETERS
C Each define binds a symbolic operand name to the register the code uses.
C The trailing comment on each line is kept from the original; it presumably
C names where the same argument arrives under the DOS64 ABI -- TODO confirm
C against FUNC_ENTRY in the shared m4 definitions.
define(`rp',    `%rdi') C rcx
define(`up',    `%rsi') C rdx
define(`vp',    `%rdx') C r8
define(`n',     `%rcx') C r9
define(`cy',    `%r8')  C rsp+40 (mpn_add_nc and mpn_sub_nc)

C Operation selection.  Depending on which OPERATION_ symbol is defined,
C ADCSBB becomes adc or sbb, and func / func_nc become the add or the sub
C symbol names.  Everything below is written once in terms of these names.
ifdef(`OPERATION_add_n', `
        define(ADCSBB, adc)
        define(func, mpn_add_n)
        define(func_nc, mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
        define(ADCSBB, sbb)
        define(func, mpn_sub_n)
        define(func_nc, mpn_sub_nc)')

C The macros used from here on (MULFUNC_PROLOGUE, ABI_SUPPORT, ASM_START,
C TEXT, ALIGN, PROLOGUE, EPILOGUE, FUNC_ENTRY, FUNC_EXIT, IFDOS, L, R32, R8)
C are not defined in this file.  In the comments below R32(x) and R8(x) are
C read as the 32-bit and 8-bit names of register x -- TODO confirm.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
C ----------------------------------------------------------------------
C func_nc  (mpn_add_nc or mpn_sub_nc)
C
C Same operation as func below, but the carry/borrow going into the least
C significant limb is taken from bit 0 of cy instead of being zero.
C
C This entry point contains no ret.  After setting up rax, n and the carry
C flag it jumps into the body of func, at L(lt4) or L(mid).
C ----------------------------------------------------------------------
PROLOGUE(func_nc)
        FUNC_ENTRY(4)
C Under DOS64 the carry argument is fetched from the stack into r8.  The
C parameter comment above says rsp+40 while the offset used here is 56;
C presumably FUNC_ENTRY has lowered rsp by 16 -- TODO confirm.
IFDOS(` mov     56(%rsp), %r8   ')
        mov     R32(n), R32(%rax)       C rax = low 32 bits of n
        shr     $2, n                   C n = number of complete 4-limb groups
        and     $3, R32(%rax)           C rax = original n mod 4 (leftover limbs)
        bt      $0, %r8                 C cy flag <- carry parameter
        jrcxz   L(lt4)                  C no complete group; jrcxz tests rcx only, CF is kept

        mov     (up), %r8               C load the first two limbs of the first group
        mov     8(up), %r9
        dec     n                       C sets ZF for the jnz at L(mid); dec leaves CF alone
        jmp     L(mid)

EPILOGUE()
        ALIGN(16)
C ----------------------------------------------------------------------
C func  (mpn_add_n or mpn_sub_n)
C
C For i = 0 .. n-1 computes  rp[i] = up[i] ADCSBB vp[i]  on 8-byte limbs,
C with the carry flag chained from each limb to the next, and returns the
C final carry or borrow (0 or 1) in eax.
C
C In:     rp = destination, up = first source, vp = second source,
C         n  = limb count.
C Out:    eax = carry out of the most significant limb.
C Writes: rax, n, rp, up, vp (the pointers are advanced), r8-r11, flags.
C
C NOTE(review): n = 0 is not handled.  With n = 0 the jrcxz below is taken
C with rax = 0, both dec/jnz tests in the tail see a nonzero value, and the
C three-limb path at L(3) runs.  Callers presumably guarantee n >= 1.
C
C Invariant for everything after the entry sequence: the carry flag holds
C the running carry.  Only ADCSBB, adc and setc consume or produce it; the
C other instructions in between (mov, lea, dec, inc, jumps) do not change CF.
C ----------------------------------------------------------------------
PROLOGUE(func)
        FUNC_ENTRY(4)
        mov     R32(n), R32(%rax)       C rax = low 32 bits of n
        shr     $2, n                   C n = number of complete 4-limb groups
        and     $3, R32(%rax)           C rax = original n mod 4; and also clears CF (carry-in 0)
        jrcxz   L(lt4)                  C no complete group: go straight to the tail

        mov     (up), %r8               C load the first two limbs of the first group
        mov     8(up), %r9
        dec     n                       C sets ZF for the jnz at L(mid); dec leaves CF alone
        jmp     L(mid)

C Tail: handles the last 1, 2 or 3 limbs, with the count in rax.  Reached
C directly when there is no complete group, or from the end of the main
C path with rp, up and vp already advanced past the processed groups.
L(lt4): dec     R32(%rax)               C ZF set iff exactly one limb remains
        mov     (up), %r8
        jnz     L(2)
        ADCSBB  (vp), %r8
        mov     %r8, (rp)
        adc     R32(%rax), R32(%rax)    C rax is 0 here, so rax = 0 + 0 + CF = carry out
        FUNC_EXIT()
        ret

L(2):   dec     R32(%rax)               C ZF set iff exactly two limbs remain
        mov     8(up), %r9
        jnz     L(3)
        ADCSBB  (vp), %r8
        ADCSBB  8(vp), %r9
        mov     %r8, (rp)
        mov     %r9, 8(rp)
        adc     R32(%rax), R32(%rax)    C rax is 0 here, so rax = carry out
        FUNC_EXIT()
        ret

C Three limbs remain.  rax is nonzero at this point (1 when the count was
C 3), so the adc trick used above would not give a clean 0/1; setc
C overwrites the low byte of rax with the carry instead.
L(3):   mov     16(up), %r10
        ADCSBB  (vp), %r8
        ADCSBB  8(vp), %r9
        ADCSBB  16(vp), %r10
        mov     %r8, (rp)
        mov     %r9, 8(rp)
        mov     %r10, 16(rp)
        setc    R8(%rax)
        FUNC_EXIT()
        ret

C Main loop: four limbs per iteration.  On entry to L(top) the four up
C limbs of the current group are already in r8-r11.  The iteration
C combines them with vp, stores the results, advances all three pointers
C by 32 bytes, and loads the up limbs of the next group before branching.
C The loop is entered at L(mid), with r8 and r9 already loaded by the
C entry sequence.
        ALIGN(16)
L(top): ADCSBB  (vp), %r8
        ADCSBB  8(vp), %r9
        ADCSBB  16(vp), %r10
        ADCSBB  24(vp), %r11
        mov     %r8, (rp)
        lea     32(up), up              C lea is used for pointer updates because it leaves flags alone
        mov     %r9, 8(rp)
        mov     %r10, 16(rp)
        dec     n                       C group counter; ZF from here is tested by the jnz at L(mid)
        mov     %r11, 24(rp)
        lea     32(vp), vp
        mov     (up), %r8               C first two limbs of the next group
        mov     8(up), %r9
        lea     32(rp), rp
L(mid): mov     16(up), %r10            C last two limbs of the group
        mov     24(up), %r11
        jnz     L(top)                  C ZF comes from the most recent dec n

C Final complete group, reached by falling out of the loop: its up limbs
C are in r8-r11.  Same work as one loop iteration, without loading ahead.
L(end): lea     32(up), up
        ADCSBB  (vp), %r8
        ADCSBB  8(vp), %r9
        ADCSBB  16(vp), %r10
        ADCSBB  24(vp), %r11
        lea     32(vp), vp
        mov     %r8, (rp)
        mov     %r9, 8(rp)
        mov     %r10, 16(rp)
        mov     %r11, 24(rp)
        lea     32(rp), rp

C Test rax (leftover limb count) for zero without touching CF: inc then
C dec leaves rax unchanged, sets ZF from its value, and neither writes CF.
        inc     R32(%rax)
        dec     R32(%rax)
        jnz     L(lt4)                  C 1 to 3 limbs left: finish in the tail
        adc     R32(%rax), R32(%rax)    C rax is 0 here, so rax = carry out
        FUNC_EXIT()
        ret
EPILOGUE()