dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C                           cycles/limb
C P5:
C P6 model 0-8,10-12             6.44
C P6 model 9  (Banias)
C P6 model 13 (Dothan)           6.11
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C K6:
C K7:
C K8:


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.
46 47deflit(UNROLL_COUNT, 16) 48 49define(`OPERATION_addmul_1',1) 50 51ifdef(`OPERATION_addmul_1', ` 52 define(M4_inst, addl) 53 define(M4_function_1, mpn_addmul_1) 54 define(M4_function_1c, mpn_addmul_1c) 55 define(M4_description, add it to) 56 define(M4_desc_retval, carry) 57',`ifdef(`OPERATION_submul_1', ` 58 define(M4_inst, subl) 59 define(M4_function_1, mpn_submul_1) 60 define(M4_function_1c, mpn_submul_1c) 61 define(M4_description, subtract it from) 62 define(M4_desc_retval, borrow) 63',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 64')')') 65 66MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c) 67 68 69C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 70C mp_limb_t mult); 71C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, 72C mp_limb_t mult, mp_limb_t carry); 73C 74C Calculate src,size multiplied by mult and M4_description dst,size. 75C Return the M4_desc_retval limb from the top of the result. 76C 77C This code is pretty much the same as the K6 code. The unrolled loop is 78C the same, but there's just a few scheduling tweaks in the setups and the 79C simple loop. 80C 81C A number of variations have been tried for the unrolled loop, with one or 82C two carries, and with loads scheduled earlier, but nothing faster than 6 83C cycles/limb has been found. 
84 85ifdef(`PIC',` 86deflit(UNROLL_THRESHOLD, 5) 87',` 88deflit(UNROLL_THRESHOLD, 5) 89') 90 91defframe(PARAM_CARRY, 20) 92defframe(PARAM_MULTIPLIER,16) 93defframe(PARAM_SIZE, 12) 94defframe(PARAM_SRC, 8) 95defframe(PARAM_DST, 4) 96 97 TEXT 98 ALIGN(32) 99 100PROLOGUE(M4_function_1c) 101 pushl %ebx 102deflit(`FRAME',4) 103 movl PARAM_CARRY, %ebx 104 jmp L(start_nc) 105EPILOGUE() 106 107PROLOGUE(M4_function_1) 108 push %ebx 109deflit(`FRAME',4) 110 xorl %ebx, %ebx C initial carry 111 112L(start_nc): 113 movl PARAM_SIZE, %ecx 114 pushl %esi 115deflit(`FRAME',8) 116 117 movl PARAM_SRC, %esi 118 pushl %edi 119deflit(`FRAME',12) 120 121 movl PARAM_DST, %edi 122 pushl %ebp 123deflit(`FRAME',16) 124 cmpl $UNROLL_THRESHOLD, %ecx 125 126 movl PARAM_MULTIPLIER, %ebp 127 jae L(unroll) 128 129 130 C simple loop 131 C this is offset 0x22, so close enough to aligned 132L(simple): 133 C eax scratch 134 C ebx carry 135 C ecx counter 136 C edx scratch 137 C esi src 138 C edi dst 139 C ebp multiplier 140 141 movl (%esi), %eax 142 addl $4, %edi 143 144 mull %ebp 145 146 addl %ebx, %eax 147 adcl $0, %edx 148 149 M4_inst %eax, -4(%edi) 150 movl %edx, %ebx 151 152 adcl $0, %ebx 153 decl %ecx 154 155 leal 4(%esi), %esi 156 jnz L(simple) 157 158 159 popl %ebp 160 popl %edi 161 162 popl %esi 163 movl %ebx, %eax 164 165 popl %ebx 166 ret 167 168 169 170C------------------------------------------------------------------------------ 171C VAR_JUMP holds the computed jump temporarily because there's not enough 172C registers when doing the mul for the initial two carry limbs. 173C 174C The add/adc for the initial carry in %ebx is necessary only for the 175C mpn_add/submul_1c entry points. Duplicating the startup code to 176C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good 177C idea. 
178 179dnl overlapping with parameters already fetched 180define(VAR_COUNTER,`PARAM_SIZE') 181define(VAR_JUMP, `PARAM_DST') 182 183 C this is offset 0x43, so close enough to aligned 184L(unroll): 185 C eax 186 C ebx initial carry 187 C ecx size 188 C edx 189 C esi src 190 C edi dst 191 C ebp 192 193 movl %ecx, %edx 194 decl %ecx 195 196 subl $2, %edx 197 negl %ecx 198 199 shrl $UNROLL_LOG2, %edx 200 andl $UNROLL_MASK, %ecx 201 202 movl %edx, VAR_COUNTER 203 movl %ecx, %edx 204 205 C 15 code bytes per limb 206ifdef(`PIC',` 207 call L(pic_calc) 208L(here): 209',` 210 shll $4, %edx 211 negl %ecx 212 213 leal L(entry) (%edx,%ecx,1), %edx 214') 215 movl (%esi), %eax C src low limb 216 217 movl %edx, VAR_JUMP 218 leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi 219 220 mull %ebp 221 222 addl %ebx, %eax C initial carry (from _1c) 223 adcl $0, %edx 224 225 movl %edx, %ebx C high carry 226 leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi 227 228 movl VAR_JUMP, %edx 229 testl $1, %ecx 230 movl %eax, %ecx C low carry 231 232 cmovnz( %ebx, %ecx) C high,low carry other way around 233 cmovnz( %eax, %ebx) 234 235 jmp *%edx 236 237 238ifdef(`PIC',` 239L(pic_calc): 240 shll $4, %edx 241 negl %ecx 242 243 C See mpn/x86/README about old gas bugs 244 leal (%edx,%ecx,1), %edx 245 addl $L(entry)-L(here), %edx 246 247 addl (%esp), %edx 248 249 ret_internal 250') 251 252 253C ----------------------------------------------------------- 254 ALIGN(32) 255L(top): 256deflit(`FRAME',16) 257 C eax scratch 258 C ebx carry hi 259 C ecx carry lo 260 C edx scratch 261 C esi src 262 C edi dst 263 C ebp multiplier 264 C 265 C VAR_COUNTER loop counter 266 C 267 C 15 code bytes per limb 268 269 addl $UNROLL_BYTES, %edi 270 271L(entry): 272deflit(CHUNK_COUNT,2) 273forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` 274 deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) 275 deflit(`disp1', eval(disp0 + 4)) 276 277Zdisp( movl, disp0,(%esi), %eax) 278 mull %ebp 279Zdisp( 
M4_inst,%ecx, disp0,(%edi)) 280 adcl %eax, %ebx 281 movl %edx, %ecx 282 adcl $0, %ecx 283 284 movl disp1(%esi), %eax 285 mull %ebp 286 M4_inst %ebx, disp1(%edi) 287 adcl %eax, %ecx 288 movl %edx, %ebx 289 adcl $0, %ebx 290') 291 292 decl VAR_COUNTER 293 leal UNROLL_BYTES(%esi), %esi 294 295 jns L(top) 296 297 298deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) 299 300 M4_inst %ecx, disp0(%edi) 301 movl %ebx, %eax 302 303 popl %ebp 304 popl %edi 305 306 popl %esi 307 popl %ebx 308 adcl $0, %eax 309 310 ret 311 312EPILOGUE() 313