dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                             cycles/limb
C P5
C P6 model 0-8,10-12              6.44
C P6 model 9  (Banias)            6.15
C P6 model 13 (Dothan)            6.11
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6
C AMD K7
C AMD K8


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.
dnl  Number of limbs handled by one pass of the unrolled loop.
deflit(UNROLL_COUNT, 16)


dnl  Select the entry point names, the instruction applied to dst, and the
dnl  wording used in the description below, according to the operation
dnl  being built.  Exactly one of the two OPERATION symbols is required.
ifdef(`OPERATION_addmul_1', `
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_inst,        addl)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_inst,        subl)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                          mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t mult, mp_limb_t carry);
C
C Multiply src,size by the single limb mult and M4_description dst,size.
C The return value is the M4_desc_retval limb out of the top of the result.
C
C The code follows the K6 version closely.  The unrolled loop is identical,
C and only the setup code and the simple loop carry a few scheduling tweaks.
C
C Several variants of the unrolled loop have been tried, using one or two
C carries and with the loads scheduled earlier, but none of them ran faster
C than 6 cycles/limb.
dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled loop, anything
dnl  smaller takes the simple loop.  Both branches currently select the same
dnl  value; presumably the split is kept so that PIC and non-PIC builds can
dnl  be tuned separately.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack parameters.  defframe and FRAME come from the shared m4
dnl  definitions, which are not visible in this file; presumably each PARAM
dnl  name expands to an operand relative to esp that is adjusted by the
dnl  current FRAME value, which is why FRAME is updated after every push.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(32)

C Entry point taking an explicit initial carry limb.  It saves ebx, loads
C the carry into it and joins the common code at L(start_nc).
PROLOGUE(M4_function_1c)
	pushl	%ebx
deflit(`FRAME',4)
	movl	PARAM_CARRY, %ebx
	jmp	L(start_nc)
EPILOGUE()

C Plain entry point, identical to the above except that the initial carry
C limb is zero.
C
C Returns the final carry or borrow limb in eax.  ebx, esi, edi and ebp are
C saved on entry and restored before returning.  eax, ecx, edx and the
C flags are overwritten.  The PARAM_SIZE and PARAM_DST stack slots are
C reused as scratch by the unrolled path.
PROLOGUE(M4_function_1)
	pushl	%ebx
deflit(`FRAME',4)
	xorl	%ebx, %ebx	C initial carry

L(start_nc):
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD


	C simple loop
	C this is offset 0x22, so close enough to aligned
L(simple):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C One limb per iteration.  The counter is decremented before it is
	C tested, so this loop relies on size being at least 1 on entry.

	movl	(%esi), %eax		C next src limb
	addl	$4, %edi		C advance dst, limb is at -4 below

	mull	%ebp			C edx:eax = src limb * multiplier

	addl	%ebx, %eax		C add incoming carry limb to low half
	adcl	$0, %edx		C and propagate into the high half

	M4_inst	%eax, -4(%edi)		C apply low half to the dst limb
	movl	%edx, %ebx		C high half is the next carry limb

	adcl	$0, %ebx		C plus carry/borrow out of M4_inst
	decl	%ecx

	leal	4(%esi), %esi		C advance src, leal keeps decl flags
	jnz	L(simple)


	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax		C return the final carry limb

	popl	%ebx
	ret



C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there's not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.
C
C Setup summary, writing n for size and k for (-(n-1)) & UNROLL_MASK:
C
C   VAR_COUNTER = (n-2) >> UNROLL_LOG2.  The unrolled code is entered
C                 VAR_COUNTER+1 times in total, the first time part way in.
C   jump target = L(entry) + 15*k, which skips the code for the first k
C                 limbs of the first pass.
C   esi         = src + 4 - 4*k, edi = dst - 4*k, so the displacements of
C                 the first limb executed address src[1] and dst[0].
C
C src[0] is multiplied here in the setup.  The loop applies each product
C one step late, see the notes at L(top).

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	C this is offset 0x43, so close enough to aligned
L(unroll):
	C eax
	C ebx	initial carry
	C ecx	size
	C edx
	C esi	src
	C edi	dst
	C ebp

	movl	%ecx, %edx
	decl	%ecx			C ecx = n-1

	subl	$2, %edx		C edx = n-2
	negl	%ecx			C ecx = -(n-1)

	shrl	$UNROLL_LOG2, %edx	C edx = (n-2) >> UNROLL_LOG2
	andl	$UNROLL_MASK, %ecx	C ecx = k

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx		C edx = k, for the jump calculation

	C 15 code bytes per limb
	C
	C Both variants below leave edx = L(entry) + 15*k, formed as
	C 16*k - k, and ecx = -k.  The PIC variant cannot name L(entry) as
	C an absolute address, so it calls L(pic_calc), which adds the
	C distance from L(here) to L(entry) to the return address found at
	C the top of the stack, that return address being L(here).
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%esi), %eax		C src low limb

	C The extra 128 on the two pointer adjustments below applies only
	C when UNROLL_BYTES is 256 and is cancelled by the -128 in the loop
	C displacements.  Presumably this keeps those displacements within
	C a signed byte.
	movl	%edx, VAR_JUMP
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	C initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	C high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	C The low bit of ecx is the parity of k.  An odd k enters at the
	C second limb of a chunk, where ebx and ecx have swapped roles.
	movl	VAR_JUMP, %edx
	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	cmovnz(	%ebx, %ecx)	C high,low carry other way around
	cmovnz(	%eax, %ebx)

	jmp	*%edx


ifdef(`PIC',`
L(pic_calc):
	shll	$4, %edx
	negl	%ecx

	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx

	addl	(%esp), %edx

	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	carry hi
	C ecx	carry lo
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C VAR_COUNTER	loop counter
	C
	C 15 code bytes per limb
	C
	C Each chunk generated below handles CHUNK_COUNT limbs, and one pass
	C handles UNROLL_COUNT limbs.  Products are applied one step late.
	C For each limb the pending low limb is applied to dst with M4_inst,
	C the new low half is added with carry to the pending high limb, and
	C the new high half plus carry starts the next pending high limb.
	C The two carry registers therefore swap roles on every limb and
	C are back in the roles listed above after each chunk.
	C
	C Zdisp is defined outside this file.  Presumably it forces a
	C displacement byte even when disp0 is zero, so that every limb has
	C the same code size as the computed jump requires.
	C
	C No instruction may be added, removed or resized between L(entry)
	C and the end of the generated code without revisiting the 15 byte
	C figure used in the jump calculation.

	addl	$UNROLL_BYTES, %edi

L(entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi	C leal keeps the decl flags

	jns	L(top)			C repeat while the counter is >= 0


	C edi is advanced only at L(top), so after the last pass the final
	C pending low limb belongs one full pass beyond the current edi.
deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax		C pending high limb is the return value

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax		C movl and popl left the M4_inst carry intact

	ret

EPILOGUE()