dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.

dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C K7: 17.0 cycles/limb.


C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C The code here is the same as mpn_divrem_1, but with the quotient
C discarded.  See mpn/x86/k7/mmx/divrem_1.asm for some comments.
C (NOTE(review): the reference previously said "divrem_1.c"; the sibling
C divrem routine in this tree is assembly, so .asm looks intended --
C confirm against the source tree.)


dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
dnl used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl The inverse takes about 50 cycles to calculate, but after that the
dnl multiply is 17 c/l versus division at 41 c/l.
dnl
dnl Using mul or div is about the same speed at 3 limbs, so the threshold
dnl is set to 4 to get the smaller div code used at 3.

deflit(MUL_THRESHOLD, 4)


dnl Incoming stack parameters.  PARAM_INVERSE and PARAM_CARRY share the
dnl same slot: the 4th argument is the pre-computed inverse for
dnl mpn_preinv_mod_1, but the initial carry for mpn_mod_1c.
defframe(PARAM_INVERSE,16)   dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,  16)   dnl mpn_mod_1c
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,    8)
defframe(PARAM_SRC,     4)

dnl Callee-saved register save slots in the local frame.
defframe(SAVE_EBX,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)

dnl Locals: normalization shift count l, the limb inverse, and a source
dnl stop pointer.
dnl NOTE(review): VAR_SRC_STOP is reserved but never referenced in this
dnl file (the loop compares against PARAM_SRC instead) -- presumably kept
dnl to mirror divrem_1's frame layout; confirm before removing.
defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC_STOP,-28)

deflit(STACK_SPACE, 28)

	TEXT

C -----------------------------------------------------------------------------
C mpn_preinv_mod_1: remainder with a caller-supplied inverse.
C
C Sets VAR_NORM to 0 ("l==0"), i.e. no normalization shift is applied, so
C this entry point assumes the divisor is already normalized (presumably
C high bit set -- TODO confirm against the preinv callers).  The first
C iteration's division is replaced by a simple high-limb minus divisor
C with a cmov to undo any underflow, then execution joins the common
C multiply-by-inverse code.

	ALIGN(32)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)
	movl	PARAM_SRC, %ecx
	movl	PARAM_SIZE, %eax
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_INVERSE, %edx

	movl	%esi, SAVE_ESI
	movl	-4(%ecx,%eax,4), %edi	C src high limb
	leal	-16(%ecx,%eax,4), %ecx	C &src[size-4]

	movl	%ebx, SAVE_EBX
	movl	PARAM_INVERSE, %edx	C NOTE(review): %edx already holds
					C PARAM_INVERSE from above -- this
					C reload looks redundant; confirm
					C against divrem_1 before touching.

	movl	$0, VAR_NORM		C l==0 (divisor assumed normalized)

	movl	%edi, %esi
	subl	%ebp, %edi		C high-divisor

	cmovc(	%esi, %edi)		C restore if underflow
	decl	%eax
	jz	L(done_edi)		C size==1, high-divisor only

	movl	8(%ecx), %esi		C src second high limb
	movl	%edx, VAR_INVERSE

	movl	$32, %ebx		C 32-l
	decl	%eax
	jz	L(inverse_one_left)	C size==2, one divide

	movd	%ebx, %mm7		C 32-l
	decl	%eax
	jz	L(inverse_two_left)	C size==3, two divides

	jmp	L(inverse_top)		C size>=4


C size==1 exit: the remainder is just high-divisor (or high itself if the
C subtract underflowed), already in %edi.  %ebx was saved but never
C modified on this path, so only esi/ebp/edi are restored.
L(done_edi):
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	%edi, %eax

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()


C -----------------------------------------------------------------------------
C mpn_mod_1c: same as mpn_mod_1 but with an explicit initial carry (high
C partial remainder) supplied by the caller in PARAM_CARRY.  Just loads
C the carry and joins the common code at L(start_1c).

	ALIGN(32)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	jmp	L(start_1c)

EPILOGUE()


C -----------------------------------------------------------------------------
C mpn_mod_1: plain entry, initial carry 0.  If the high source limb is
C already less than the divisor it can serve directly as the initial
C carry, skipping one division; the cmovc/sbbl pair below does that
C branch-free (CF set by cmpl when high<divisor feeds both).

	ALIGN(32)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	orl	%ecx, %ecx		C size==0 -> remainder 0
	jz	L(divide_done)

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C carry flag if high<divisor

	cmovc(	%eax, %edx)		C src high limb as initial carry
	sbbl	$0, %ecx		C size-1 to skip one div
	jz	L(divide_done)


	ALIGN(16)
L(start_1c):
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	cmpl	$MUL_THRESHOLD, %ecx
	jae	L(mul_by_inverse)



C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
C but it's already fast and compact, and there's nothing to gain by
C expanding it out.
C
C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.

	orl	%ecx, %ecx
	jz	L(divide_done)


C Plain division loop, high limb to low.  Each divl divides the running
C remainder (in %edx) paired with the next limb, leaving the new
C remainder in %edx.
L(divide_top):
	C eax	scratch (quotient)
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	scratch (remainder)
	C esi	src
	C edi
	C ebp

	movl	-4(%esi,%ecx,4), %eax

	divl	PARAM_DIVISOR

	decl	%ecx
	jnz	L(divide_top)


L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	movl	%edx, %eax		C return remainder

	ret


C -----------------------------------------------------------------------------
C Multiply-by-inverse method.  Normalize the divisor (shift count l from
C bsrl), compute the inverse m = floor((b*(b-d)-1)/d) with one divl
C (b = 2^32, d = normalized divisor), and shift the top two source limbs
C into the n2/n10 partial-remainder registers.

L(mul_by_inverse):
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l (bit number of divisor high bit)

	movl	%ebx, SAVE_EBX
	movl	%ecx, %ebx		C size

	movl	%edi, SAVE_EDI
	movl	$31, %ecx

	movl	%edx, %edi		C carry
	movl	$-1, %edx

	C

	xorl	%eax, %ecx		C l = 31 - (31-l)
	incl	%eax			C 32-l

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C 32-l, rshift count kept in mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

	C

	movl	%eax, VAR_INVERSE
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	movl	8(%eax), %esi		C src high limb
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	movl	%eax, %ecx		C &src[size-3]


ifelse(MUL_THRESHOLD,2,`
	cmpl	$2, %ebx
	je	L(inverse_two_left)
')


C The dependent chain here is the same as in mpn_divrem_1, but a few
C instructions are saved by not needing to store the quotient limbs.
C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
C
C There's four dummy instructions in the loop, all of which are necessary
C for the claimed 17 c/l.  It's a 1 to 3 cycle slowdown if any are removed,
C or changed from load to store or vice versa.  They're not completely
C random, since they correspond to what mpn_divrem_1 has, but there's no
C obvious reason why they're necessary.  Presumably they induce something
C good in the out of order execution, perhaps through some load/store
C ordering and/or decoding effects.
C
C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1.  On
C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
C about 13.5 c/l.

	ALIGN(32)
L(inverse_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	src pointer, decrementing
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2
	movl	PARAM_SIZE, %ebx	C dummy

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movq	(%ecx), %mm0		C next src limb and the one below it
	subl	$4, %ecx

	movl	%ecx, PARAM_SIZE	C dummy (PARAM_SIZE slot reused as
					C scratch; size itself lives in the
					C loop-termination compare below)

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)		C q1+1 wrapped to 0: q1==0xFFFFFFFF special case
	nop				C dummy

	mull	%ebx			C (q1+1)*d

	psrlq	%mm7, %mm0		C normalize next src limb pair
	leal	(%ecx), %ecx		C dummy

	C

	C

	subl	%eax, %esi		C low n - (q1+1)*d
	movl	PARAM_SRC, %eax

	C

	sbbl	%edx, %edi		C high n - (q1+1)*d, 0 or -1
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi		C next n10

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	cmpl	%eax, %ecx		C loop while ptr >= src
	jae	L(inverse_top)


L(inverse_loop_done):


C -----------------------------------------------------------------------------
C Two limbs left: same step as the loop but reading only the low src
C limb (dword) and with the explicit sbbl $0 correction for q1+1
C overflow instead of the L(q1_ff) branch.

L(inverse_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	&src[-1]
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src dword)
	C mm7	rshift

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movd	4(%ecx), %mm0		C src low limb

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C undo +1 if q1+1 wrapped (q1==0xFFFFFFFF)

	mull	%ebx			C (q1+1)*d

	psllq	$32, %mm0		C low limb to high half of qword

	psrlq	%mm7, %mm0		C then normalize: limb << l

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi		C next n10 (shifted final src limb)

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1


C One limb left: final step, then denormalize the remainder (shift right
C by l) and restore callee-saved registers.  Falls through from the
C two-limb case above.
L(inverse_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	src limb, shifted
	C mm7	rshift

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movl	VAR_NORM, %ecx		C for final denorm

	C

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C undo +1 if q1+1 wrapped (q1==0xFFFFFFFF)

	mull	%ebx			C (q1+1)*d

	movl	SAVE_EBX, %ebx

	C

	C

	subl	%eax, %esi

	movl	%esi, %eax		C remainder
	movl	SAVE_ESI, %esi

	sbbl	%edx, %edi		C n - (q1+1)*d
	leal	(%ebp,%eax), %edx
	movl	SAVE_EBP, %ebp

	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1
	movl	SAVE_EDI, %edi

	shrl	%cl, %eax		C denorm remainder
	addl	$STACK_SPACE, %esp
	emms				C leave MMX state clean for the caller

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx	src pointer
	C edx
	C esi	n10
	C edi	(n2)
	C ebp	divisor

	movl	PARAM_SRC, %edx
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
	psrlq	%mm7, %mm0		C normalize next src limb pair
					C (mm0 was loaded in the main loop
					C before branching here)

	movd	%mm0, %esi		C next n10

	cmpl	%edx, %ecx		C resume loop while ptr >= src
	jae	L(inverse_top)
	jmp	L(inverse_loop_done)

EPILOGUE()