dnl  X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 3.24
C Intel IBR	 3.04
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single out the basecases before the pushes.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')	C I(a,b) expands to a, the form without index register i

C Incoming arguments.  NOTE(review): the trailing comments look like the
C DOS64 locations of the same arguments; confirm against FUNC_ENTRY.
define(`rp', `%rdi')	C rcx
define(`up', `%rsi')	C rdx
define(`mp_param', `%rdx')	C r8
define(`n', `%rcx')	C r9
define(`u0inv', `%r8')	C stack

C Working registers.  All four are pushed on entry and popped at L(ret).
define(`i', `%r14')	C inner loop index, negative, stepped by 4
define(`j', `%r15')	C outer loop counter, starts at n
define(`mp', `%r12')	C mp_param advanced by n+1 limbs
define(`q0', `%r13')	C multiplier limb for the current pass

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C mpn_redc_1 (rp, up, mp_param, n, u0inv)
C
C Overview of what the code below does:
C   * q0 is set to up[0] * u0inv (low 64 bits).
C   * n passes are made.  Each pass adds q0 times the n limbs of the modulus
C     into n limbs of up, stores the carry-out limb of that pass in the slot
C     of the lowest limb it consumed, computes the next q0 from the new low
C     limb times u0inv, and advances up by one limb.
C   * L(cj) then calls mpn_add_n with destination rp and, as sources, the n
C     limbs above the processed area and the n stored carry limbs.
C   * n = 1 and n = 2 are handled by the straight-line code at L(n1) and
C     L(n2), which write rp directly and leave a carry bit in %rax.  n = 3
C     uses the short loop at L(n3) and then joins L(cj).
C
C The general case exists in four copies, L(otp0) to L(otp3), picked by
C n mod 4.  They share the same 4-way unrolled inner loop body and differ in
C how many limbs are handled before the loop and where the loop is entered.
C Within the loops %r10 and %rbp alternate as sum registers and %r9 and %r11
C alternate as carry limb registers; %rbx holds the next q limb.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
C On DOS64 the fifth argument is fetched from the stack into %r8.
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0		C q0 = up[0]
	mov	n, j			C outer loop induction var
	lea	8(mp_param,n,8), mp	C mp = mp_param + 8*n + 8
	lea	8(up,n,8), up		C up = up + 8*n + 8
	neg	n			C n is negative from here until L(cj)
	imul	u0inv, q0		C first iteration q0

C Dispatch on the two low bits of the negated n.
	test	$1, R8(n)
	jz	L(bx0)			C n even

L(bx1):	test	$2, R8(n)
	jz	L(b3)			C low bits 01, n mod 4 = 3

C Low bits 11, n mod 4 = 1.  n = 1 has its own code.
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Pass for n mod 4 = 1.  Three products are started before entering the
C inner loop at L(e1).  The first sum, in %r10, is never stored.
C NOTE(review): presumably that sum is zero by the choice of q0; confirm.
L(otp1):lea	1(n), i
	mov	-8(mp,n,8), %rax	C modulus limb 0
	mul	q0
	mov	-8(up,n,8), %r10	C limb 0 of the current up window
	mov	%rdx, %r11
	add	%rax, %r10
	mov	(mp,n,8), %rax		C modulus limb 1
	adc	$0, %r11
	mul	q0
	mov	%rdx, %r9
	mov	(up,n,8), %rbx
	add	%rax, %rbx
	adc	$0, %r9
	mov	(mp,i,8), %rax		C modulus limb 2
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbx
	mov	%rbx, -8(up,i,8)	C next low remainder limb
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

C Inner loop, four limbs per iteration.  i is negative and the loop runs
C until the add to i carries out.
	ALIGNx
L(tp1):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
L(e1):	mov	%rdx, %r11		C loop entry from L(otp1)
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp1)

C Wind-down: finish the last two limbs of the pass, save the carry limb in
C slot 0 of the window, switch to the next q limb and slide up by one limb.
L(ed1):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

C n mod 4 = 3.  n = 3 has its own loop.
L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Pass for n mod 4 = 3.  Same scheme as L(otp1), entering the loop at L(e3).
L(otp3):lea	3(n), i
	mov	-8(mp,n,8), %rax	C modulus limb 0
	mul	q0
	mov	-8(up,n,8), %r10	C limb 0 of the current up window
	mov	%rdx, %r11
	add	%rax, %r10
	mov	(mp,n,8), %rax		C modulus limb 1
	adc	$0, %r11
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r9
	add	%rax, %rbx
	adc	$0, %r9
	mov	8(mp,n,8), %rax		C modulus limb 2
	mul	q0
	mov	8(up,n,8), %r10
	add	%r11, %rbx
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbx, (up,n,8)		C next low remainder limb
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
L(e3):	add	%rax, %r10		C loop entry from L(otp3)
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp3)

C Wind-down, same as at L(ed1).
L(ed3):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common join after the last pass.  Arguments for mpn_add_n: destination rp,
C first source the start of the current window, second source n limbs below
C it (the slots holding the per-pass carry limbs), and the size n made
C positive again.
L(cj):
IFSTD(`	lea	-8(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

	CALL(	mpn_add_n)

C Shared exit.  None of the instructions visible here writes %rax.
C NOTE(review): FUNC_EXIT is defined elsewhere; presumably it keeps %rax too.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)			C low bits 10, n mod 4 = 2

C Low bits 00, n mod 4 = 0.  No small special case on this path.
C Pass for n mod 4 = 0, entering the loop at L(e0).
L(b0):
L(otp0):lea	(n), i
	mov	-8(mp,n,8), %rax	C modulus limb 0
	mul	q0
	mov	%rdx, %r9
	mov	-8(up,n,8), %rbp	C limb 0 of the current up window
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,n,8), %rax		C modulus limb 1
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax		C modulus limb 2
	adc	$0, %r11
	mul	q0
	mov	8(up,n,8), %rbp
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	%rbx, (up,n,8)		C next low remainder limb
	adc	$0, %r11
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
L(e0):	add	%rax, %rbp		C loop entry from L(otp0)
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp0)

C Wind-down, same as at L(ed1).
L(ed0):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

C n mod 4 = 2.  n = 2 has its own code.
L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Pass for n mod 4 = 2, entering the loop at L(e2).
L(otp2):lea	2(n), i
	mov	-8(mp,n,8), %rax	C modulus limb 0
	mul	q0
	mov	-8(up,n,8), %rbp	C limb 0 of the current up window
	mov	%rdx, %r9
	add	%rax, %rbp
	adc	$0, %r9
	mov	(mp,n,8), %rax		C modulus limb 1
	mul	q0
	mov	(up,n,8), %rbx
	mov	%rdx, %r11
	add	%rax, %rbx
	mov	8(mp,n,8), %rax		C modulus limb 2
	adc	$0, %r11
	mul	q0
	add	%r9, %rbx
	mov	%rdx, %r9
	mov	8(up,n,8), %rbp
	adc	$0, %r11
	mov	%rbx, (up,n,8)		C next low remainder limb
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	mul	q0
	mov	-16(up,i,8), %r10
	add	%r11, %rbp
	mov	%rdx, %r11
	adc	$0, %r9
	mov	%rbp, -24(up,i,8)
	add	%rax, %r10
	mov	-8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	add	%r9, %r10
	mov	%rdx, %r9
	mov	-8(up,i,8), %rbp
	adc	$0, %r11
	mov	%r10, -16(up,i,8)
L(e2):	add	%rax, %rbp		C loop entry from L(otp2)
	adc	$0, %r9
	mov	(mp,i,8), %rax
	mul	q0
	mov	(up,i,8), %r10
	add	%r11, %rbp
	mov	%rbp, -8(up,i,8)
	adc	$0, %r9
	mov	%rdx, %r11
	add	%rax, %r10
	mov	8(mp,i,8), %rax
	adc	$0, %r11
	mul	q0
	mov	8(up,i,8), %rbp
	add	%r9, %r10
	mov	%rdx, %r9
	mov	%r10, (up,i,8)
	adc	$0, %r11
	add	%rax, %rbp
	adc	$0, %r9
	mov	16(mp,i,8), %rax
	add	$4, i
	jnc	L(tp2)

C Wind-down, same as at L(ed1).
L(ed2):	mul	q0
	mov	I(-16(up),-16(up,i,8)), %r10
	add	%r11, %rbp
	adc	$0, %r9
	mov	%rbp, I(-24(up),-24(up,i,8))
	add	%rax, %r10
	adc	$0, %rdx
	add	%r9, %r10
	adc	$0, %rdx
	mov	%r10, I(-16(up),-16(up,i,8))
	mov	%rdx, -8(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1.  up was advanced by two limbs at entry, so -16(up) and -8(up) are
C the two input limbs.  The high limb of the sum goes to rp[0] and the carry
C out of it is left in %rax.  mpn_add_n is not called on this path.
L(n1):	mov	(mp_param), %rax	C modulus limb 0
	mul	q0
	add	-16(up), %rax		C low sum is not stored
	adc	-8(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)		C mov leaves the carry flag intact
	adc	R32(%rax), R32(%rax)	C rax = carry
	jmp	L(ret)

C n = 2.  up and mp were advanced by three limbs at entry, so -24(up) to
C (up) are the four input limbs and -24(mp), -16(mp) the two modulus limbs.
C Both passes are done inline, the two result limbs are written to rp and
C the final carry is left in %rax.  %r14 is used as a plain scratch register.
L(n2):	mov	(mp_param), %rax	C modulus limb 0
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp		C low sum is not stored
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax		C modulus limb 1
	mov	-16(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11		C r11 = carry limb of the first pass
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-24(mp), %rax		C modulus limb 0 again
	mul	q0
	add	%rax, %r10		C low sum is not stored
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax		C modulus limb 1 again
	mov	-8(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)	C rax = 0 before the final carry chain
	add	%r11, %r14		C add in the carry limb of the first pass
	adc	(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry
	jmp	L(ret)

C n = 3.  One pass per iteration, j counts the passes.  up and mp were
C advanced by four limbs at entry, so -32, -24 and -16 address limbs 0, 1
C and 2.  As in the general code the carry limb of each pass is stored in
C slot 0 of the window, and L(cj) finishes with mpn_add_n.
	ALIGNx
L(n3):	mov	-32(mp), %rax		C modulus limb 0
	mov	-32(up), %r10
	mul	q0
	add	%rax, %r10		C low sum is not stored
	mov	-24(mp), %rax		C modulus limb 1
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-24(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-16(mp), %rax		C modulus limb 2
	add	%r11, %rbp
	mov	-16(up), %r10
	adc	$0, %r9
	mul	q0			C last use of the current q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -24(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -16(up)
	mov	%r11, -32(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)
	jmp	L(cj)
EPILOGUE()
ASM_END()