dnl  X86-64 mpn_redc_1 optimised for Intel Atom.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 5.0
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
C  * Make lead-in code for the inner loops be more similar.

C OVERVIEW
C  Syntax is AT&T, preprocessed by m4.  Register arguments after FUNC_ENTRY are
C  rp, up, mp_param, n and u0inv as defined below.  The trailing register in
C  each define comment is the DOS64 location of that argument.
C
C  Each outer iteration takes a quotient limb q0 = up[0] * u0inv, adds
C  q0 * mp[0..n-1] into the current n-limb window of up, stores the carry-out
C  limb in the slot of the consumed low limb, and advances up by one limb.
C  This is presumably the usual one-limb-inverse REDC step, to be confirmed
C  against the generic implementation.
C
C  For n >= 5 the final result is formed by mpn_add_n(rp, up, up - n, n) and
C  the value it leaves in rax is returned unchanged.  For n = 1, 2, 3 the
C  final addition is done inline and rax is set to the carry out, 0 or 1.
C
C  The code is specialised on n mod 4 so that the 4-way unrolled inner loop
C  can be entered at the right point:
C    n mod 4 = 1: L(n1) for n = 1, otherwise L(otp1)
C    n mod 4 = 2: L(n2) for n = 2, otherwise L(otp2)
C    n mod 4 = 3: L(n3) for n = 3, otherwise L(otp3)
C    n mod 4 = 0: L(n4) for n = 4, otherwise L(otp0)

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')
define(`w0',          `%rbp')
define(`w1',          `%r9')
define(`w2',          `%r10')
define(`w3',          `%r11')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')	C fifth argument u0inv comes from the stack
	C Save the callee-saved registers used as i, j, mp, q0 and scratch.
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp	C mp = end of the modulus limbs
	lea	(up,n,8), up		C up = n limbs past the start
	neg	n			C n < 0, so (up,n,8) is the window start
	imul	u0inv, q0		C first iteration q0

	C Dispatch on the two low bits of -n.
	test	$1, R8(n)
	jz	L(bx0)

L(bx1):	test	$2, R8(n)
	jz	L(b3)

L(b1):	cmp	$-1, R32(n)
	jz	L(n1)

C Outer loop for n mod 4 = 1, n > 1.  The lead-in computes the first four
C partial products, stores limb 1 and derives the next q limb from it.
L(otp1):lea	1(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	add	(up,n,8), %rbp		C only the carry is kept, sum is overwritten
	mov	%rax, %rbp
	adc	%r9, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	32(mp,n,8), %rax
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e1)

	ALIGNx
C Inner loop, four limbs per pass, running while i is negative.
L(tp1):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
L(e1):	add	$4, i
	mov	%rdx, %r10
	js	L(tp1)

C Wind-down: last product, flush pending limbs, store the carry limb.
L(ed1):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

L(b3):	cmp	$-3, R32(n)
	jz	L(n3)

C Outer loop for n mod 4 = 3, n > 3.  Same scheme as L(otp1), entering the
C unrolled loop at L(e3).
L(otp3):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %rbp
	mov	8(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	add	(up,n,8), %rbp		C only the carry is kept, sum is overwritten
	mov	%rax, %rbp
	mov	24(mp,n,8), %rax
	adc	%r9, %rbx
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %r11
	mov	32(mp,n,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	jmp	L(e3)

	ALIGNx
L(tp3):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
L(e3):	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp3)

L(ed3):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp3)
C	jmp	L(cj)

C Common join: add the stored carry limbs to the high half via mpn_add_n.
L(cj):
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

C The return address plus the six pushes above leave rsp at 8 mod 16, and
C the two extra pushes that FUNC_ENTRY is assumed to make under DOS64 (which
C the 56(%rsp) load above implies) do not change that.  Restore the 16-byte
C alignment both ABIs require at a call, and under DOS64 also reserve the
C 32-byte shadow area so the callee cannot overwrite the saved registers.
IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

C Shared exit.  rax holds the return value and is not touched here.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(bx0):	test	$2, R8(n)
	jnz	L(b2)

L(b0):	cmp	$-4, R32(n)
	jz	L(n4)

C Outer loop for n mod 4 = 0, n > 4.  Register roles are rotated relative to
C L(otp1) so that the unrolled loop can be entered at L(e0).
L(otp0):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up,n,8), %r11		C only the carry is kept, sum is overwritten
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
L(e0):	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp0)

L(ed0):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)
	jmp	L(cj)

L(b2):	cmp	$-2, R32(n)
	jz	L(n2)

C Outer loop for n mod 4 = 2, n > 2.  Same lead-in as L(otp0), entering the
C unrolled loop at L(e2).
L(otp2):lea	2(n), i
	mov	(mp,n,8), %rax
	mul	q0
	mov	%rax, %r11
	mov	8(mp,n,8), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	16(mp,n,8), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up,n,8), %r11		C only the carry is kept, sum is overwritten
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	24(mp,n,8), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	8(up,n,8), %rbx
	mov	%rbx, 8(up,n,8)
	mov	%rax, %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	jmp	L(e2)

	ALIGNx
L(tp2):	mul	q0
	add	%rbp, -24(up,i,8)
	mov	%rax, %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%rdx, %r9
	adc	$0, %r10
	mul	q0
	add	%r11, -16(up,i,8)
	mov	%rax, %r11
	mov	8(mp,i,8), %rax
	adc	%r10, %rbp
	mov	%rdx, %r10
	adc	$0, %r9
	mul	q0
	add	%rbp, -8(up,i,8)
	mov	%rax, %rbp
	adc	%r9, %r11
	mov	16(mp,i,8), %rax
	adc	$0, %r10
	mov	%rdx, %r9
L(e2):	mul	q0
	add	%r11, (up,i,8)
	mov	%rax, %r11
	adc	%r10, %rbp
	mov	24(mp,i,8), %rax
	adc	$0, %r9
	add	$4, i
	mov	%rdx, %r10
	js	L(tp2)

L(ed2):	mul	q0
	add	%rbp, I(-24(up),-24(up,i,8))
	adc	%r9, %r11
	adc	$0, %r10
	add	%r11, I(-16(up),-16(up,i,8))
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, I(-8(up),-8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, (up,n,8)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp2)
	jmp	L(cj)

C n = 1: one product, result limb to rp[0], carry out returned in rax.
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	-8(up), %rax
	adc	(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)		C mov rather than xor keeps the carry flag
	adc	R32(%rax), R32(%rax)	C rax = carry out
	jmp	L(ret)

C n = 2: both outer iterations are written out inline, no call to mpn_add_n.
L(n2):	mov	(mp_param), %rax
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)	C zero rax before the final carry chain
	add	%r11, %r14
	adc	8(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry out
	jmp	L(ret)

	ALIGNx
C n = 3: looped outer iteration, then an inline three-limb final addition.
L(n3):	mov	-24(mp), %rax
	mov	-24(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	-16(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	-8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, -16(up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, -8(up)
	mov	%r11, -24(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-48(up), %rdx
	mov	-40(up), %rbx
	xor	R32(%rax), R32(%rax)	C zero rax before the final carry chain
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	-8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)	C rax = carry out
	jmp	L(ret)

C n = 4: fully unrolled outer iteration, final addition through L(cj).
L(n4):	mov	-32(mp), %rax
	mul	q0
	mov	%rax, %r11
	mov	-24(mp), %rax
	mov	%rdx, %r10
	mul	q0
	mov	%rax, %rbx
	mov	-16(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	-32(up), %r11		C only the carry is kept, sum is overwritten
	mov	%rax, %r11
	adc	%r10, %rbx
	mov	-8(mp), %rax
	adc	$0, %r9
	mov	%rdx, %r10
	mul	q0
	add	-24(up), %rbx
	mov	%rbx, -24(up)
	adc	%r9, %r11
	adc	$0, %r10
	imul	u0inv, %rbx		C next q limb
	add	%r11, -16(up)
	adc	%r10, %rax
	adc	$0, %rdx
	add	%rax, -8(up)
	adc	$0, %rdx
	mov	%rdx, -32(up)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	dec	j
	lea	8(up), up		C up++, lea leaves the flags from dec intact
	jnz	L(n4)
	jmp	L(cj)
EPILOGUE()
ASM_END()