dnl  AMD64 mpn_mulmid_basecase

dnl  Based on mul_basecase.asm from GMP 4.3.1, modifications are copyright
dnl  (C) 2009, David Harvey. The original mul_basecase.asm was released under
dnl  LGPLv3+, license terms reproduced below. These modifications are hereby
dnl  released under the same terms.

dnl  ========= Original license terms:

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

dnl  ========= end license terms


include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.375  (2.5 when un - vn is "small")
C K10:		 ?
C P4:		 ?
C P6-15:	 ?
C ===========================================================
C mpn_mulmid_basecase
C
C Register arguments, as named by the defines below:
C   rp = %rdi, up = %rsi, un_param = %rdx, vp_param = %rcx, vn = %r8
C (the first five System V AMD64 integer-argument registers).
C NOTE(review): presumably rp/up/vp are limb pointers and un_param/vn
C are limb counts -- confirm against the C prototype.
C
C With row = un_param - vn + 1 there are two strategies:
C   row >= 4: one mul_1 pass (vn odd) or mul_2 pass (vn even) stores
C             its products to rp, then addmul_2 passes add in the
C             remaining vp limbs two at a time.
C   row <  4: each output limb is accumulated along a diagonal.
C On every path the last two limbs written are rp[row] and rp[row+1].
C
C rbx, rbp and r12-r15 are pushed on entry and popped at L(ret).
C Nothing is placed in rax as a return value.
C
C Inside the carry chains registers are cleared with "mov $0" rather
C than xor, because mov leaves the carry flag untouched.
C ===========================================================

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param',`%rdx')
define(`vp_param',`%rcx')
define(`vn', `%r8')
define(`vn32', `%r8d')

C v0, v1: the one or two vp limbs used by the current pass
define(`v0', `%r12')
define(`v1', `%r9')

C w0..w3: rotating accumulator registers (and their 32-bit views).
C w1 is %rcx, the same register as vp_param, so vp_param is copied to
C vp before w1 is first written.
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`w032', `%ebx')
define(`w132', `%ecx')
define(`w232', `%ebp')
define(`w332', `%r10d')

C n: inner loop index; outer_addr: re-entry address for the next pass;
C un: row length (negated once a path is chosen); vp: current vp pointer
define(`n', `%r11')
define(`outer_addr', `%r14')
define(`un', `%r13')
define(`un32',`%r13d')
define(`vp', `%r15')

C vp_inner shares %r10 with w3; only the diagonal code uses vp_inner,
C and that code uses w0..w2 only.
define(`vp_inner', `%r10')


ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mulmid_basecase)
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	vp_param, vp		C free %rcx for use as w1

	C use un for row length (= un_param - vn + 1)
	lea	1(un_param), un
	sub	vn, un

	lea	(rp,un,8), rp		C rp now points at rp[row]

	cmp	$4, un		C FIXME: needs tuning
	jc	L(diagonal)		C unsigned: row < 4

	lea	(up,un_param,8), up	C up now points just past up[un_param-1]

	test	$1, vn
	jz	L(mul_2)

C ===========================================================
C mul_1 for vp[0] if vn is odd

L(mul_1):
	mov	un32, w032		C keep a copy of row for the mod-4 dispatch

	neg	un
	mov	(up,un,8), %rax		C original up[vn-1]
	mov	(vp), v0
	mul	v0

	and	$-4, un		C round down to multiple of 4
	mov	un, n

	C dispatch on row mod 4: 0 -> prologue_0, 1 -> prologue_1,
	C 2 -> prologue_2, 3 -> fall through.  Each prologue also records
	C in outer_addr the addmul prologue with the same residue.
	and	$3, w032
	jz	L(mul_1_prologue_0)
	cmp	$2, w032
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

L(mul_1_prologue_3):
	mov	%rax, w3
	mov	%rdx, w0
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_1_entry_3)

	ALIGN(16)
L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note already w0 == 0
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

	ALIGN(16)
L(mul_1_prologue_1):
	add	$4, n
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, w332
	mov	(up,n,8), %rax
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_1_entry_1)

	ALIGN(16)
L(mul_1_prologue_2):
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	mov	$0, w232
	mov	$0, w332
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_1_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	mov	$0, w032
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	mov	$0, w232		C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	C wind down: flush the pending limbs and the final product
	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	mov	w2, 8(rp)		C zero last limb of output
	adc	%rdx, w2
	mov	w2, (rp)

	dec	vn
	jz	L(ret)

	C step past the vp limb just used and move the up window back one limb
	lea	-8(up), up
	lea	8(vp), vp

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr		C continue with addmul_2

C ===========================================================
C mul_2 for vp[0], vp[1] if vn is even

	ALIGN(16)
L(mul_2):
	mov	un32, w032		C keep a copy of row for the mod-4 dispatch

	neg	un
	mov	-8(up,un,8), %rax	C original up[vn-2]
	mov	(vp), v0
	mov	8(vp), v1
	mul	v1

	and	$-4, un		C round down to multiple of 4
	mov	un, n

	C dispatch on row mod 4, same scheme as in mul_1
	and	$3, w032
	jz	L(mul_2_prologue_0)
	cmp	$2, w032
	jc	L(mul_2_prologue_1)
	jz	L(mul_2_prologue_2)

L(mul_2_prologue_3):
	mov	%rax, w1
	mov	%rdx, w2
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	mov	%rax, w0
	mov	%rdx, w1
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, w132
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, w032
	mov	16(up,n,8), %rax
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_2_entry_2)


	C this loop is 18 c/loop = 2.25 c/l on K8

	ALIGN(16)
L(mul_2_top):
	mov	-8(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(mul_2_entry_0):
	mov	$0, w232
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w0
	mov	(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1
	add	%rax, w1
	mov	w0, (rp,n,8)
	adc	%rdx, w2
L(mul_2_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	mov	$0, w332
	add	%rax, w1
	adc	%rdx, w2
	mov	$0, w032		C mov, not xor: carry is still live
	adc	$0, w332
	mov	8(up,n,8), %rax
	mov	w1, 8(rp,n,8)
	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
L(mul_2_entry_2):
	mov	$0, w132
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, w032
	mul	v1
	add	%rax, w3
	mov	w2, 16(rp,n,8)
	adc	%rdx, w0
L(mul_2_entry_1):
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, w132
	add	$4, n
	mov	w3, -8(rp,n,8)		C mov keeps the flags of add $4, n for jnz
	jnz	L(mul_2_top)

	mov	w0, (rp)
	mov	w1, 8(rp)

	sub	$2, vn
	jz	L(ret)

	C step past the two vp limbs just used and move the up window back
	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr		C continue with addmul_2

C ===========================================================
C addmul_2 for remaining vp's
C
C Entered through outer_addr, which mul_1/mul_2 set to the prologue
C matching row mod 4.  Each pass adds its products into rp.

	ALIGN(16)
L(addmul_prologue_0):
	mov	-8(up,n,8), %rax
	mul	v1
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, w332
	jmp	L(addmul_entry_0)

	ALIGN(16)
L(addmul_prologue_1):
	mov	16(up,n,8), %rax
	mul	v1
	mov	%rax, w0
	mov	%rdx, w1
	mov	$0, w232
	mov	24(up,n,8), %rax
	jmp	L(addmul_entry_1)

	ALIGN(16)
L(addmul_prologue_2):
	mov	8(up,n,8), %rax
	mul	v1
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, w132
	jmp	L(addmul_entry_2)

	ALIGN(16)
L(addmul_prologue_3):
	mov	(up,n,8), %rax
	mul	v1
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, w032
	mov	$0, w132
	jmp	L(addmul_entry_3)

	C this loop is 19 c/loop = 2.375 c/l on K8

	ALIGN(16)
L(addmul_top):
	mov	$0, w332
	add	%rax, w0
	mov	-8(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, w232
	mul	v1
	add	w0, -8(rp,n,8)
	adc	%rax, w1
	adc	%rdx, w2
L(addmul_entry_0):
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, w332
	mul	v1
	add	w1, (rp,n,8)
	mov	$0, w132		C mov, not xor: carry from the add is still live
	adc	%rax, w2
	mov	$0, w032
	adc	%rdx, w3
L(addmul_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	8(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, w032
	mul	v1
	add	w2, 8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
L(addmul_entry_2):
	mov	16(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	16(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, w132
	mul	v1
	add	w3, 16(rp,n,8)
	nop			C don't ask...
	adc	%rax, w0
	mov	$0, w232
	mov	24(up,n,8), %rax
	adc	%rdx, w1
L(addmul_entry_1):
	mul	v0
	add	$4, n
	jnz	L(addmul_top)

	C wind down: fold in the last product, then add the three pending
	C limbs into the top of the output
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w232

	add	w0, -8(rp)
	adc	w1, (rp)
	adc	w2, 8(rp)

	sub	$2, vn
	jz	L(ret)

	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr		C next addmul_2 pass, same prologue

C ===========================================================
C accumulate along diagonals if un - vn is small
C
C w2:w1:w0 is a three-limb accumulator.  The inner loop is unrolled
C four ways; the prologues round vn up to a multiple of 4 and bias vp
C so that the first vp limb touched at the chosen entry point is the
C original vp[0].

	ALIGN(16)
L(diagonal):
	xor	w032, w032
	xor	w132, w132
	xor	w232, w232

	neg	un			C un counts up from -row to 0

	C dispatch on vn mod 4: 0, 1, 2 branch away, 3 falls through
	mov	vn32, %eax
	and	$3, %eax
	jz	L(diag_prologue_0)
	cmp	$2, %eax
	jc	L(diag_prologue_1)
	jz	L(diag_prologue_2)

L(diag_prologue_3):
	lea	-8(vp), vp
	mov	vp, vp_inner
	add	$1, vn
	mov	vn, n
	lea	L(diag_entry_3)(%rip), outer_addr
	jmp	L(diag_entry_3)

L(diag_prologue_0):
	mov	vp, vp_inner
	mov	vn, n
	C 0(%rip) is the address of the next instruction, so later
	C diagonals re-enter at the load below and repeat the jump
	lea	0(%rip), outer_addr
	mov	-8(up,n,8), %rax
	jmp	L(diag_entry_0)

L(diag_prologue_1):
	lea	8(vp), vp
	mov	vp, vp_inner
	add	$3, vn
	mov	vn, n
	lea	0(%rip), outer_addr	C re-entry point = the load below
	mov	-8(vp_inner), %rax
	jmp	L(diag_entry_1)

L(diag_prologue_2):
	lea	-16(vp), vp
	mov	vp, vp_inner
	add	$2, vn
	mov	vn, n
	lea	0(%rip), outer_addr	C re-entry point = the load below
	mov	16(vp_inner), %rax
	jmp	L(diag_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8

	ALIGN(16)
L(diag_top):
	add	%rax, w0
	adc	%rdx, w1
	mov	-8(up,n,8), %rax
	adc	$0, w2
L(diag_entry_0):
	mulq	(vp_inner)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_3):
	mov	-16(up,n,8), %rax
	mulq	8(vp_inner)
	add	%rax, w0
	mov	16(vp_inner), %rax
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_2):
	mulq	-24(up,n,8)
	add	%rax, w0
	mov	24(vp_inner), %rax
	adc	%rdx, w1
	lea	32(vp_inner), vp_inner	C lea keeps the carry for the adc below
	adc	$0, w2
L(diag_entry_1):
	mulq	-32(up,n,8)
	sub	$4, n
	jnz	L(diag_top)

	C fold in the last product of this diagonal
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2

	mov	w0, (rp,un,8)		C one finished output limb

	inc	un
	jz	L(diag_end)

	C next diagonal: reset the inner counters, advance up by one limb
	C and shift the accumulator down by one limb
	mov	vn, n
	mov	vp, vp_inner

	lea	8(up), up
	mov	w1, w0
	mov	w2, w1
	xor	w232, w232

	jmp	*outer_addr

L(diag_end):
	mov	w1, (rp)
	mov	w2, 8(rp)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

EPILOGUE()