dnl  AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb	mul_2		addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core			4.0		4.18-4.25
C Intel NHM			3.75		4.06-4.2
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Implement proper cor2, replacing current cor0.
C  * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
C  * Micro-optimise.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
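C For instance, `mov %rax, I(-8(rp),-8(rp,i,8))' in the wind-down code below
C emits the fixed-offset form -8(rp) while I selects its first argument, and
C the index-based form -8(rp,i,8) once I is redefined to `$2'.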

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n_param',  `%rcx')
define(`n_param8', `%cl')

define(`v0', `%r10')
define(`v1', `%r11')
define(`w0', `%rbx')
define(`w032', `%ebx')
define(`w1', `%rcx')
define(`w132', `%ecx')
define(`w2', `%rbp')
define(`w232', `%ebp')
define(`w3', `%r12')
define(`w332', `%r12d')
define(`n', `%r9')
define(`n32', `%r9d')
define(`n8', `%r9b')
define(`i', `%r13')
define(`vp', `%r8')

define(`X0', `%r14')
define(`X1', `%r15')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullow_n_basecase)

	mov	(up), %rax
	mov	vp_param, vp

	cmp	$4, n_param
	jb	lsmall

	mov	(vp_param), v0
	push	%rbx
	lea	(rp,n_param,8), rp	C point rp at R[un]
	push	%rbp
	lea	(up,n_param,8), up	C point up right after U's end
	push	%r12
	mov	$0, n32			C FIXME
	sub	n_param, n
	push	%r13
	mul	v0
	mov	8(vp), v1

	test	$1, n_param8
	jnz	lm2x1

lm2x0:	test	$2, n_param8
	jnz	lm2b2

lm2b0:	lea	(n), i
	mov	%rax, (rp,n,8)
	mov	%rdx, w1
	mov	(up,n,8), %rax
	xor	w232, w232
	jmp	lm2e0

lm2b2:	lea	-2(n), i
	mov	%rax, w2
	mov	(up,n,8), %rax
	mov	%rdx, w3
	xor	w032, w032
	jmp	lm2e2

lm2x1:	test	$2, n_param8
	jnz	lm2b3

lm2b1:	lea	1(n), i
	mov	%rax, (rp,n,8)
	mov	(up,n,8), %rax
	mov	%rdx, w0
	xor	w132, w132
	jmp	lm2e1

lm2b3:	lea	-1(n), i
	xor	w332, w332
	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax
	jmp	lm2e3

	ALIGNx
lm2tp:	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, w132
lm2e1:	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, w232
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, w232
lm2e0:	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, w332
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, w332
	mov	8(up,i,8), %rax
lm2e3:	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, w032
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, w032
lm2e2:	mul	v1
	mov	$0, w132		C FIXME: dead in last iteration
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0		C FIXME: dead in last iteration
	add	$4, i
	js	lm2tp

lm2ed:	imul	v0, %rax
	add	w3, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jge	lcor1

	push	%r14
	push	%r15

louter:
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,n,8), %rax
	mul	v0
	test	$1, n8
	jnz	la1x1

la1x0:	mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, n8
	jnz	la110

la100:	lea	(n), i
	mov	(rp,n,8), w3
	mov	%rax, w0
	MOV(	%rdx, w1, 16)
	jmp	llo0

la110:	lea	2(n), i
	mov	(rp,n,8), w1
	mov	%rax, w2
	mov	8(up,n,8), %rax
	MOV(	%rdx, w3, 1)
	jmp	llo2

la1x1:	mov	%rax, X0
	MOV(	%rdx, X1, 2)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, n8
	jz	la111

la101:	lea	1(n), i
	MOV(	%rdx, w0, 4)
	mov	(rp,n,8), w2
	mov	%rax, w3
	jmp	llo1

la111:	lea	-1(n), i
	MOV(	%rdx, w2, 64)
	mov	%rax, w1
	mov	(rp,n,8), w0
	mov	8(up,n,8), %rax
	jmp	llo3
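
C The addmul_2 loop below processes four limbs of U per iteration,
C accumulating U x {v0,v1} into R; X0 and X1 carry the partial products
C across iterations, and each MOV call site expands to either mov or lea
C according to the N bit mask defined above.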

	ALIGNx
ltop:	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	-8(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
llo2:	mul	v0
	add	w1, X1
	mov	X1, -16(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	-8(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	-8(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
llo1:	mov	(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, -8(rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	(up,i,8), %rax
	mov	(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
llo0:	mov	8(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, (rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	8(rp,i,8), w3
	adc	$0, X1
	mov	8(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	16(up,i,8), %rax
	adc	$0, w2
llo3:	mul	v0
	add	w0, X0
	mov	X0, 8(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	16(up,i,8), %rax
	mov	16(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	ltop

lend:	imul	v1, %rax
	add	w0, w1
	adc	%rax, w2
	mov	I(-8(up),-8(up,i,8)), %rax
	imul	v0, %rax
	add	w1, X1
	mov	X1, I(-16(rp),-16(rp,i,8))
	adc	X0, %rax
	mov	I(-8(rp),-8(rp,i,8)), w1
	add	w1, w2
	add	w2, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jl	louter

	pop	%r15
	pop	%r14

	jnz	lcor0

lcor1:	mov	(vp), v0
	mov	8(vp), v1
	mov	-16(up), %rax
	mul	v0			C u0 x v2
	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
	mov	-8(up), %rbx
	imul	v0, %rbx
	mov	-16(up), %rcx
	imul	v1, %rcx
	mov	%rax, -16(rp)
	add	%rbx, %rcx
	add	%rdx, %rcx
	mov	%rcx, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

lcor0:	mov	(vp), %r11
	imul	-8(up), %r11
	add	%rax, %r11
	mov	%r11, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

	ALIGN(16)
lsmall:
	cmp	$2, n_param
	jae	lgt1
ln1:	imul	(vp_param), %rax
	mov	%rax, (rp)
	ret
lgt1:	ja	lgt2
ln2:	mov	(vp_param), %r9
	mul	%r9
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	ret
lgt2:
ln3:	mov	(vp_param), %r9
	mul	%r9			C u0 x v0
	mov	%rax, (rp)
	mov	%rdx, %r10
	mov	8(up), %rax
	mul	%r9			C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rax
	mul	%r11			C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11		C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	ret
EPILOGUE()