# This file is part of ELPA.
#
# The ELPA library was originally created by the ELPA consortium,
# consisting of the following organizations:
#
# - Max Planck Computing and Data Facility (MPCDF), formerly known as
#   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
# - Bergische Universität Wuppertal, Lehrstuhl für angewandte
#   Informatik,
# - Technische Universität München, Lehrstuhl für Informatik mit
#   Schwerpunkt Wissenschaftliches Rechnen,
# - Fritz-Haber-Institut, Berlin, Abt. Theorie,
# - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
#   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
#   and
# - IBM Deutschland GmbH
#
#
# More information can be found here:
# http://elpa.mpcdf.mpg.de/
#
# ELPA is free software: you can redistribute it and/or modify
# it under the terms of the version 3 of the license of the
# GNU Lesser General Public License as published by the Free
# Software Foundation.
#
# ELPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with ELPA. If not, see <http://www.gnu.org/licenses/>
#
# ELPA reflects a substantial effort on the part of the original
# ELPA consortium, and we ask you to respect the spirit of the
# license that we chose: i.e., please contribute any changes you
# may have back to the original ELPA library distribution, and keep
# any derivatives of ELPA under the same license that we chose for
# the original distribution, the GNU Lesser General Public License.
#


# --------------------------------------------------------------------------------------------------
#
# This file contains the compute-intensive kernels for the Householder transformations,
# coded in x86_64 assembler and using SSE2/SSE3 instructions.
#
# It must be assembled with the GNU assembler (just "as" on most Linux machines).
#
# Copyright of the original code rests with the authors inside the ELPA
# consortium. The copyright of any additional modifications shall rest
# with their original authors, but shall adhere to the licensing terms
# distributed along with the original code in the file "COPYING".
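#
# Editorial note (not part of the original header): summarizing the Fortran
# pseudocode embedded in the kernels below, the real kernel applies two
# Householder reflectors in one sweep. With
#     v1 = (1, hh(2,1), ..., hh(nb,1))   acting on columns 2..nb+1 of q,
#     v2 = (1, hh(2,2), ..., hh(nb,2))   acting on columns 1..nb   of q,
#     tau1 = hh(1,1), tau2 = hh(1,2), and
#     s    = the dot product of v1 and v2 over their overlapping columns,
# each block of rows of q is updated as
#     x = q*v1
#     y = q*v2
#     q = q - tau1*x*v1**T - tau2*(y - tau1*s*x)*v2**T
# The complex kernel applies a single reflector v = (1, hh(2), ..., hh(nb))
# with tau = hh(1):
#     q = q - tau*(q*conjg(v))*v**T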
#
# --------------------------------------------------------------------------------------------------

        .globl double_hh_trafo_real_double_sse_assembly
        .globl single_hh_trafo_complex_double_sse_assembly
        .text

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

        .macro hh_trafo_real nrows

        # When this macro is called, the following registers are set and must not be changed
        # %rdi: Address of q
        # %rsi: Address of hh
        # %rdx: nb
        # %rcx: Remaining rows nq
        # %r8:  ldq in bytes
        # %r9:  ldh in bytes
        # %rax: address of hh at the end of the loops
        # The top of the stack must contain the dot product of the two Householder vectors

        movq      %rdi, %r10            # Copy address of q
        movq      %rsi, %r11            # Copy address of hh


#   x1 = q(1,2)
#   x2 = q(2,2)
#
#   y1 = q(1,1) + q(1,2)*hh(2,2)
#   y2 = q(2,1) + q(2,2)*hh(2,2)

        movaps      (%r10), %xmm6       # y1 = q(1,1)
        movaps    16(%r10), %xmm7       # y2 = q(2,1)
        .if \nrows>=8
        movaps    32(%r10), %xmm8
        movaps    48(%r10), %xmm9
        .if \nrows==12
        movaps    64(%r10), %xmm10
        movaps    80(%r10), %xmm11
        .endif
        .endif

        addq      %r8, %r10             # %r10 => q(.,2)
        movddup   8(%r11,%r9), %xmm15   # hh(2,2)

        .macro mac_pre_loop1 qoff, X, Y
        movaps    \qoff(%r10), \X       # xn = q(n,2)
        movaps    \X, %xmm12
        mulpd     %xmm15, %xmm12
        addpd     %xmm12, \Y            # yn = yn + xn*hh(2,2)
        .endm

        mac_pre_loop1  0, %xmm0, %xmm6
        mac_pre_loop1 16, %xmm1, %xmm7
        .if \nrows>=8
        mac_pre_loop1 32, %xmm2, %xmm8
        mac_pre_loop1 48, %xmm3, %xmm9
        .if \nrows==12
        mac_pre_loop1 64, %xmm4, %xmm10
        mac_pre_loop1 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_pre_loop1

#   do i=3,nb
#      h1 = hh(i-1,1)
#      h2 = hh(i,2)
#      x1 = x1 + q(1,i)*h1
#      y1 = y1 + q(1,i)*h2
#      x2 = x2 + q(2,i)*h1
#      y2 = y2 + q(2,i)*h2
#      ...
#   enddo

        addq      $8, %r11
        .align 16
1:
        cmpq      %rax, %r11            # Jump out of the loop if %r11 >= %rax
        jge       2f

        addq      %r8, %r10             # %r10 => q(.,i)

        movddup   (%r11), %xmm14        # hh(i-1,1)
        movddup   8(%r11,%r9), %xmm15   # hh(i,2)

        .macro mac_loop1 qoff, X, Y
        movaps    \qoff(%r10), %xmm13   # q(.,i)
        movaps    %xmm13, %xmm12
        mulpd     %xmm14, %xmm13
        addpd     %xmm13, \X            # xn = xn + q(.,i)*h1
        mulpd     %xmm15, %xmm12
        addpd     %xmm12, \Y            # yn = yn + q(.,i)*h2
        .endm

        mac_loop1  0, %xmm0, %xmm6
        mac_loop1 16, %xmm1, %xmm7
        .if \nrows>=8
        mac_loop1 32, %xmm2, %xmm8
        mac_loop1 48, %xmm3, %xmm9
        .if \nrows==12
        mac_loop1 64, %xmm4, %xmm10
        mac_loop1 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_loop1

        addq      $8, %r11
        jmp       1b
2:

#   x1 = x1 + q(1,nb+1)*hh(nb,1)
#   x2 = x2 + q(2,nb+1)*hh(nb,1)

        addq      %r8, %r10             # %r10 => q(.,nb+1)
        movddup   (%r11), %xmm14

        .macro mac_post_loop1 qoff, X
        movaps    \qoff(%r10), %xmm13   # q(.,nb+1)
        mulpd     %xmm14, %xmm13
        addpd     %xmm13, \X
        .endm

        mac_post_loop1  0, %xmm0
        mac_post_loop1 16, %xmm1
        .if \nrows>=8
        mac_post_loop1 32, %xmm2
        mac_post_loop1 48, %xmm3
        .if \nrows==12
        mac_post_loop1 64, %xmm4
        mac_post_loop1 80, %xmm5
        .endif
        .endif
        .purgem   mac_post_loop1

#   tau1 = hh(1,1)
#   tau2 = hh(1,2)
#
#   h1 = -tau1
#   x1 = x1*h1
#   x2 = x2*h1

        movq      %rsi, %r11            # restore %r11 (hh(1,1))

        movddup   (%r11), %xmm12        # hh(1,1)
        xorps     %xmm14, %xmm14
        subpd     %xmm12, %xmm14        # %xmm14 = -hh(1,1)

        mulpd     %xmm14, %xmm0
        mulpd     %xmm14, %xmm1
        .if \nrows>=8
        mulpd     %xmm14, %xmm2
        mulpd     %xmm14, %xmm3
        .if \nrows==12
        mulpd     %xmm14, %xmm4
        mulpd     %xmm14, %xmm5
        .endif
        .endif

#   h1 = -tau2
#   h2 = -tau2*s
#   y1 = y1*h1 + x1*h2
#   y2 = y2*h1 + x2*h2

        movddup   (%r11,%r9), %xmm12    # hh(1,2)
        xorps     %xmm15, %xmm15
        subpd     %xmm12, %xmm15        # %xmm15 = -hh(1,2) = h1
        movaps    %xmm15, %xmm14
        movddup   (%rsp), %xmm12        # Get s from top of stack
        mulpd     %xmm12, %xmm14        # %xmm14 = h2

        .macro mac_xform_y X, Y
        mulpd     %xmm15, \Y            # yn = yn*h1
        movaps    \X, %xmm12
        mulpd     %xmm14, %xmm12
        addpd     %xmm12, \Y
        .endm

        mac_xform_y %xmm0, %xmm6
        mac_xform_y %xmm1, %xmm7
        .if \nrows>=8
        mac_xform_y %xmm2, %xmm8
        mac_xform_y %xmm3, %xmm9
        .if \nrows==12
        mac_xform_y %xmm4, %xmm10
        mac_xform_y %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_xform_y

#   q(1,1) = q(1,1) + y1
#   q(2,1) = q(2,1) + y2

        movq      %rdi, %r10            # restore original Q

        .macro mac_pre_loop2_1 qoff, Y
        movaps    \qoff(%r10), %xmm13   # q(.,1)
        addpd     \Y, %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_pre_loop2_1  0, %xmm6
        mac_pre_loop2_1 16, %xmm7
        .if \nrows>=8
        mac_pre_loop2_1 32, %xmm8
        mac_pre_loop2_1 48, %xmm9
        .if \nrows==12
        mac_pre_loop2_1 64, %xmm10
        mac_pre_loop2_1 80, %xmm11
        .endif
        .endif
        .purgem   mac_pre_loop2_1

#   q(1,2) = q(1,2) + x1 + y1*hh(2,2)
#   q(2,2) = q(2,2) + x2 + y2*hh(2,2)

        addq      %r8, %r10             # %r10 => q(.,2)

        movddup   8(%r11,%r9), %xmm15   # hh(2,2)

        .macro mac_pre_loop2_2 qoff, X, Y
        movaps    \X, %xmm13
        movaps    \Y, %xmm12
        mulpd     %xmm15, %xmm12
        addpd     %xmm12, %xmm13
        addpd     \qoff(%r10), %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_pre_loop2_2  0, %xmm0, %xmm6
        mac_pre_loop2_2 16, %xmm1, %xmm7
        .if \nrows>=8
        mac_pre_loop2_2 32, %xmm2, %xmm8
        mac_pre_loop2_2 48, %xmm3, %xmm9
        .if \nrows==12
        mac_pre_loop2_2 64, %xmm4, %xmm10
        mac_pre_loop2_2 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_pre_loop2_2

#   do i=3,nb
#      h1 = hh(i-1,1)
#      h2 = hh(i,2)
#      q(1,i) = q(1,i) + x1*h1 + y1*h2
#      q(2,i) = q(2,i) + x2*h1 + y2*h2
#   enddo

        addq      $8, %r11
        .align 16
1:
        cmpq      %rax, %r11            # Jump out of the loop if %r11 >= %rax
        jge       2f

        addq      %r8, %r10             # %r10 => q(.,i)

        movddup   (%r11), %xmm14        # hh(i-1,1)
        movddup   8(%r11,%r9), %xmm15   # hh(i,2)

        .macro mac_loop2 qoff, X, Y
        movaps    \X, %xmm13
        mulpd     %xmm14, %xmm13
        movaps    \Y, %xmm12
        mulpd     %xmm15, %xmm12
        addpd     %xmm12, %xmm13
        addpd     \qoff(%r10), %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_loop2  0, %xmm0, %xmm6
        mac_loop2 16, %xmm1, %xmm7
        .if \nrows>=8
        mac_loop2 32, %xmm2, %xmm8
        mac_loop2 48, %xmm3, %xmm9
        .if \nrows==12
        mac_loop2 64, %xmm4, %xmm10
        mac_loop2 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_loop2

        addq      $8, %r11
        jmp       1b
2:

#   q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
#   q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)

        addq      %r8, %r10             # %r10 => q(.,nb+1)
        movddup   (%r11), %xmm14

        .macro mac_post_loop2 qoff, X
        movaps    \qoff(%r10), %xmm13   # q(.,nb+1)
        mulpd     %xmm14, \X
        addpd     \X, %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_post_loop2  0, %xmm0
        mac_post_loop2 16, %xmm1
        .if \nrows>=8
        mac_post_loop2 32, %xmm2
        mac_post_loop2 48, %xmm3
        .if \nrows==12
        mac_post_loop2 64, %xmm4
        mac_post_loop2 80, %xmm5
        .endif
        .endif
        .purgem   mac_post_loop2

        .endm

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
#   subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh)
#
#     integer, intent(in) :: nb, nq, ldq, ldh
#     real*8, intent(inout) :: q(ldq,*)
#     real*8, intent(in) :: hh(ldh,*)
#
# Parameter mapping to registers
#   parameter 1: %rdi : q
#   parameter 2: %rsi : hh
#   parameter 3: %rdx : nb
#   parameter 4: %rcx : nq
#   parameter 5: %r8  : ldq
#   parameter 6: %r9  : ldh
#
#-------------------------------------------------------------------------------

#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#!f>  interface
#!f>    subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh) &
#!f>                             bind(C,name="double_hh_trafo_real_double_sse_assembly")
#!f>      use, intrinsic :: iso_c_binding
#!f>      integer(kind=c_int) :: nb, nq, ldq, ldh
#!f>      type(c_ptr), value :: q
#!f>      real(kind=c_double) :: hh(nb,6)
#!f>    end subroutine
#!f>  end interface
#!f>#endif
        .align    16,0x90
double_hh_trafo_real_double_sse_assembly:

        # Get integer parameters into corresponding registers

        movslq    (%rdx), %rdx          # nb
        movslq    (%rcx), %rcx          # nq
        movslq    (%r8),  %r8           # ldq
        movslq    (%r9),  %r9           # ldh

        # Get ldq in bytes
        addq      %r8, %r8
        addq      %r8, %r8
        addq      %r8, %r8              # 8*ldq, i.e. ldq in bytes

        # Get ldh in bytes
        addq      %r9, %r9
        addq      %r9, %r9
        addq      %r9, %r9              # 8*ldh, i.e. ldh in bytes

        # set %rax to the address of hh at the end of the loops,
        # i.e. if %r11 >= %rax we must jump out of the loop.
        # please note: %rax = 8*%rdx + %rsi - 8
        movq      %rdx, %rax
        addq      %rax, %rax
        addq      %rax, %rax
        addq      %rax, %rax
        addq      %rsi, %rax
        subq      $8, %rax

#-----------------------------------------------------------
        # Calculate the dot product of the two Householder vectors

        # decrement stack pointer to make space for s
        subq      $8, %rsp

#   Fortran code:
#   s = hh(2,2)*1
#   do i=3,nb
#      s = s+hh(i,2)*hh(i-1,1)
#   enddo

        movq      %rsi, %r11            # Copy address of hh

        movsd     8(%r11,%r9), %xmm0    # hh(2,2)
        addq      $8, %r11
1:
        cmpq      %rax, %r11
        jge       2f
        movsd     (%r11), %xmm14        # hh(i-1,1)
        movsd     8(%r11,%r9), %xmm15   # hh(i,2)
        mulsd     %xmm14, %xmm15
        addsd     %xmm15, %xmm0
        addq      $8, %r11
        jmp       1b
2:
        movsd     %xmm0, (%rsp)         # put s on top of stack
#-----------------------------------------------------------

rloop_s:
        cmpq      $8, %rcx              # if %rcx <= 8 jump out of loop
        jle       rloop_e
        hh_trafo_real 12                # transform 12 rows
        addq      $96, %rdi             # increment q start address by 96 bytes (12 rows)
        subq      $12, %rcx             # decrement nq
        jmp       rloop_s
rloop_e:

        cmpq      $4, %rcx              # if %rcx <= 4 jump to test_4
        jle       test_4
        hh_trafo_real 8                 # transform 8 rows
        jmp       return1

test_4:
        cmpq      $0, %rcx              # if %rcx <= 0 jump to return
        jle       return1
        hh_trafo_real 4                 # transform 4 rows

return1:
        addq      $8, %rsp              # reset stack pointer
        ret

        .align    16,0x90

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

        .macro hh_trafo_complex nrows

        # When this macro is called, the following registers are set and must not be changed
        # %rdi: Address of q
        # %rsi: Address of hh
        # %rdx: nb
        # %rcx: Remaining rows nq
        # %r8:  ldq in bytes

        movq      %rdi, %r10            # Copy address of q
        movq      %rsi, %r11            # Copy address of hh

        # set %rax to the address of hh at the end of the loops,
        # i.e. if %r11 >= %rax we must jump out of the loop.
        # please note: %rax = 16*%rdx + %rsi
        movq      %rdx, %rax
        addq      %rax, %rax
        addq      %rax, %rax
        addq      %rax, %rax
        addq      %rax, %rax
        addq      %rsi, %rax

#   x1 = q(1,1); y1 = 0
#   x2 = q(2,1); y2 = 0
#   ...

        movaps      (%r10), %xmm0
        movaps    16(%r10), %xmm1
        xorps     %xmm6, %xmm6
        xorps     %xmm7, %xmm7
        .if \nrows>=4
        movaps    32(%r10), %xmm2
        movaps    48(%r10), %xmm3
        xorps     %xmm8, %xmm8
        xorps     %xmm9, %xmm9
        .if \nrows==6
        movaps    64(%r10), %xmm4
        movaps    80(%r10), %xmm5
        xorps     %xmm10, %xmm10
        xorps     %xmm11, %xmm11
        .endif
        .endif

#   do i=2,nb
#      h1 = conjg(hh(i))
#      x1 = x1 + q(1,i)*h1
#      x2 = x2 + q(2,i)*h1
#      ...
#   enddo

        addq      $16, %r11             # %r11 => hh(2)
        .align 16
1:
        cmpq      %rax, %r11            # Jump out of the loop if %r11 >= %rax
        jge       2f

        addq      %r8, %r10             # %r10 => q(.,i)

        movddup   (%r11), %xmm14        # real(hh(i))
        movddup   8(%r11), %xmm15       # imag(hh(i))

        .macro mac_loop1 qoff, X, Y
        movaps    \qoff(%r10), %xmm13   # q(.,i)
        movaps    %xmm13, %xmm12
        mulpd     %xmm14, %xmm13        # q(.,i)*real(hh(i))
        addpd     %xmm13, \X            # x1 = x1 + q(.,i)*real(hh(i))
        mulpd     %xmm15, %xmm12        # q(.,i)*imag(hh(i))
        addsubpd  %xmm12, \Y            # y1 = y1 -/+ q(.,i)*imag(hh(i))
        .endm

        mac_loop1  0, %xmm0, %xmm6
        mac_loop1 16, %xmm1, %xmm7
        .if \nrows>=4
        mac_loop1 32, %xmm2, %xmm8
        mac_loop1 48, %xmm3, %xmm9
        .if \nrows==6
        mac_loop1 64, %xmm4, %xmm10
        mac_loop1 80, %xmm5, %xmm11
        .endif
        .endif

        .purgem   mac_loop1

        addq      $16, %r11             # %r11 => hh(i+1)
        jmp       1b
2:

        # Now the contents of the yn have to be swapped and added to the xn
        .macro mac_post_loop_1 X, Y
        shufpd    $1, \Y, \Y
        addpd     \Y, \X
        .endm

        mac_post_loop_1 %xmm0, %xmm6
        mac_post_loop_1 %xmm1, %xmm7
        .if \nrows>=4
        mac_post_loop_1 %xmm2, %xmm8
        mac_post_loop_1 %xmm3, %xmm9
        .if \nrows==6
        mac_post_loop_1 %xmm4, %xmm10
        mac_post_loop_1 %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_post_loop_1

#   tau1 = hh(1)
#
#   h1 = -tau1
#   x1 = x1*h1; y1 = x1 with halves exchanged
#   x2 = x2*h1; y2 = x2 with halves exchanged
#   ...

        movq      %rsi, %r11            # restore address of hh

        xorps     %xmm14, %xmm14
        movddup   (%r11), %xmm12        # real(hh(1))
        subpd     %xmm12, %xmm14        # -real(hh(1))
        xorps     %xmm15, %xmm15
        movddup   8(%r11), %xmm12       # imag(hh(1))
        subpd     %xmm12, %xmm15        # -imag(hh(1))

        .macro mac_xform X, Y
        movaps    \X, %xmm12
        shufpd    $1, \X, %xmm12
        mulpd     %xmm15, %xmm12
        mulpd     %xmm14, \X
        addsubpd  %xmm12, \X
        movaps    \X, \Y                # copy to y
        shufpd    $1, \X, \Y            # exchange halves
        .endm

        mac_xform %xmm0, %xmm6
        mac_xform %xmm1, %xmm7
        .if \nrows>=4
        mac_xform %xmm2, %xmm8
        mac_xform %xmm3, %xmm9
        .if \nrows==6
        mac_xform %xmm4, %xmm10
        mac_xform %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_xform

#   q(1,1) = q(1,1) + x1
#   q(2,1) = q(2,1) + x2
#   ...

        movq      %rdi, %r10            # restore address of q

        .macro mac_pre_loop2 qoff, X
        movaps    \qoff(%r10), %xmm13   # q(.,1)
        addpd     \X, %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_pre_loop2  0, %xmm0
        mac_pre_loop2 16, %xmm1
        .if \nrows>=4
        mac_pre_loop2 32, %xmm2
        mac_pre_loop2 48, %xmm3
        .if \nrows==6
        mac_pre_loop2 64, %xmm4
        mac_pre_loop2 80, %xmm5
        .endif
        .endif
        .purgem   mac_pre_loop2

#   do i=2,nb
#      h1 = hh(i)
#      q(1,i) = q(1,i) + x1*h1
#      q(2,i) = q(2,i) + x2*h1
#      ...
#   enddo

        addq      $16, %r11
        .align 16
1:
        cmpq      %rax, %r11            # Jump out of the loop if %r11 >= %rax
        jge       2f

        addq      %r8, %r10             # %r10 => q(.,i)

        movddup   (%r11), %xmm14        # real(hh(i))
        movddup   8(%r11), %xmm15       # imag(hh(i))

        .macro mac_loop2 qoff, X, Y
        movaps    \X, %xmm13
        mulpd     %xmm14, %xmm13
        movaps    \Y, %xmm12
        mulpd     %xmm15, %xmm12
        addsubpd  %xmm12, %xmm13
        addpd     \qoff(%r10), %xmm13
        movaps    %xmm13, \qoff(%r10)
        .endm

        mac_loop2  0, %xmm0, %xmm6
        mac_loop2 16, %xmm1, %xmm7
        .if \nrows>=4
        mac_loop2 32, %xmm2, %xmm8
        mac_loop2 48, %xmm3, %xmm9
        .if \nrows==6
        mac_loop2 64, %xmm4, %xmm10
        mac_loop2 80, %xmm5, %xmm11
        .endif
        .endif
        .purgem   mac_loop2

        addq      $16, %r11
        jmp       1b
2:
        .endm

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# FORTRAN Interface:
#
#   subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq)
#
#     integer, intent(in) :: nb, nq, ldq
#     complex*16, intent(inout) :: q(ldq,*)
#     complex*16, intent(in) :: hh(*)
#
# Parameter mapping to registers
#   parameter 1: %rdi : q
#   parameter 2: %rsi : hh
#   parameter 3: %rdx : nb
#   parameter 4: %rcx : nq
#   parameter 5: %r8  : ldq
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#!f>  interface
#!f>    subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq) &
#!f>                             bind(C,name="single_hh_trafo_complex_double_sse_assembly")
#!f>      use, intrinsic :: iso_c_binding
#!f>      integer(kind=c_int) :: nb, nq, ldq
#!f>      !complex(kind=c_double_complex) :: q(*)
#!f>      type(c_ptr), value :: q
#!f>      complex(kind=c_double_complex) :: hh(nb,2)
#!f>    end subroutine
#!f>  end interface
#!f>#endif
        .align    16,0x90
single_hh_trafo_complex_double_sse_assembly:

        # Get integer parameters into corresponding registers

        movslq    (%rdx), %rdx          # nb
        movslq    (%rcx), %rcx          # nq
        movslq    (%r8),  %r8           # ldq

        # Get ldq in bytes
        addq      %r8, %r8
        addq      %r8, %r8
        addq      %r8, %r8
        addq      %r8, %r8              # 16*ldq, i.e. ldq in bytes

cloop_s:
        cmpq      $4, %rcx              # if %rcx <= 4 jump out of loop
        jle       cloop_e
        hh_trafo_complex 6              # transform 6 rows
        addq      $96, %rdi             # increment q start address by 96 bytes (6 rows)
        subq      $6, %rcx              # decrement nq
        jmp       cloop_s
cloop_e:

        cmpq      $2, %rcx              # if %rcx <= 2 jump to test_2
        jle       test_2
        hh_trafo_complex 4              # transform 4 rows
        jmp       return2

test_2:
        cmpq      $0, %rcx              # if %rcx <= 0 jump to return
        jle       return2
        hh_trafo_complex 2              # transform 2 rows

return2:
        ret

        .align    16,0x90
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------

# Declare that we do not need an executable stack here
        .section .note.GNU-stack,"",@progbits
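
#-------------------------------------------------------------------------------
# Illustrative C prototypes (editorial sketch, not part of the original source;
# derived from the Fortran interfaces above). All integer arguments are passed
# by reference, since both entry points dereference them with movslq, and q is
# passed as a plain address. Because q is accessed with movaps, q must be
# 16-byte aligned; for the real kernel ldq must additionally be even so that
# every column of q keeps that alignment.
#
#   #include <complex.h>
#
#   void double_hh_trafo_real_double_sse_assembly(double *q, const double *hh,
#                                                 const int *nb, const int *nq,
#                                                 const int *ldq, const int *ldh);
#
#   void single_hh_trafo_complex_double_sse_assembly(double complex *q,
#                                                    const double complex *hh,
#                                                    const int *nb, const int *nq,
#                                                    const int *ldq);
#-------------------------------------------------------------------------------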