1#    This file is part of ELPA.
2#
3#    The ELPA library was originally created by the ELPA consortium,
4#    consisting of the following organizations:
5#
6#    - Max Planck Computing and Data Facility (MPCDF), formerly known as
7#      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8#    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9#      Informatik,
10#    - Technische Universität München, Lehrstuhl für Informatik mit
11#      Schwerpunkt Wissenschaftliches Rechnen ,
12#    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13#    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14#      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15#      and
16#    - IBM Deutschland GmbH
17#
18#
19#    More information can be found here:
20#    http://elpa.mpcdf.mpg.de/
21#
22#    ELPA is free software: you can redistribute it and/or modify
23#    it under the terms of the version 3 of the license of the
24#    GNU Lesser General Public License as published by the Free
25#    Software Foundation.
26#
27#    ELPA is distributed in the hope that it will be useful,
28#    but WITHOUT ANY WARRANTY; without even the implied warranty of
29#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30#    GNU Lesser General Public License for more details.
31#
32#    You should have received a copy of the GNU Lesser General Public License
33#    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
34#
35#    ELPA reflects a substantial effort on the part of the original
36#    ELPA consortium, and we ask you to respect the spirit of the
37#    license that we chose: i.e., please contribute any changes you
38#    may have back to the original ELPA library distribution, and keep
39#    any derivatives of ELPA under the same license that we chose for
40#    the original distribution, the GNU Lesser General Public License.
41#
42
43
44# --------------------------------------------------------------------------------------------------
45#
46# This file contains the compute intensive kernels for the Householder transformations,
47# coded in x86_64 assembler and using SSE2/SSE3 instructions.
48#
49# It must be assembled with GNU assembler (just "as" on most Linux machines)
50#
51# Copyright of the original code rests with the authors inside the ELPA
52# consortium. The copyright of any additional modifications shall rest
53# with their original authors, but shall adhere to the licensing terms
54# distributed along with the original code in the file "COPYING".
55#
56# --------------------------------------------------------------------------------------------------
57        .globl double_hh_trafo_real_double_sse_assembly
58        .globl single_hh_trafo_complex_double_sse_assembly
59        .text
60
61#-------------------------------------------------------------------------------
62#-------------------------------------------------------------------------------
63
64        .macro hh_trafo_real nrows
65
66        # When this macro is called, the following registers are set and must not be changed
67        # %rdi: Address of q
68        # %rsi: Address of hh
69        # %rdx: nb
70        # %rcx: Remaining rows nq
71        # %r8:  ldq in bytes
72        # %r9:  ldh in bytes
73        # %rax: address of hh at the end of the loops
74        # The top of the stack must contain the dot product of the two Householder vectors
75
76        movq      %rdi, %r10   # Copy address of q
77        movq      %rsi, %r11   # Copy address of hh
78
79
80#   x1 = q(1,2)
81#   x2 = q(2,2)
82#
83#   y1 = q(1,1) + q(1,2)*hh(2,2)
84#   y2 = q(2,1) + q(2,2)*hh(2,2)
85
86        movaps      (%r10), %xmm6       # y1 = q(1,1)
87        movaps    16(%r10), %xmm7       # y2 = q(2,1)
88        .if \nrows>=8
89        movaps    32(%r10), %xmm8
90        movaps    48(%r10), %xmm9
91        .if \nrows==12
92        movaps    64(%r10), %xmm10
93        movaps    80(%r10), %xmm11
94        .endif
95        .endif
96
97        addq      %r8, %r10             # %r10 => q(.,2)
98        movddup   8(%r11,%r9), %xmm15   #  hh(2,2)
99
100        .macro mac_pre_loop1 qoff, X, Y
101        movaps    \qoff(%r10), \X       # xn = q(n,2)
102        movaps    \X, %xmm12
103        mulpd     %xmm15, %xmm12
104        addpd     %xmm12, \Y            # yn = yn + xn*h(2,2)
105        .endm
106
107        mac_pre_loop1  0, %xmm0, %xmm6
108        mac_pre_loop1 16, %xmm1, %xmm7
109        .if \nrows>=8
110        mac_pre_loop1 32, %xmm2, %xmm8
111        mac_pre_loop1 48, %xmm3, %xmm9
112        .if \nrows==12
113        mac_pre_loop1 64, %xmm4, %xmm10
114        mac_pre_loop1 80, %xmm5, %xmm11
115        .endif
116        .endif
117        .purgem   mac_pre_loop1
118
119#   do i=3,nb
120#      h1 = hh(i-1,1)
121#      h2 = hh(i,2)
122#      x1 = x1 + q(1,i)*h1
123#      y1 = y1 + q(1,i)*h2
124#      x2 = x2 + q(2,i)*h1
125#      y2 = y2 + q(2,i)*h2
126#      ...
127#   enddo
128
129        addq      $8, %r11
130        .align 16
1311:
132        cmpq %rax, %r11                 # Jump out of the loop if %r11 >= %rax
133        jge       2f
134
135        addq      %r8, %r10             # %r10 => q(.,i)
136
137        movddup   (%r11), %xmm14        # hh(i-1,1)
138        movddup   8(%r11,%r9), %xmm15   # hh(i,2)
139
140        .macro mac_loop1 qoff, X, Y
141        movaps    \qoff(%r10), %xmm13   # q(.,i)
142        movaps    %xmm13, %xmm12
143        mulpd     %xmm14, %xmm13
144        addpd     %xmm13, \X            # xn = xn + q(.,i)*h1
145        mulpd     %xmm15, %xmm12
146        addpd     %xmm12, \Y            # yn = yn + q(.,i)*h2
147        .endm
148
149        mac_loop1  0, %xmm0, %xmm6
150        mac_loop1 16, %xmm1, %xmm7
151        .if \nrows>=8
152        mac_loop1 32, %xmm2, %xmm8
153        mac_loop1 48, %xmm3, %xmm9
154        .if \nrows==12
155        mac_loop1 64, %xmm4, %xmm10
156        mac_loop1 80, %xmm5, %xmm11
157        .endif
158        .endif
159        .purgem   mac_loop1
160
161        addq      $8, %r11
162        jmp       1b
1632:
164
165#   x1 = x1 + q(1,nb+1)*hh(nb,1)
166#   x2 = x2 + q(2,nb+1)*hh(nb,1)
167
168        addq      %r8, %r10             # %r10 => q(.,nb+1)
169        movddup   (%r11), %xmm14
170
171        .macro mac_post_loop1 qoff, X
172        movaps    \qoff(%r10), %xmm13   # q(.,nb+1)
173        mulpd     %xmm14, %xmm13
174        addpd     %xmm13, \X
175        .endm
176
177        mac_post_loop1  0, %xmm0
178        mac_post_loop1 16, %xmm1
179        .if \nrows>=8
180        mac_post_loop1 32, %xmm2
181        mac_post_loop1 48, %xmm3
182        .if \nrows==12
183        mac_post_loop1 64, %xmm4
184        mac_post_loop1 80, %xmm5
185        .endif
186        .endif
187        .purgem   mac_post_loop1
188
189#   tau1 = hh(1,1)
190#   tau2 = hh(1,2)
191#
192#   h1 = -tau1
193#   x1 = x1*h1
194#   x2 = x2*h1
195
196        movq      %rsi, %r11    # restore %r11 (hh(1,1))
197
198        movddup (%r11), %xmm12 # hh(1,1)
199        xorps   %xmm14, %xmm14
200        subpd   %xmm12, %xmm14 # %xmm14 = -hh(1,1)
201
202        mulpd   %xmm14, %xmm0
203        mulpd   %xmm14, %xmm1
204        .if \nrows>=8
205        mulpd   %xmm14, %xmm2
206        mulpd   %xmm14, %xmm3
207        .if \nrows==12
208        mulpd   %xmm14, %xmm4
209        mulpd   %xmm14, %xmm5
210        .endif
211        .endif
212
213#   h1 = -tau2
214#   h2 = -tau2*s
215#   y1 = y1*h1 + x1*h2
216#   y2 = y2*h1 + x2*h2
217
218        movddup (%r11,%r9), %xmm12  # hh(1,2)
219        xorps   %xmm15, %xmm15
220        subpd   %xmm12, %xmm15 # %xmm15 = -hh(1,2) = h1
221        movaps  %xmm15, %xmm14
222        movddup (%rsp), %xmm12 # Get s from top of stack
223        mulpd   %xmm12, %xmm14 # %xmm14 = h2
224
225        .macro mac_xform_y X, Y
226        mulpd   %xmm15, \Y  # y1 = y1*h1
227        movaps  \X, %xmm12
228        mulpd   %xmm14, %xmm12
229        addpd   %xmm12, \Y
230        .endm
231
232        mac_xform_y %xmm0, %xmm6
233        mac_xform_y %xmm1, %xmm7
234        .if \nrows>=8
235        mac_xform_y %xmm2, %xmm8
236        mac_xform_y %xmm3, %xmm9
237        .if \nrows==12
238        mac_xform_y %xmm4, %xmm10
239        mac_xform_y %xmm5, %xmm11
240        .endif
241        .endif
242        .purgem   mac_xform_y
243
244#   q(1,1) = q(1,1) + y1
245#   q(2,1) = q(2,1) + y2
246
247        movq   %rdi, %r10   # restore original Q
248
249        .macro mac_pre_loop2_1 qoff, Y
250        movaps    \qoff(%r10), %xmm13   # q(.,1)
251        addpd     \Y, %xmm13
252        movaps    %xmm13, \qoff(%r10)
253        .endm
254
255        mac_pre_loop2_1  0, %xmm6
256        mac_pre_loop2_1 16, %xmm7
257        .if \nrows>=8
258        mac_pre_loop2_1 32, %xmm8
259        mac_pre_loop2_1 48, %xmm9
260        .if \nrows==12
261        mac_pre_loop2_1 64, %xmm10
262        mac_pre_loop2_1 80, %xmm11
263        .endif
264        .endif
265        .purgem   mac_pre_loop2_1
266
267#   q(1,2) = q(1,2) + x1 + y1*hh(2,2)
268#   q(2,2) = q(2,2) + x2 + y2*hh(2,2)
269
270        addq      %r8, %r10             # %r10 => q(.,2)
271
272        movddup   8(%r11,%r9), %xmm15   # hh(2,2)
273
274        .macro mac_pre_loop2_2 qoff, X, Y
275        movaps    \X, %xmm13
276        movaps    \Y, %xmm12
277        mulpd     %xmm15, %xmm12
278        addpd     %xmm12, %xmm13
279        addpd     \qoff(%r10), %xmm13
280        movaps    %xmm13, \qoff(%r10)
281        .endm
282
283        mac_pre_loop2_2  0, %xmm0, %xmm6
284        mac_pre_loop2_2 16, %xmm1, %xmm7
285        .if \nrows>=8
286        mac_pre_loop2_2 32, %xmm2, %xmm8
287        mac_pre_loop2_2 48, %xmm3, %xmm9
288        .if \nrows==12
289        mac_pre_loop2_2 64, %xmm4, %xmm10
290        mac_pre_loop2_2 80, %xmm5, %xmm11
291        .endif
292        .endif
293        .purgem   mac_pre_loop2_2
294
295#   do i=3,nb
296#      h1 = hh(i-1,1)
297#      h2 = hh(i,2)
298#      q(1,i) = q(1,i) + x1*h1 + y1*h2
299#      q(2,i) = q(2,i) + x2*h1 + y2*h2
300#   enddo
301
302        addq      $8, %r11
303        .align 16
3041:
305        cmpq %rax, %r11                 # Jump out of the loop if %r11 >= %rax
306        jge       2f
307
308        addq      %r8, %r10             # %r10 => q(.,i)
309
310        movddup   (%r11), %xmm14        # hh(i-1,1)
311        movddup   8(%r11,%r9), %xmm15   # hh(i,2)
312
313        .macro mac_loop2 qoff, X, Y
314        movaps    \X, %xmm13
315        mulpd     %xmm14, %xmm13
316        movaps    \Y, %xmm12
317        mulpd     %xmm15, %xmm12
318        addpd     %xmm12, %xmm13
319        addpd     \qoff(%r10), %xmm13
320        movaps    %xmm13, \qoff(%r10)
321        .endm
322
323        mac_loop2  0, %xmm0, %xmm6
324        mac_loop2 16, %xmm1, %xmm7
325        .if \nrows>=8
326        mac_loop2 32, %xmm2, %xmm8
327        mac_loop2 48, %xmm3, %xmm9
328        .if \nrows==12
329        mac_loop2 64, %xmm4, %xmm10
330        mac_loop2 80, %xmm5, %xmm11
331        .endif
332        .endif
333        .purgem   mac_loop2
334
335        addq      $8, %r11
336        jmp       1b
3372:
338
339#   q(1,nb+1) = q(1,nb+1) + x1*hh(nb,1)
340#   q(2,nb+1) = q(2,nb+1) + x2*hh(nb,1)
341
342        addq      %r8, %r10             # %r10 => q(.,nb+1)
343        movddup   (%r11), %xmm14
344
345        .macro mac_post_loop2 qoff, X
346        movaps    \qoff(%r10), %xmm13   # q(.,nb+1)
347        mulpd     %xmm14, \X
348        addpd     \X, %xmm13
349        movaps    %xmm13, \qoff(%r10)
350        .endm
351
352        mac_post_loop2  0, %xmm0
353        mac_post_loop2 16, %xmm1
354        .if \nrows>=8
355        mac_post_loop2 32, %xmm2
356        mac_post_loop2 48, %xmm3
357        .if \nrows==12
358        mac_post_loop2 64, %xmm4
359        mac_post_loop2 80, %xmm5
360        .endif
361        .endif
362        .purgem   mac_post_loop2
363
364        .endm
365
366#-------------------------------------------------------------------------------
367#-------------------------------------------------------------------------------
368# FORTRAN Interface:
369#
370# subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh)
371#
372#   integer, intent(in) :: nb, nq, ldq, ldh
373#   real*8, intent(inout) :: q(ldq,*)
374#   real*8, intent(in) :: hh(ldh,*)
375#
376# Parameter mapping to registers
377#   parameter 1: %rdi : q
378#   parameter 2: %rsi : hh
379#   parameter 3: %rdx : nb
380#   parameter 4: %rcx : nq
381#   parameter 5: %r8  : ldq
382#   parameter 6: %r9  : ldh
383#
384#-------------------------------------------------------------------------------
385
386#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
387#!f>  interface
388#!f>    subroutine double_hh_trafo_real_double_sse_assembly(q, hh, nb, nq, ldq, ldh) &
389#!f>      bind(C,name="double_hh_trafo_real_double_sse_assembly")
390#!f>      use, intrinsic :: iso_c_binding
391#!f>      integer(kind=c_int)  :: nb, nq, ldq, ldh
392#!f>      type(c_ptr), value   :: q
393#!f>      real(kind=c_double_complex)  :: hh(nb,6)
394#!f>    end subroutine
395#!f>  end interface
396#!f>#endif
397        .align    16,0x90
398double_hh_trafo_real_double_sse_assembly:
399
400        # Get integer parameters into corresponding registers
401
402        movslq    (%rdx), %rdx # nb
403        movslq    (%rcx), %rcx # nq
404        movslq    (%r8),  %r8  # ldq
405        movslq    (%r9),  %r9  # ldh
406
407        # Get ldq in bytes
408        addq      %r8, %r8
409        addq      %r8, %r8
410        addq      %r8, %r8 # 8*ldq, i.e. ldq in bytes
411
412        # Get ldh in bytes
413        addq      %r9, %r9
414        addq      %r9, %r9
415        addq      %r9, %r9 # 8*ldh, i.e. ldh in bytes
416
417        # set %rax to the address of hh at the end of the loops,
418        # i.e. if %rdx >= %rax we must jump out of the loop.
419        # please note: %rax = 8*%rdx + %rsi - 8
420        movq %rdx, %rax
421        addq %rax, %rax
422        addq %rax, %rax
423        addq %rax, %rax
424        addq %rsi, %rax
425        subq $8, %rax
426
427#-----------------------------------------------------------
428        # Calculate the dot product of the two Householder vectors
429
430        # decrement stack pointer to make space for s
431        subq $8, %rsp
432
433#   Fortran code:
434#   s = hh(2,2)*1
435#   do i=3,nb
436#      s = s+hh(i,2)*hh(i-1,1)
437#   enddo
438
439        movq      %rsi, %r11   # Copy address of hh
440
441        movsd     8(%r11,%r9), %xmm0 #  hh(2,2)
442        addq      $8, %r11
4431:
444        cmpq %rax, %r11
445        jge       2f
446        movsd   (%r11), %xmm14       # hh(i-1,1)
447        movsd   8(%r11,%r9), %xmm15  # hh(i,2)
448        mulsd   %xmm14, %xmm15
449        addsd   %xmm15, %xmm0
450        addq      $8, %r11
451        jmp       1b
4522:
453        movsd   %xmm0, (%rsp)   # put s on top of stack
454#-----------------------------------------------------------
455
456rloop_s:
457        cmpq      $8, %rcx   # if %rcx <= 8 jump out of loop
458        jle       rloop_e
459        hh_trafo_real 12 # transform 12 rows
460        addq      $96, %rdi  # increment q start adress by 96 bytes (6 rows)
461        subq      $12, %rcx  # decrement nq
462        jmp       rloop_s
463rloop_e:
464
465        cmpq      $4, %rcx   # if %rcx <= 4 jump to test_2
466        jle       test_4
467        hh_trafo_real 8 # transform 8 rows
468        jmp       return1
469
470test_4:
471        cmpq      $0, %rcx   # if %rcx <= 0 jump to return
472        jle       return1
473        hh_trafo_real 4 # transform 4 rows
474
475return1:
476        addq      $8, %rsp   # reset stack pointer
477        ret
478
479        .align    16,0x90
480
481#-------------------------------------------------------------------------------
482#-------------------------------------------------------------------------------
483
484        .macro hh_trafo_complex nrows
485
486        # When this macro is called, the following registers are set and must not be changed
487        # %rdi: Address of q
488        # %rsi: Address of hh
489        # %rdx: nb
490        # %rcx: Remaining rows nq
491        # %r8:  ldq in bytes
492
493        movq      %rdi, %r10   # Copy address of q
494        movq      %rsi, %r11   # Copy address of hh
495
496        # set %rax to the address of hh at the end of the loops,
497        # i.e. if %rdx >= %rax we must jump out of the loop.
498        # please note: %rax = 16*%rdx + %rsi
499        movq %rdx, %rax
500        addq %rax, %rax
501        addq %rax, %rax
502        addq %rax, %rax
503        addq %rax, %rax
504        addq %rsi, %rax
505
506#   x1 = q(1,1); y1 = 0
507#   x2 = q(2,1); y2 = 0
508#   ...
509
510        movaps      (%r10), %xmm0
511        movaps    16(%r10), %xmm1
512        xorps     %xmm6, %xmm6
513        xorps     %xmm7, %xmm7
514        .if \nrows>=4
515        movaps    32(%r10), %xmm2
516        movaps    48(%r10), %xmm3
517        xorps     %xmm8, %xmm8
518        xorps     %xmm9, %xmm9
519        .if \nrows==6
520        movaps    64(%r10), %xmm4
521        movaps    80(%r10), %xmm5
522        xorps     %xmm10, %xmm10
523        xorps     %xmm11, %xmm11
524        .endif
525        .endif
526
527#   do i=2,nb
528#      h1 = conjg(hh(i))
529#      x1 = x1 + q(1,i)*h1
530#      x2 = x2 + q(2,i)*h1
531#      ...
532#   enddo
533
534        addq      $16, %r11  # %r11 => hh(2)
535        .align 16
5361:
537        cmpq      %rax, %r11      # Jump out of the loop if %r11 >= %rax
538        jge 2f
539
540        addq      %r8, %r10       # %r10 => q(.,i)
541
542        movddup    (%r11), %xmm14 # real(hh(i))
543        movddup   8(%r11), %xmm15 # imag(hh(i))
544
545        .macro mac_loop1 qoff, X, Y
546        movaps    \qoff(%r10), %xmm13     # q(.,i)
547        movaps    %xmm13, %xmm12
548        mulpd     %xmm14, %xmm13          # q(.,i)*real(hh(i))
549        addpd     %xmm13, \X              # x1 = x1 + q(.,i)*real(hh(i))
550        mulpd     %xmm15, %xmm12          # q(.,i)*imag(hh(i))
551        addsubpd  %xmm12, \Y              # y1 = y1 -/+ q(.,i)*imag(hh(i))
552        .endm
553
554        mac_loop1   0, %xmm0, %xmm6
555        mac_loop1  16, %xmm1, %xmm7
556        .if \nrows>=4
557        mac_loop1  32, %xmm2, %xmm8
558        mac_loop1  48, %xmm3, %xmm9
559        .if \nrows==6
560        mac_loop1  64, %xmm4, %xmm10
561        mac_loop1  80, %xmm5, %xmm11
562        .endif
563        .endif
564
565        .purgem   mac_loop1
566
567        addq      $16, %r11                # %r11 => hh(i+1)
568        jmp       1b
5692:
570
571        # Now the content of the yn has to be swapped and added to xn
572        .macro mac_post_loop_1 X, Y
573        shufpd $1, \Y, \Y
574        addpd  \Y, \X
575        .endm
576
577        mac_post_loop_1  %xmm0, %xmm6
578        mac_post_loop_1  %xmm1, %xmm7
579        .if \nrows>=4
580        mac_post_loop_1  %xmm2, %xmm8
581        mac_post_loop_1  %xmm3, %xmm9
582        .if \nrows==6
583        mac_post_loop_1  %xmm4, %xmm10
584        mac_post_loop_1  %xmm5, %xmm11
585        .endif
586        .endif
587        .purgem   mac_post_loop_1
588
589#   tau1 = hh(1)
590#
591#   h1 = -tau1
592#   x1 = x1*h1; y1 = x1 with halfes exchanged
593#   x2 = x2*h1; y2 = x2 with halfes exchanged
594#   ...
595
596        movq      %rsi, %r11      # restore address of hh
597
598        xorps     %xmm14, %xmm14
599        movddup    (%r11), %xmm12 # real(hh(1))
600        subpd     %xmm12, %xmm14  #-real(hh(1))
601        xorps     %xmm15, %xmm15
602        movddup   8(%r11), %xmm12 # imag(hh(1))
603        subpd     %xmm12, %xmm15  #-imag(hh(1))
604
605        .macro mac_xform X, Y
606        movaps    \X, %xmm12
607        shufpd    $1, \X, %xmm12
608        mulpd     %xmm15, %xmm12
609        mulpd     %xmm14, \X
610        addsubpd  %xmm12, \X
611        movaps    \X, \Y          # copy to y
612        shufpd    $1, \X, \Y      # exchange halfes
613        .endm
614
615        mac_xform %xmm0, %xmm6
616        mac_xform %xmm1, %xmm7
617        .if \nrows>=4
618        mac_xform %xmm2, %xmm8
619        mac_xform %xmm3, %xmm9
620        .if \nrows==6
621        mac_xform %xmm4, %xmm10
622        mac_xform %xmm5, %xmm11
623        .endif
624        .endif
625        .purgem mac_xform
626
627#   q(1,1) = q(1,1) + x1
628#   q(2,1) = q(2,1) + x2
629#   ...
630
631        movq      %rdi, %r10      # restore address of q
632        .macro mac_pre_loop2 qoff, X
633        movaps    \qoff(%r10), %xmm13     # q(.,1)
634        addpd     \X, %xmm13
635        movaps    %xmm13, \qoff(%r10)
636        .endm
637
638        mac_pre_loop2   0, %xmm0
639        mac_pre_loop2  16, %xmm1
640        .if \nrows>=4
641        mac_pre_loop2  32, %xmm2
642        mac_pre_loop2  48, %xmm3
643        .if \nrows==6
644        mac_pre_loop2  64, %xmm4
645        mac_pre_loop2  80, %xmm5
646        .endif
647        .endif
648        .purgem mac_pre_loop2
649
650#   do i=2,nb
651#      h1 = hh(i)
652#      q(1,i) = q(1,i) + x1*h1
653#      q(2,i) = q(2,i) + x2*h1
654#      ...
655#   enddo
656
657        addq      $16, %r11
658        .align 16
6591:
660        cmpq      %rax, %r11      # Jump out of the loop if %r11 >= %rax
661        jge 2f
662
663        addq      %r8, %r10       # %r10 => q(.,i)
664
665        movddup    (%r11), %xmm14 # real(hh(i))
666        movddup   8(%r11), %xmm15 # imag(hh(i))
667
668        .macro mac_loop2 qoff, X, Y
669        movaps    \X, %xmm13
670        mulpd     %xmm14, %xmm13
671        movaps    \Y, %xmm12
672        mulpd     %xmm15, %xmm12
673        addsubpd  %xmm12, %xmm13
674        addpd     \qoff(%r10), %xmm13
675        movaps    %xmm13, \qoff(%r10)
676        .endm
677
678        mac_loop2   0, %xmm0, %xmm6
679        mac_loop2  16, %xmm1, %xmm7
680        .if \nrows>=4
681        mac_loop2  32, %xmm2, %xmm8
682        mac_loop2  48, %xmm3, %xmm9
683        .if \nrows==6
684        mac_loop2  64, %xmm4, %xmm10
685        mac_loop2  80, %xmm5, %xmm11
686        .endif
687        .endif
688        .purgem   mac_loop2
689
690        addq      $16, %r11
691        jmp       1b
6922:
693        .endm
694
695#-------------------------------------------------------------------------------
696#-------------------------------------------------------------------------------
697# FORTRAN Interface:
698#
699# subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq)
700#
701#   integer, intent(in) :: nb, nq, ldq
702#   complex*16, intent(inout) :: q(ldq,*)
703#   complex*16, intent(in) :: hh(*)
704#
705# Parameter mapping to registers
706#   parameter 1: %rdi : q
707#   parameter 2: %rsi : hh
708#   parameter 3: %rdx : nb
709#   parameter 4: %rcx : nq
710#   parameter 5: %r8  : ldq
711#
712#-------------------------------------------------------------------------------
713#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
714#!f>  interface
715#!f>    subroutine single_hh_trafo_complex_double_sse_assembly(q, hh, nb, nq, ldq) &
716#!f>      bind(C,name="single_hh_trafo_complex_double_sse_assembly")
717#!f>      use, intrinsic :: iso_c_binding
718#!f>      integer(kind=c_int)              :: nb, nq, ldq
719#!f>      !complex(kind=c_double_complex)  :: q(*)
720#!f>      type(c_ptr), value               :: q
721#!f>      complex(kind=c_double_complex)   :: hh(nb,2)
722#!f>    end subroutine
723#!f>  end interface
724#!f>#endif
725        .align    16,0x90
726single_hh_trafo_complex_double_sse_assembly:
727
728        # Get integer parameters into corresponding registers
729
730        movslq    (%rdx), %rdx # nb
731        movslq    (%rcx), %rcx # nq
732        movslq    (%r8),  %r8  # ldq
733
734        # Get ldq in bytes
735        addq      %r8, %r8
736        addq      %r8, %r8
737        addq      %r8, %r8
738        addq      %r8, %r8 # 16*ldq, i.e. ldq in bytes
739
740cloop_s:
741        cmpq      $4, %rcx   # if %rcx <= 4 jump out of loop
742        jle       cloop_e
743        hh_trafo_complex 6 # transform 6 rows
744        addq      $96, %rdi  # increment q start adress by 96 bytes (6 rows)
745        subq      $6,  %rcx  # decrement nq
746        jmp       cloop_s
747cloop_e:
748
749        cmpq      $2, %rcx   # if %rcx <= 2 jump to test_2
750        jle       test_2
751        hh_trafo_complex 4 # transform 4 rows
752        jmp       return2
753
754test_2:
755        cmpq      $0, %rcx   # if %rcx <= 0 jump to return
756        jle       return2
757        hh_trafo_complex 2 # transform 2 rows
758
759return2:
760        ret
761
762        .align    16,0x90
763#-------------------------------------------------------------------------------
764#-------------------------------------------------------------------------------
765#-------------------------------------------------------------------------------
766
767# Declare that we do not need an executable stack here
768	.section	.note.GNU-stack,"",@progbits
769