1 // This file is part of ELPA.
2 //
3 // The ELPA library was originally created by the ELPA consortium,
4 // consisting of the following organizations:
5 //
6 // - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 // Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 // - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 // Informatik,
10 // - Technische Universität München, Lehrstuhl für Informatik mit
11 // Schwerpunkt Wissenschaftliches Rechnen ,
12 // - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 // - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 // Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
15 // and
16 // - IBM Deutschland GmbH
17 //
18 //
19 // This particular source code file contains additions, changes and
20 // enhancements authored by Intel Corporation which is not part of
21 // the ELPA consortium.
22 //
23 // More information can be found here:
24 // http://elpa.mpcdf.mpg.de/
25 //
26 // ELPA is free software: you can redistribute it and/or modify
27 // it under the terms of the version 3 of the license of the
28 // GNU Lesser General Public License as published by the Free
29 // Software Foundation.
30 //
31 // ELPA is distributed in the hope that it will be useful,
32 // but WITHOUT ANY WARRANTY; without even the implied warranty of
33 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 // GNU Lesser General Public License for more details.
35 //
36 // You should have received a copy of the GNU Lesser General Public License
37 // along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 //
39 // ELPA reflects a substantial effort on the part of the original
40 // ELPA consortium, and we ask you to respect the spirit of the
41 // license that we chose: i.e., please contribute any changes you
42 // may have back to the original ELPA library distribution, and keep
43 // any derivatives of ELPA under the same license that we chose for
44 // the original distribution, the GNU Lesser General Public License.
45 //
46 // Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
47 //
48 #include "config-f90.h"
49
50 #define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h)
51 #define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h
52
53 #define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g)
54 #define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g
55
56 #define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f)
57 #define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f
58
59 #define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e)
60 #define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e
61
62 #define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d)
63 #define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d
64
65 #define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
66 #define CONCAT2_3ARGS(a, b, c) a ## b ## c
67
68 //define instruction set numbers
69 #define SSE_128 128
70 #define AVX_256 256
71 #define AVX_512 512
72 #define NEON_ARCH64_128 1285
73
74 #if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512
75 #include <x86intrin.h>
76 #ifdef BLOCK2
77 #if VEC_SET == SSE_128
78 #include <pmmintrin.h>
79 #endif
80 #endif
81
82 #define __forceinline __attribute__((always_inline))
83
84 #endif
85
86
87 #include <complex.h>
88
89 #include <stdio.h>
90 #include <stdlib.h>
91
92 #ifdef BLOCK2
93 #define PREFIX double
94 #define BLOCK 2
95 #endif
96
97 #ifdef BLOCK1
98 #define PREFIX single
99 #define BLOCK 1
100 #endif
101
102 #if VEC_SET == SSE_128
103 #define SIMD_SET SSE
104 #endif
105
106 #if VEC_SET == AVX_256
107 #define SIMD_SET AVX_AVX2
108 #endif
109
110 #if VEC_SET == AVX_512
111 #define SIMD_SET AVX512
112 #endif
113
114
115 #if VEC_SET == SSE_128
116
117 #ifdef DOUBLE_PRECISION_COMPLEX
118 #define offset 2
119 #define __SIMD_DATATYPE __m128d
120 #define _SIMD_LOAD _mm_load_pd
121 #define _SIMD_LOADU _mm_loadu_pd
122 #define _SIMD_STORE _mm_store_pd
123 #define _SIMD_STOREU _mm_storeu_pd
124 #define _SIMD_MUL _mm_mul_pd
125 #define _SIMD_ADD _mm_add_pd
126 #define _SIMD_XOR _mm_xor_pd
127 #define _SIMD_ADDSUB _mm_addsub_pd
128 #define _SIMD_SHUFFLE _mm_shuffle_pd
129 #define _SHUFFLE _MM_SHUFFLE2(0,1)
130
131 #ifdef __ELPA_USE_FMA__
132 #define _SIMD_FMSUBADD _mm_maddsub_pd
133 #endif
134 #endif /* DOUBLE_PRECISION_COMPLEX */
135
136 #ifdef SINGLE_PRECISION_COMPLEX
137 #define offset 4
138 #define __SIMD_DATATYPE __m128
139 #define _SIMD_LOAD _mm_load_ps
140 #define _SIMD_LOADU _mm_loadu_ps
141 #define _SIMD_STORE _mm_store_ps
142 #define _SIMD_STOREU _mm_storeu_ps
143 #define _SIMD_MUL _mm_mul_ps
144 #define _SIMD_ADD _mm_add_ps
145 #define _SIMD_XOR _mm_xor_ps
146 #define _SIMD_ADDSUB _mm_addsub_ps
147 #define _SIMD_SHUFFLE _mm_shuffle_ps
148 #define _SHUFFLE 0xb1
149
150 #ifdef __ELPA_USE_FMA__
151 #define _SIMD_FMSUBADD _mm_maddsub_ps
152 #endif
153
154 #endif /* SINGLE_PRECISION_COMPLEX */
155
156 #endif /* VEC_SET == SSE_128 */
157
158 #if VEC_SET == AVX_256
159
160 #ifdef DOUBLE_PRECISION_COMPLEX
161 #define offset 4
162 #define __SIMD_DATATYPE __m256d
163 #define _SIMD_LOAD _mm256_load_pd
164 #define _SIMD_LOADU 1
165 #define _SIMD_STORE _mm256_store_pd
166 #define _SIMD_STOREU 1
167 #define _SIMD_MUL _mm256_mul_pd
168 #define _SIMD_ADD _mm256_add_pd
169 #define _SIMD_XOR _mm256_xor_pd
170 #define _SIMD_BROADCAST _mm256_broadcast_sd
171 #define _SIMD_SET1 _mm256_set1_pd
172 #define _SIMD_ADDSUB _mm256_addsub_pd
173 #define _SIMD_SHUFFLE _mm256_shuffle_pd
174 #define _SHUFFLE 0x5
175
176 #ifdef HAVE_AVX2
177
178 #ifdef __FMA4__
179 #define __ELPA_USE_FMA__
180 #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
181 #define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
182 #endif
183
184 #ifdef __AVX2__
185 #define __ELPA_USE_FMA__
186 #define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
187 #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
188 #endif
189
190 #define _SIMD_FMADDSUB _mm256_FMADDSUB_pd
191 #define _SIMD_FMSUBADD _mm256_FMSUBADD_pd
192 #endif /* HAVE_AVX2 */
193
194 #endif /* DOUBLE_PRECISION_COMPLEX */
195
196 #ifdef SINGLE_PRECISION_COMPLEX
197 #define offset 8
198 #define __SIMD_DATATYPE __m256
199 #define _SIMD_LOAD _mm256_load_ps
200 #define _SIMD_LOADU 1
201 #define _SIMD_STORE _mm256_store_ps
202 #define _SIMD_STOREU 1
203 #define _SIMD_MUL _mm256_mul_ps
204 #define _SIMD_ADD _mm256_add_ps
205 #define _SIMD_XOR _mm256_xor_ps
206 #define _SIMD_BROADCAST _mm256_broadcast_ss
207 #define _SIMD_SET1 _mm256_set1_ps
208 #define _SIMD_ADDSUB _mm256_addsub_ps
209 #define _SIMD_SHUFFLE _mm256_shuffle_ps
210 #define _SHUFFLE 0xb1
211
212 #ifdef HAVE_AVX2
213
214 #ifdef __FMA4__
215 #define __ELPA_USE_FMA__
216 #define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
217 #define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
218 #endif
219
220 #ifdef __AVX2__
221 #define __ELPA_USE_FMA__
222 #define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
223 #define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
224 #endif
225
226 #define _SIMD_FMADDSUB _mm256_FMADDSUB_ps
227 #define _SIMD_FMSUBADD _mm256_FMSUBADD_ps
228 #endif /* HAVE_AVX2 */
229
230 #endif /* SINGLE_PRECISION_COMPLEX */
231
232 #endif /* VEC_SET == AVX_256 */
233
234 #if VEC_SET == AVX_512
235
236 #ifdef DOUBLE_PRECISION_COMPLEX
237 #define offset 8
238 #define __SIMD_DATATYPE __m512d
239 #define _SIMD_LOAD _mm512_load_pd
240 #define _SIMD_LOADU 1
241 #define _SIMD_STORE _mm512_store_pd
242 #define _SIMD_STOREU 1
243 #define _SIMD_MUL _mm512_mul_pd
244 #define _SIMD_ADD _mm512_add_pd
245 #ifdef HAVE_AVX512_XEON
246 #define _SIMD_XOR _mm512_xor_pd
247 #endif
248 #define _SIMD_BROADCAST 1
249 #define _SIMD_SET1 _mm512_set1_pd
250 #define _SIMD_SET _mm512_set_pd
251 #define _SIMD_XOR_EPI _mm512_xor_epi64
252 #define _SIMD_ADDSUB 1
253 #define _SIMD_SHUFFLE _mm512_shuffle_pd
254 #define _SIMD_MASK_STOREU _mm512_mask_storeu_pd
255 #define _SHUFFLE 0x55
256
257 #ifdef HAVE_AVX512
258 #define __ELPA_USE_FMA__
259 #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
260 #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
261
262 #define _SIMD_FMADDSUB _mm512_FMADDSUB_pd
263 #define _SIMD_FMSUBADD _mm512_FMSUBADD_pd
264 #endif /* HAVE_AVX512 */
265
266 #endif /* DOUBLE_PRECISION_COMPLEX */
267
268 #ifdef SINGLE_PRECISION_COMPLEX
269 #define offset 16
270 #define __SIMD_DATATYPE __m512
271 #define _SIMD_LOAD _mm512_load_ps
272 #define _SIMD_LOADU 1
273 #define _SIMD_STORE _mm512_store_ps
274 #define _SIMD_STOREU 1
275 #define _SIMD_MUL _mm512_mul_ps
276 #define _SIMD_ADD _mm512_add_ps
277 #ifdef HAVE_AVX512_XEON
278 #define _SIMD_XOR _mm512_xor_ps
279 #endif
280 #define _SIMD_BROADCAST 1
281 #define _SIMD_SET1 _mm512_set1_ps
282 #define _SIMD_SET _mm512_set_ps
283 #define _SIMD_ADDSUB 1
284 #define _SIMD_SHUFFLE _mm512_shuffle_ps
285 #define _SIMD_MASK_STOREU _mm512_mask_storeu_ps
286 #define _SIMD_XOR_EPI _mm512_xor_epi32
287 #define _SHUFFLE 0xb1
288
289 #ifdef HAVE_AVX512
290
291 #define __ELPA_USE_FMA__
292 #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
293 #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
294
295 #define _SIMD_FMADDSUB _mm512_FMADDSUB_ps
296 #define _SIMD_FMSUBADD _mm512_FMSUBADD_ps
297 #endif /* HAVE_AVX512 */
298
299 #endif /* SINGLE_PRECISION_COMPLEX */
300
301 #endif /* VEC_SET == AVX_512 */
302
303
304
305
306 #define __forceinline __attribute__((always_inline))
307
308 #ifdef HAVE_SSE_INTRINSICS
309 #undef __AVX__
310 #endif
311
312 #ifdef DOUBLE_PRECISION_COMPLEX
313 #define WORD_LENGTH double
314 #define DATA_TYPE double complex
315 #define DATA_TYPE_PTR double complex*
316 #define DATA_TYPE_REAL double
317 #define DATA_TYPE_REAL_PTR double*
318 #endif
319
320 #ifdef SINGLE_PRECISION_COMPLEX
321 #define WORD_LENGTH single
322 #define DATA_TYPE float complex
323 #define DATA_TYPE_PTR float complex*
324 #define DATA_TYPE_REAL float
325 #define DATA_TYPE_REAL_PTR float*
326 #endif
327
328
329 //Forward declaration
330
331 #if VEC_SET == SSE_128
332 #ifdef DOUBLE_PRECISION_COMPLEX
333 #undef ROW_LENGTH
334 #define ROW_LENGTH 6
335 #endif
336 #ifdef SINGLE_PRECISION_COMPLEX
337 #undef ROW_LENGTH
338 #define ROW_LENGTH 12
339 #endif
340 #endif /* VEC_SET == SSE_128 */
341
342 #if VEC_SET == AVX_256
343 #ifdef DOUBLE_PRECISION_COMPLEX
344 #undef ROW_LENGTH
345 #define ROW_LENGTH 12
346 #endif
347 #ifdef SINGLE_PRECISION_COMPLEX
348 #undef ROW_LENGTH
349 #define ROW_LENGTH 24
350 #endif
351 #endif /* VEC_SET == AVX_256 */
352
353 #if VEC_SET == AVX_512
354 #ifdef DOUBLE_PRECISION_COMPLEX
355 #undef ROW_LENGTH
356 #define ROW_LENGTH 24
357 #endif
358 #ifdef SINGLE_PRECISION_COMPLEX
359 #undef ROW_LENGTH
360 #define ROW_LENGTH 48
361 #endif
362 #endif /* VEC_SET == AVX_512 */
363 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
364 #ifdef BLOCK1
365 );
366 #endif
367 #ifdef BLOCK2
368 ,int ldh, DATA_TYPE s);
369 #endif
370
371 #if VEC_SET == SSE_128
372 #ifdef DOUBLE_PRECISION_COMPLEX
373 #undef ROW_LENGTH
374 #define ROW_LENGTH 5
375 #endif
376 #ifdef SINGLE_PRECISION_COMPLEX
377 #undef ROW_LENGTH
378 #define ROW_LENGTH 10
379 #endif
380 #endif /* VEC_SET == SSE_128 */
381
382 #if VEC_SET == AVX_256
383 #ifdef DOUBLE_PRECISION_COMPLEX
384 #undef ROW_LENGTH
385 #define ROW_LENGTH 10
386 #endif
387 #ifdef SINGLE_PRECISION_COMPLEX
388 #undef ROW_LENGTH
389 #define ROW_LENGTH 20
390 #endif
391 #endif /* VEC_SET == AVX_256 */
392
393 #if VEC_SET == AVX_512
394 #ifdef DOUBLE_PRECISION_COMPLEX
395 #undef ROW_LENGTH
396 #define ROW_LENGTH 20
397 #endif
398 #ifdef SINGLE_PRECISION_COMPLEX
399 #undef ROW_LENGTH
400 #define ROW_LENGTH 40
401 #endif
402 #endif /* VEC_SET == AVX_512 */
403
404 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
405 #ifdef BLOCK1
406 );
407 #endif
408 #ifdef BLOCK2
409 ,int ldh, DATA_TYPE s);
410 #endif
411
412
413 #if VEC_SET == SSE_128
414 #ifdef DOUBLE_PRECISION_COMPLEX
415 #undef ROW_LENGTH
416 #define ROW_LENGTH 4
417 #endif
418 #ifdef SINGLE_PRECISION_COMPLEX
419 #undef ROW_LENGTH
420 #define ROW_LENGTH 8
421 #endif
422 #endif /* VEC_SET == SSE_128 */
423
424 #if VEC_SET == AVX_256
425 #ifdef DOUBLE_PRECISION_COMPLEX
426 #undef ROW_LENGTH
427 #define ROW_LENGTH 8
428 #endif
429 #ifdef SINGLE_PRECISION_COMPLEX
430 #undef ROW_LENGTH
431 #define ROW_LENGTH 16
432 #endif
433 #endif /* VEC_SET == AVX_256 */
434
435 #if VEC_SET == AVX_512
436 #ifdef DOUBLE_PRECISION_COMPLEX
437 #undef ROW_LENGTH
438 #define ROW_LENGTH 16
439 #endif
440 #ifdef SINGLE_PRECISION_COMPLEX
441 #undef ROW_LENGTH
442 #define ROW_LENGTH 32
443 #endif
444 #endif /* VEC_SET == AVX_512 */
445
446 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
447 #ifdef BLOCK1
448 );
449 #endif
450 #ifdef BLOCK2
451 ,int ldh, DATA_TYPE s);
452 #endif
453
454 #if VEC_SET == SSE_128
455 #ifdef DOUBLE_PRECISION_COMPLEX
456 #undef ROW_LENGTH
457 #define ROW_LENGTH 3
458 #endif
459 #ifdef SINGLE_PRECISION_COMPLEX
460 #undef ROW_LENGTH
461 #define ROW_LENGTH 6
462 #endif
463 #endif /* VEC_SET == SSE_128 */
464
465 #if VEC_SET == AVX_256
466 #ifdef DOUBLE_PRECISION_COMPLEX
467 #undef ROW_LENGTH
468 #define ROW_LENGTH 6
469 #endif
470 #ifdef SINGLE_PRECISION_COMPLEX
471 #undef ROW_LENGTH
472 #define ROW_LENGTH 12
473 #endif
474 #endif /* VEC_SET == AVX_256 */
475
476 #if VEC_SET == AVX_512
477 #ifdef DOUBLE_PRECISION_COMPLEX
478 #undef ROW_LENGTH
479 #define ROW_LENGTH 12
480 #endif
481 #ifdef SINGLE_PRECISION_COMPLEX
482 #undef ROW_LENGTH
483 #define ROW_LENGTH 24
484 #endif
485 #endif /* VEC_SET == AVX_512 */
486
487 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
488 #ifdef BLOCK1
489 );
490 #endif
491 #ifdef BLOCK2
492 ,int ldh, DATA_TYPE s);
493 #endif
494
495 #if VEC_SET == SSE_128
496 #ifdef DOUBLE_PRECISION_COMPLEX
497 #undef ROW_LENGTH
498 #define ROW_LENGTH 2
499 #endif
500 #ifdef SINGLE_PRECISION_COMPLEX
501 #undef ROW_LENGTH
502 #define ROW_LENGTH 4
503 #endif
504 #endif /* VEC_SET == SSE_128 */
505
506 #if VEC_SET == AVX_256
507 #ifdef DOUBLE_PRECISION_COMPLEX
508 #undef ROW_LENGTH
509 #define ROW_LENGTH 4
510 #endif
511 #ifdef SINGLE_PRECISION_COMPLEX
512 #undef ROW_LENGTH
513 #define ROW_LENGTH 8
514 #endif
515 #endif /* VEC_SET == AVX_256 */
516
517 #if VEC_SET == AVX_512
518 #ifdef DOUBLE_PRECISION_COMPLEX
519 #undef ROW_LENGTH
520 #define ROW_LENGTH 8
521 #endif
522 #ifdef SINGLE_PRECISION_COMPLEX
523 #undef ROW_LENGTH
524 #define ROW_LENGTH 16
525 #endif
526 #endif /* VEC_SET == AVX_512 */
527
528 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
529 #ifdef BLOCK1
530 );
531 #endif
532 #ifdef BLOCK2
533 ,int ldh, DATA_TYPE s);
534 #endif
535
536 #if VEC_SET == SSE_128
537 #ifdef DOUBLE_PRECISION_COMPLEX
538 #undef ROW_LENGTH
539 #define ROW_LENGTH 1
540 #endif
541 #ifdef SINGLE_PRECISION_COMPLEX
542 #undef ROW_LENGTH
543 #define ROW_LENGTH 2
544 #endif
545 #endif /* VEC_SET == SSE_128 */
546
547 #if VEC_SET == AVX_256
548 #ifdef DOUBLE_PRECISION_COMPLEX
549 #undef ROW_LENGTH
550 #define ROW_LENGTH 2
551 #endif
552 #ifdef SINGLE_PRECISION_COMPLEX
553 #undef ROW_LENGTH
554 #define ROW_LENGTH 4
555 #endif
556 #endif /* VEC_SET == AVX_256 */
557
558 #if VEC_SET == AVX_512
559 #ifdef DOUBLE_PRECISION_COMPLEX
560 #undef ROW_LENGTH
561 #define ROW_LENGTH 4
562 #endif
563 #ifdef SINGLE_PRECISION_COMPLEX
564 #undef ROW_LENGTH
565 #define ROW_LENGTH 8
566 #endif
567 #endif /* VEC_SET == AVX_512 */
568
569 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
570 #ifdef BLOCK1
571 );
572 #endif
573 #ifdef BLOCK2
574 ,int ldh, DATA_TYPE s);
575 #endif
576
577
578 /*
579 !f>#ifdef HAVE_SSE_INTRINSICS
580 !f> interface
581 !f> subroutine single_hh_trafo_complex_SSE_1hv_double(q, hh, pnb, pnq, pldq) &
582 !f> bind(C, name="single_hh_trafo_complex_SSE_1hv_double")
583 !f> use, intrinsic :: iso_c_binding
584 !f> integer(kind=c_int) :: pnb, pnq, pldq
585 !f> ! complex(kind=c_double_complex) :: q(*)
586 !f> type(c_ptr), value :: q
587 !f> complex(kind=c_double_complex) :: hh(pnb,2)
588 !f> end subroutine
589 !f> end interface
590 !f>#endif
591 */
592
593 /*
594 !f>#ifdef HAVE_SSE_INTRINSICS
595 !f> interface
596 !f> subroutine single_hh_trafo_complex_SSE_1hv_single(q, hh, pnb, pnq, pldq) &
597 !f> bind(C, name="single_hh_trafo_complex_SSE_1hv_single")
598 !f> use, intrinsic :: iso_c_binding
599 !f> integer(kind=c_int) :: pnb, pnq, pldq
600 !f> ! complex(kind=c_float_complex) :: q(*)
601 !f> type(c_ptr), value :: q
602 !f> complex(kind=c_float_complex) :: hh(pnb,2)
603 !f> end subroutine
604 !f> end interface
605 !f>#endif
606 */
607
608
609 /*
610 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
611 !f> interface
612 !f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_double(q, hh, pnb, pnq, pldq) &
613 !f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_double")
614 !f> use, intrinsic :: iso_c_binding
615 !f> integer(kind=c_int) :: pnb, pnq, pldq
616 !f> ! complex(kind=c_double_complex) :: q(*)
617 !f> type(c_ptr), value :: q
618 !f> complex(kind=c_double_complex) :: hh(pnb,2)
619 !f> end subroutine
620 !f> end interface
621 !f>#endif
622 */
623
624 /*
625 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
626 !f> interface
627 !f> subroutine single_hh_trafo_complex_AVX_AVX2_1hv_single(q, hh, pnb, pnq, pldq) &
628 !f> bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_single")
629 !f> use, intrinsic :: iso_c_binding
630 !f> integer(kind=c_int) :: pnb, pnq, pldq
631 !f> ! complex(kind=c_float_complex) :: q(*)
632 !f> type(c_ptr), value :: q
633 !f> complex(kind=c_float_complex) :: hh(pnb,2)
634 !f> end subroutine
635 !f> end interface
636 !f>#endif
637 */
638
639 /*
640 !f>#if defined(HAVE_AVX512)
641 !f> interface
642 !f> subroutine single_hh_trafo_complex_AVX512_1hv_double(q, hh, pnb, pnq, pldq) &
643 !f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_double")
644 !f> use, intrinsic :: iso_c_binding
645 !f> integer(kind=c_int) :: pnb, pnq, pldq
646 !f> ! complex(kind=c_double_complex) :: q(*)
647 !f> type(c_ptr), value :: q
648 !f> complex(kind=c_double_complex) :: hh(pnb,2)
649 !f> end subroutine
650 !f> end interface
651 !f>#endif
652 */
653
654 /*
655 !f>#if defined(HAVE_AVX512)
656 !f> interface
657 !f> subroutine single_hh_trafo_complex_AVX512_1hv_single(q, hh, pnb, pnq, pldq) &
658 !f> bind(C, name="single_hh_trafo_complex_AVX512_1hv_single")
659 !f> use, intrinsic :: iso_c_binding
660 !f> integer(kind=c_int) :: pnb, pnq, pldq
661 !f> ! complex(kind=c_float_complex) :: q(*)
662 !f> type(c_ptr), value :: q
663 !f> complex(kind=c_float_complex) :: hh(pnb,2)
664 !f> end subroutine
665 !f> end interface
666 !f>#endif
667 */
668
669
670 /*
671 !f>#ifdef HAVE_SSE_INTRINSICS
672 !f> interface
673 !f> subroutine double_hh_trafo_complex_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
674 !f> bind(C, name="double_hh_trafo_complex_SSE_2hv_double")
675 !f> use, intrinsic :: iso_c_binding
676 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
677 !f> ! complex(kind=c_double_complex) :: q(*)
678 !f> type(c_ptr), value :: q
679 !f> complex(kind=c_double_complex) :: hh(pnb,2)
680 !f> end subroutine
681 !f> end interface
682 !f>#endif
683 */
684
685 /*
686 !f>#ifdef HAVE_SSE_INTRINSICS
687 !f> interface
688 !f> subroutine double_hh_trafo_complex_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
689 !f> bind(C, name="double_hh_trafo_complex_SSE_2hv_single")
690 !f> use, intrinsic :: iso_c_binding
691 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
692 !f> ! complex(kind=c_float_complex) :: q(*)
693 !f> type(c_ptr), value :: q
694 !f> complex(kind=c_float_complex) :: hh(pnb,2)
695 !f> end subroutine
696 !f> end interface
697 !f>#endif
698 */
699
700 /*
701 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
702 !f> interface
703 !f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
704 !f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_double")
705 !f> use, intrinsic :: iso_c_binding
706 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
707 !f> ! complex(kind=c_double_complex) :: q(*)
708 !f> type(c_ptr), value :: q
709 !f> complex(kind=c_double_complex) :: hh(pnb,2)
710 !f> end subroutine
711 !f> end interface
712 !f>#endif
713 */
714
715 /*
716 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
717 !f> interface
718 !f> subroutine double_hh_trafo_complex_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
719 !f> bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_single")
720 !f> use, intrinsic :: iso_c_binding
721 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
722 !f> ! complex(kind=c_float_complex) :: q(*)
723 !f> type(c_ptr), value :: q
724 !f> complex(kind=c_float_complex) :: hh(pnb,2)
725 !f> end subroutine
726 !f> end interface
727 !f>#endif
728 */
729
730 /*
731 !f>#if defined(HAVE_AVX512)
732 !f> interface
733 !f> subroutine double_hh_trafo_complex_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
734 !f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_double")
735 !f> use, intrinsic :: iso_c_binding
736 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
737 !f> ! complex(kind=c_double_complex) :: q(*)
738 !f> type(c_ptr), value :: q
739 !f> complex(kind=c_double_complex) :: hh(pnb,2)
740 !f> end subroutine
741 !f> end interface
742 !f>#endif
743 */
744
745 /*
746 !f>#if defined(HAVE_AVX512)
747 !f> interface
748 !f> subroutine double_hh_trafo_complex_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
749 !f> bind(C, name="double_hh_trafo_complex_AVX512_2hv_single")
750 !f> use, intrinsic :: iso_c_binding
751 !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
752 !f> ! complex(kind=c_float_complex) :: q(*)
753 !f> type(c_ptr), value :: q
754 !f> complex(kind=c_float_complex) :: hh(pnb,2)
755 !f> end subroutine
756 !f> end interface
757 !f>#endif
758 */
759
760
CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)761 void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
762 #ifdef BLOCK1
763 )
764 #endif
765 #ifdef BLOCK2
766 ,int* pldh)
767 #endif
768 {
769
770 int i, worked_on;
771 int nb = *pnb;
772 int nq = *pldq;
773 int ldq = *pldq;
774 #ifdef BLOCK2
775 int ldh = *pldh;
776
777 DATA_TYPE s = conj(hh[(ldh)+1])*1.0;
778
779 for (i = 2; i < nb; i++)
780 {
781 s += hh[i-1] * conj(hh[(i+ldh)]);
782 }
783 #endif
784
785 worked_on = 0;
786
787 #ifdef BLOCK1
788
789 #if VEC_SET == SSE_128
790 #ifdef DOUBLE_PRECISION_COMPLEX
791 #define ROW_LENGTH 6
792 #define STEP_SIZE 6
793 #define UPPER_BOUND 5
794 #endif
795 #ifdef SINGLE_PRECISION_COMPLEX
796 #define ROW_LENGTH 12
797 #define STEP_SIZE 12
798 #define UPPER_BOUND 10
799 #endif
800 #endif /* VEC_SET == SSE_128 */
801
802 #if VEC_SET == AVX_256
803 #ifdef DOUBLE_PRECISION_COMPLEX
804 #define ROW_LENGTH 12
805 #define STEP_SIZE 12
806 #define UPPER_BOUND 10
807 #endif
808 #ifdef SINGLE_PRECISION_COMPLEX
809 #define ROW_LENGTH 24
810 #define STEP_SIZE 24
811 #define UPPER_BOUND 20
812 #endif
813 #endif /* VEC_SET == AVX_256 */
814
815 #if VEC_SET == AVX_512
816 #ifdef DOUBLE_PRECISION_COMPLEX
817 #define ROW_LENGTH 24
818 #define STEP_SIZE 24
819 #define UPPER_BOUND 20
820 #endif
821 #ifdef SINGLE_PRECISION_COMPLEX
822 #define ROW_LENGTH 48
823 #define STEP_SIZE 48
824 #define UPPER_BOUND 40
825 #endif
826 #endif /* VEC_SET == AVX_512 */
827
828
829 for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
830 {
831
832 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
833 worked_on += ROW_LENGTH;
834 }
835
836 if (nq == i) {
837 return;
838 }
839
840 #if VEC_SET == SSE_128
841 #undef ROW_LENGTH
842 #ifdef DOUBLE_PRECISION_COMPLEX
843 #define ROW_LENGTH 5
844 #endif
845 #ifdef SINGLE_PRECISION_COMPLEX
846 #define ROW_LENGTH 10
847 #endif
848 #endif /* VEC_SET == SSE_128 */
849
850 #if VEC_SET == AVX_256
851 #undef ROW_LENGTH
852 #ifdef DOUBLE_PRECISION_COMPLEX
853 #define ROW_LENGTH 10
854 #endif
855 #ifdef SINGLE_PRECISION_COMPLEX
856 #define ROW_LENGTH 20
857 #endif
858 #endif /* VEC_SET == AVX_256 */
859
860 #if VEC_SET == AVX_512
861 #undef ROW_LENGTH
862 #ifdef DOUBLE_PRECISION_COMPLEX
863 #define ROW_LENGTH 20
864 #endif
865 #ifdef SINGLE_PRECISION_COMPLEX
866 #define ROW_LENGTH 40
867 #endif
868 #endif /* VEC_SET == AVX_512 */
869
870 if (nq-i == ROW_LENGTH)
871 {
872 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
873 worked_on += ROW_LENGTH;
874 }
875
876 #if VEC_SET == SSE_128
877 #undef ROW_LENGTH
878 #ifdef DOUBLE_PRECISION_COMPLEX
879 #define ROW_LENGTH 4
880 #endif
881 #ifdef SINGLE_PRECISION_COMPLEX
882 #define ROW_LENGTH 8
883 #endif
884 #endif /* VEC_SET == SSE_128 */
885
886 #if VEC_SET == AVX_256
887 #undef ROW_LENGTH
888 #ifdef DOUBLE_PRECISION_COMPLEX
889 #define ROW_LENGTH 8
890 #endif
891 #ifdef SINGLE_PRECISION_COMPLEX
892 #define ROW_LENGTH 16
893 #endif
894 #endif /* VEC_SET == AVX_256 */
895
896 #if VEC_SET == AVX_512
897 #undef ROW_LENGTH
898 #ifdef DOUBLE_PRECISION_COMPLEX
899 #define ROW_LENGTH 16
900 #endif
901 #ifdef SINGLE_PRECISION_COMPLEX
902 #define ROW_LENGTH 32
903 #endif
904 #endif /* VEC_SET == AVX_512 */
905
906 if (nq-i == ROW_LENGTH)
907 {
908 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
909 worked_on += ROW_LENGTH;
910 }
911
912 #if VEC_SET == SSE_128
913 #undef ROW_LENGTH
914 #ifdef DOUBLE_PRECISION_COMPLEX
915 #define ROW_LENGTH 3
916 #endif
917 #ifdef SINGLE_PRECISION_COMPLEX
918 #define ROW_LENGTH 6
919 #endif
920 #endif /* VEC_SET == SSE_128 */
921
922 #if VEC_SET == AVX_256
923 #undef ROW_LENGTH
924 #ifdef DOUBLE_PRECISION_COMPLEX
925 #define ROW_LENGTH 6
926 #endif
927 #ifdef SINGLE_PRECISION_COMPLEX
928 #define ROW_LENGTH 12
929 #endif
930 #endif /* VEC_SET == AVX_256 */
931
932 #if VEC_SET == AVX_512
933 #undef ROW_LENGTH
934 #ifdef DOUBLE_PRECISION_COMPLEX
935 #define ROW_LENGTH 12
936 #endif
937 #ifdef SINGLE_PRECISION_COMPLEX
938 #define ROW_LENGTH 24
939 #endif
940 #endif /* VEC_SET == AVX_512 */
941
942 if (nq-i == ROW_LENGTH)
943 {
944 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
945 worked_on += ROW_LENGTH;
946 }
947
948 #if VEC_SET == SSE_128
949 #undef ROW_LENGTH
950 #ifdef DOUBLE_PRECISION_COMPLEX
951 #define ROW_LENGTH 2
952 #endif
953 #ifdef SINGLE_PRECISION_COMPLEX
954 #define ROW_LENGTH 4
955 #endif
956 #endif /* VEC_SET == SSE_128 */
957
958 #if VEC_SET == AVX_256
959 #undef ROW_LENGTH
960 #ifdef DOUBLE_PRECISION_COMPLEX
961 #define ROW_LENGTH 4
962 #endif
963 #ifdef SINGLE_PRECISION_COMPLEX
964 #define ROW_LENGTH 8
965 #endif
966 #endif /* VEC_SET == AVX_256 */
967
968 #if VEC_SET == AVX_512
969 #undef ROW_LENGTH
970 #ifdef DOUBLE_PRECISION_COMPLEX
971 #define ROW_LENGTH 8
972 #endif
973 #ifdef SINGLE_PRECISION_COMPLEX
974 #define ROW_LENGTH 16
975 #endif
976 #endif /* VEC_SET == AVX_512 */
977
978 if (nq-i == ROW_LENGTH)
979 {
980 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
981 worked_on += ROW_LENGTH;
982 }
983
984 #if VEC_SET == SSE_128
985 #undef ROW_LENGTH
986 #ifdef DOUBLE_PRECISION_COMPLEX
987 #define ROW_LENGTH 1
988 #endif
989 #ifdef SINGLE_PRECISION_COMPLEX
990 #define ROW_LENGTH 2
991 #endif
992 #endif /* VEC_SET == SSE_128 */
993
994 #if VEC_SET == AVX_256
995 #undef ROW_LENGTH
996 #ifdef DOUBLE_PRECISION_COMPLEX
997 #define ROW_LENGTH 2
998 #endif
999 #ifdef SINGLE_PRECISION_COMPLEX
1000 #define ROW_LENGTH 4
1001 #endif
1002 #endif /* VEC_SET == AVX_256 */
1003
1004 #if VEC_SET == AVX_512
1005 #undef ROW_LENGTH
1006 #ifdef DOUBLE_PRECISION_COMPLEX
1007 #define ROW_LENGTH 4
1008 #endif
1009 #ifdef SINGLE_PRECISION_COMPLEX
1010 #define ROW_LENGTH 8
1011 #endif
1012 #endif /* VEC_SET == AVX_512 */
1013
1014 if (nq-i == ROW_LENGTH)
1015 {
1016 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
1017 worked_on += ROW_LENGTH;
1018 }
1019
1020 #endif /* BLOCK1 */
1021
1022 #ifdef BLOCK2
1023
1024 #if VEC_SET == SSE_128
1025 #undef ROW_LENGTH
1026 #ifdef DOUBLE_PRECISION_COMPLEX
1027 #define ROW_LENGTH 4
1028 #define STEP_SIZE 4
1029 #define UPPER_BOUND 3
1030 #endif
1031 #ifdef SINGLE_PRECISION_COMPLEX
1032 #define ROW_LENGTH 8
1033 #define STEP_SIZE 8
1034 #define UPPER_BOUND 6
1035 #endif
1036 #endif /* VEC_SET == SSE_128 */
1037
1038 #if VEC_SET == AVX_256
1039 #undef ROW_LENGTH
1040 #ifdef DOUBLE_PRECISION_COMPLEX
1041 #define ROW_LENGTH 8
1042 #define STEP_SIZE 8
1043 #define UPPER_BOUND 6
1044 #endif
1045 #ifdef SINGLE_PRECISION_COMPLEX
1046 #define ROW_LENGTH 16
1047 #define STEP_SIZE 16
1048 #define UPPER_BOUND 12
1049 #endif
1050 #endif /* VEC_SET == AVX_256 */
1051
1052 #if VEC_SET == AVX_512
1053 #undef ROW_LENGTH
1054 #ifdef DOUBLE_PRECISION_COMPLEX
1055 #define ROW_LENGTH 16
1056 #define STEP_SIZE 16
1057 #define UPPER_BOUND 12
1058 #endif
1059 #ifdef SINGLE_PRECISION_COMPLEX
1060 #define ROW_LENGTH 32
1061 #define STEP_SIZE 32
1062 #define UPPER_BOUND 24
1063 #endif
1064 #endif /* VEC_SET == AVX_512 */
1065
1066 for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
1067 {
1068 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1069 worked_on +=ROW_LENGTH;
1070 }
1071
1072 if (nq == i)
1073 {
1074 return;
1075 }
1076
1077 #if VEC_SET == SSE_128
1078 #undef ROW_LENGTH
1079 #ifdef DOUBLE_PRECISION_COMPLEX
1080 #define ROW_LENGTH 3
1081 #endif
1082 #ifdef SINGLE_PRECISION_COMPLEX
1083 #define ROW_LENGTH 6
1084 #endif
1085 #endif /* VEC_SET == SSE_128 */
1086
1087 #if VEC_SET == AVX_256
1088 #undef ROW_LENGTH
1089 #ifdef DOUBLE_PRECISION_COMPLEX
1090 #define ROW_LENGTH 6
1091 #endif
1092 #ifdef SINGLE_PRECISION_COMPLEX
1093 #define ROW_LENGTH 12
1094 #endif
1095 #endif /* VEC_SET == AVX_256 */
1096
1097 #if VEC_SET == AVX_512
1098 #undef ROW_LENGTH
1099 #ifdef DOUBLE_PRECISION_COMPLEX
1100 #define ROW_LENGTH 12
1101 #endif
1102 #ifdef SINGLE_PRECISION_COMPLEX
1103 #define ROW_LENGTH 24
1104 #endif
1105 #endif /* VEC_SET == AVX_512 */
1106
1107 if (nq-i == ROW_LENGTH)
1108 {
1109 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1110 worked_on += ROW_LENGTH;
1111 }
1112
1113 #if VEC_SET == SSE_128
1114 #undef ROW_LENGTH
1115 #ifdef DOUBLE_PRECISION_COMPLEX
1116 #define ROW_LENGTH 2
1117 #endif
1118 #ifdef SINGLE_PRECISION_COMPLEX
1119 #define ROW_LENGTH 4
1120 #endif
1121 #endif /* VEC_SET == SSE_128 */
1122
1123 #if VEC_SET == AVX_256
1124 #undef ROW_LENGTH
1125 #ifdef DOUBLE_PRECISION_COMPLEX
1126 #define ROW_LENGTH 4
1127 #endif
1128 #ifdef SINGLE_PRECISION_COMPLEX
1129 #define ROW_LENGTH 8
1130 #endif
1131 #endif /* VEC_SET == AVX_256 */
1132
1133 #if VEC_SET == AVX_512
1134 #undef ROW_LENGTH
1135 #ifdef DOUBLE_PRECISION_COMPLEX
1136 #define ROW_LENGTH 8
1137 #endif
1138 #ifdef SINGLE_PRECISION_COMPLEX
1139 #define ROW_LENGTH 16
1140 #endif
1141 #endif /* VEC_SET == AVX_512 */
1142
1143 if (nq-i == ROW_LENGTH)
1144 {
1145 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1146 worked_on += ROW_LENGTH;
1147 }
1148
1149 #if VEC_SET == SSE_128
1150 #undef ROW_LENGTH
1151 #ifdef DOUBLE_PRECISION_COMPLEX
1152 #define ROW_LENGTH 1
1153 #endif
1154 #ifdef SINGLE_PRECISION_COMPLEX
1155 #define ROW_LENGTH 2
1156 #endif
1157 #endif /* VEC_SET == SSE_128 */
1158
1159 #if VEC_SET == AVX_256
1160 #undef ROW_LENGTH
1161 #ifdef DOUBLE_PRECISION_COMPLEX
1162 #define ROW_LENGTH 2
1163 #endif
1164 #ifdef SINGLE_PRECISION_COMPLEX
1165 #define ROW_LENGTH 4
1166 #endif
1167 #endif /* VEC_SET == AVX_256 */
1168
1169 #if VEC_SET == AVX_512
1170 #undef ROW_LENGTH
1171 #ifdef DOUBLE_PRECISION_COMPLEX
1172 #define ROW_LENGTH 4
1173 #endif
1174 #ifdef SINGLE_PRECISION_COMPLEX
1175 #define ROW_LENGTH 8
1176 #endif
1177 #endif /* VEC_SET == AVX_512 */
1178
1179 if (nq-i == ROW_LENGTH)
1180 {
1181 CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1182 worked_on += ROW_LENGTH;
1183 }
1184
1185 #endif /* BLOCK2 */
1186
1187 #ifdef WITH_DEBUG
1188 if (worked_on != nq)
1189 {
1190 printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
1191 abort();
1192 }
1193 #endif
1194
1195 }
1196
1197 #if VEC_SET == SSE_128
1198 #ifdef DOUBLE_PRECISION_COMPLEX
1199 #define ROW_LENGTH 6
1200 #endif
1201 #ifdef SINGLE_PRECISION_COMPLEX
1202 #define ROW_LENGTH 12
1203 #endif
1204 #endif /* VEC_SET == SSE_128 */
1205
1206 #if VEC_SET == AVX_256
1207 #ifdef DOUBLE_PRECISION_COMPLEX
1208 #define ROW_LENGTH 12
1209 #endif
1210 #ifdef SINGLE_PRECISION_COMPLEX
1211 #define ROW_LENGTH 24
1212 #endif
1213 #endif /* VEC_SET == AVX_256 */
1214
1215 #if VEC_SET == AVX_512
1216 #ifdef DOUBLE_PRECISION_COMPLEX
1217 #define ROW_LENGTH 24
1218 #endif
1219 #ifdef SINGLE_PRECISION_COMPLEX
1220 #define ROW_LENGTH 48
1221 #endif
1222 #endif /* VEC_SET == AVX_512 */
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)1223 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
1224 #ifdef BLOCK1
1225 )
1226 #endif
1227 #ifdef BLOCK2
1228 ,int ldh, DATA_TYPE s)
1229 #endif
1230 {
1231
1232 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
1233 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
1234 #ifdef BLOCK2
1235 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
1236 #endif
1237
1238 __SIMD_DATATYPE x1, x2, x3, x4, x5, x6;
1239 __SIMD_DATATYPE q1, q2, q3, q4, q5, q6;
1240 #ifdef BLOCK2
1241 __SIMD_DATATYPE y1, y2, y3, y4, y5, y6;
1242 __SIMD_DATATYPE h2_real, h2_imag;
1243 #endif
1244 __SIMD_DATATYPE h1_real, h1_imag;
1245 __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1246 int i=0;
1247
1248 #if VEC_SET == SSE_128
1249 #ifdef DOUBLE_PRECISION_COMPLEX
1250 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
1251 #endif
1252 #ifdef SINGLE_PRECISION_COMPLEX
1253 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
1254 #endif
1255 #endif /* VEC_SET == SSE_128 */
1256
1257 #if VEC_SET == AVX_256
1258 #ifdef DOUBLE_PRECISION_COMPLEX
1259 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
1260 #endif
1261 #ifdef SINGLE_PRECISION_COMPLEX
1262 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
1263 #endif
1264 #endif /* VEC_SET == AVX_256 */
1265
1266 #if VEC_SET == AVX_512
1267 #ifdef DOUBLE_PRECISION_COMPLEX
1268 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
1269 #endif
1270 #ifdef SINGLE_PRECISION_COMPLEX
1271 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
1272 #endif
1273 #endif /* VEC_SET == AVX_512 */
1274
1275 #ifdef BLOCK2
1276 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
1277 x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
1278 x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
1279 x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
1280 x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
1281 x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]);
1282
1283 #if VEC_SET == SSE_128
1284 #ifdef DOUBLE_PRECISION_COMPLEX
1285 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
1286 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
1287 #endif
1288 #ifdef SINGLE_PRECISION_COMPLEX
1289 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
1290 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
1291 #endif
1292 #endif /* VEC_SET == SSE_128 */
1293
1294 #if VEC_SET == AVX_256
1295 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
1296 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1297 #endif /* VEC_SET == AVX_256 */
1298
1299 #if VEC_SET == AVX_512
1300 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
1301 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
1302 #endif /* VEC_SET == AVX_512 */
1303
1304 #ifndef __ELPA_USE_FMA__
1305 // conjugate
1306 h2_imag = _SIMD_XOR(h2_imag, sign);
1307 #endif
1308
1309 y1 = _SIMD_LOAD(&q_dbl[0]);
1310 y2 = _SIMD_LOAD(&q_dbl[offset]);
1311 y3 = _SIMD_LOAD(&q_dbl[2*offset]);
1312 y4 = _SIMD_LOAD(&q_dbl[3*offset]);
1313 y5 = _SIMD_LOAD(&q_dbl[4*offset]);
1314 y6 = _SIMD_LOAD(&q_dbl[5*offset]);
1315
1316 tmp1 = _SIMD_MUL(h2_imag, x1);
1317 #ifdef __ELPA_USE_FMA__
1318 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1319 #else
1320 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1321 #endif
1322 tmp2 = _SIMD_MUL(h2_imag, x2);
1323 #ifdef __ELPA_USE_FMA__
1324 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1325 #else
1326 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1327 #endif
1328
1329 tmp3 = _SIMD_MUL(h2_imag, x3);
1330 #ifdef __ELPA_USE_FMA__
1331 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1332 #else
1333 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1334 #endif
1335 tmp4 = _SIMD_MUL(h2_imag, x4);
1336 #ifdef __ELPA_USE_FMA__
1337 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1338 #else
1339 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1340 #endif
1341
1342 tmp5 = _SIMD_MUL(h2_imag, x5);
1343 #ifdef __ELPA_USE_FMA__
1344 y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1345 #else
1346 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1347 #endif
1348 tmp6 = _SIMD_MUL(h2_imag, x6);
1349 #ifdef __ELPA_USE_FMA__
1350 y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1351 #else
1352 y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1353 #endif
1354
1355 #endif /* BLOCK2 */
1356
1357 #ifdef BLOCK1
1358 x1 = _SIMD_LOAD(&q_dbl[0]);
1359 x2 = _SIMD_LOAD(&q_dbl[offset]);
1360 x3 = _SIMD_LOAD(&q_dbl[2*offset]);
1361 x4 = _SIMD_LOAD(&q_dbl[3*offset]);
1362 x5 = _SIMD_LOAD(&q_dbl[4*offset]);
1363 x6 = _SIMD_LOAD(&q_dbl[5*offset]);
1364 #endif
1365
1366 for (i = BLOCK; i < nb; i++)
1367 {
1368
1369 #if VEC_SET == SSE_128
1370 #ifdef DOUBLE_PRECISION_COMPLEX
1371 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
1372 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
1373 #endif
1374 #ifdef SINGLE_PRECISION_COMPLEX
1375 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
1376 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
1377 #endif
1378 #endif /* VEC_SET == SSE_128 */
1379
1380 #if VEC_SET == AVX_256
1381 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
1382 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
1383 #endif /* VEC_SET == AVX_256 */
1384
1385 #if VEC_SET == AVX_512
1386 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
1387 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
1388 #endif /* VEC_SET == AVX_512 */
1389
1390 #ifndef __ELPA_USE_FMA__
1391 // conjugate
1392 h1_imag = _SIMD_XOR(h1_imag, sign);
1393 #endif
1394
1395 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
1396 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
1397 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
1398 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
1399 q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
1400 q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
1401
1402 tmp1 = _SIMD_MUL(h1_imag, q1);
1403 #ifdef __ELPA_USE_FMA__
1404 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1405 #else
1406 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1407 #endif
1408 tmp2 = _SIMD_MUL(h1_imag, q2);
1409 #ifdef __ELPA_USE_FMA__
1410 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1411 #else
1412 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1413 #endif
1414 tmp3 = _SIMD_MUL(h1_imag, q3);
1415 #ifdef __ELPA_USE_FMA__
1416 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1417 #else
1418 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1419 #endif
1420
1421 tmp4 = _SIMD_MUL(h1_imag, q4);
1422 #ifdef __ELPA_USE_FMA__
1423 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1424 #else
1425 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1426 #endif
1427 tmp5 = _SIMD_MUL(h1_imag, q5);
1428 #ifdef __ELPA_USE_FMA__
1429 x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1430 #else
1431 x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1432 #endif
1433 tmp6 = _SIMD_MUL(h1_imag, q6);
1434 #ifdef __ELPA_USE_FMA__
1435 x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1436 #else
1437 x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1438 #endif
1439
1440 #ifdef BLOCK2
1441
1442 #if VEC_SET == SSE_128
1443 #ifdef DOUBLE_PRECISION_COMPLEX
1444 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
1445 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
1446 #endif
1447 #ifdef SINGLE_PRECISION_COMPLEX
1448 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
1449 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
1450 #endif
1451 #endif /* VEC_SET == SSE_128 */
1452
1453 #if VEC_SET == AVX_256
1454 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
1455 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1456 #endif /* VEC_SET == AVX_256 */
1457
1458 #if VEC_SET == AVX_512
1459 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
1460 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
1461 #endif /* VEC_SET == AVX_512 */
1462
1463
1464 #ifndef __ELPA_USE_FMA__
1465 // conjugate
1466 h2_imag = _SIMD_XOR(h2_imag, sign);
1467 #endif
1468
1469 tmp1 = _SIMD_MUL(h2_imag, q1);
1470 #ifdef __ELPA_USE_FMA__
1471 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1472 #else
1473 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1474 #endif
1475 tmp2 = _SIMD_MUL(h2_imag, q2);
1476 #ifdef __ELPA_USE_FMA__
1477 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1478 #else
1479 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1480 #endif
1481
1482 tmp3 = _SIMD_MUL(h2_imag, q3);
1483 #ifdef __ELPA_USE_FMA__
1484 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1485 #else
1486 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1487 #endif
1488 tmp4 = _SIMD_MUL(h2_imag, q4);
1489 #ifdef __ELPA_USE_FMA__
1490 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1491 #else
1492 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1493 #endif
1494
1495 tmp5 = _SIMD_MUL(h2_imag, q5);
1496 #ifdef __ELPA_USE_FMA__
1497 y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1498 #else
1499 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1500 #endif
1501 tmp6 = _SIMD_MUL(h2_imag, q6);
1502 #ifdef __ELPA_USE_FMA__
1503 y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1504 #else
1505 y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1506 #endif
1507
1508 #endif /* BLOCK2 */
1509
1510 }
1511
1512 #ifdef BLOCK2
1513
1514 #if VEC_SET == SSE_128
1515 #ifdef DOUBLE_PRECISION_COMPLEX
1516 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
1517 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
1518 #endif
1519 #ifdef SINGLE_PRECISION_COMPLEX
1520 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
1521 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
1522 #endif
1523 #endif /* VEC_SET == SSE_128 */
1524
1525 #if VEC_SET == AVX_256
1526 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
1527 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1528 #endif /* VEC_SET == AVX_256 */
1529
1530 #if VEC_SET == AVX_512
1531 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
1532 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
1533 #endif /* VEC_SET == AVX_512 */
1534
1535 #ifndef __ELPA_USE_FMA__
1536 // conjugate
1537 h1_imag = _SIMD_XOR(h1_imag, sign);
1538 #endif
1539
1540 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
1541 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
1542 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
1543 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
1544 q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
1545 q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]);
1546
1547 tmp1 = _SIMD_MUL(h1_imag, q1);
1548 #ifdef __ELPA_USE_FMA__
1549 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1550 #else
1551 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1552 #endif
1553 tmp2 = _SIMD_MUL(h1_imag, q2);
1554 #ifdef __ELPA_USE_FMA__
1555 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1556 #else
1557 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1558 #endif
1559
1560 tmp3 = _SIMD_MUL(h1_imag, q3);
1561 #ifdef __ELPA_USE_FMA__
1562 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1563 #else
1564 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1565 #endif
1566 tmp4 = _SIMD_MUL(h1_imag, q4);
1567 #ifdef __ELPA_USE_FMA__
1568 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1569 #else
1570 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1571 #endif
1572
1573 tmp5 = _SIMD_MUL(h1_imag, q5);
1574 #ifdef __ELPA_USE_FMA__
1575 x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1576 #else
1577 x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1578 #endif
1579 tmp6 = _SIMD_MUL(h1_imag, q6);
1580 #ifdef __ELPA_USE_FMA__
1581 x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1582 #else
1583 x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1584 #endif
1585
1586 #endif /* BLOCK2 */
1587
1588 #if VEC_SET == SSE_128
1589 #ifdef DOUBLE_PRECISION_COMPLEX
1590 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
1591 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
1592 #endif
1593 #ifdef SINGLE_PRECISION_COMPLEX
1594 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
1595 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
1596 #endif
1597 #endif /* VEC_SET == SSE_128 */
1598
1599 #if VEC_SET == AVX_256
1600 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
1601 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
1602 #endif /* VEC_SET == AVX_256 */
1603
1604 #if VEC_SET == AVX_512
1605 h1_real = _SIMD_SET1(hh_dbl[0]);
1606 h1_imag = _SIMD_SET1(hh_dbl[1]);
1607
1608 #ifdef HAVE_AVX512_XEON_PHI
1609 #ifdef DOUBLE_PRECISION_COMPLEX
1610 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1611 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1612 #endif
1613 #ifdef SINGLE_PRECISION_COMPLEX
1614 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1615 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1616 #endif
1617 #endif
1618 #ifdef HAVE_AVX512_XEON
1619 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
1620 h1_real = _SIMD_XOR(h1_real, sign);
1621 h1_imag = _SIMD_XOR(h1_imag, sign);
1622 #endif
1623 #endif
1624
1625 #endif /* VEC_SET == AVX_512 */
1626
1627 #if VEC_SET != AVX_512
1628 h1_real = _SIMD_XOR(h1_real, sign);
1629 h1_imag = _SIMD_XOR(h1_imag, sign);
1630 #endif /* VEC_SET != AVX_512 */
1631
1632 tmp1 = _SIMD_MUL(h1_imag, x1);
1633 #ifdef __ELPA_USE_FMA__
1634 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1635 #else
1636 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1637 #endif
1638 tmp2 = _SIMD_MUL(h1_imag, x2);
1639 #ifdef __ELPA_USE_FMA__
1640 x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1641 #else
1642 x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1643 #endif
1644 tmp3 = _SIMD_MUL(h1_imag, x3);
1645 #ifdef __ELPA_USE_FMA__
1646 x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1647 #else
1648 x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1649 #endif
1650
1651 tmp4 = _SIMD_MUL(h1_imag, x4);
1652 #ifdef __ELPA_USE_FMA__
1653 x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1654 #else
1655 x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1656 #endif
1657 tmp5 = _SIMD_MUL(h1_imag, x5);
1658 #ifdef __ELPA_USE_FMA__
1659 x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1660 #else
1661 x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1662 #endif
1663 tmp6 = _SIMD_MUL(h1_imag, x6);
1664 #ifdef __ELPA_USE_FMA__
1665 x6 = _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1666 #else
1667 x6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1668 #endif
1669
1670 #ifdef BLOCK2
1671
1672 #if VEC_SET == SSE_128
1673 #ifdef DOUBLE_PRECISION_COMPLEX
1674 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
1675 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
1676 #endif
1677 #ifdef SINGLE_PRECISION_COMPLEX
1678 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
1679 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
1680 #endif
1681
1682 #ifdef DOUBLE_PRECISION_COMPLEX
1683 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
1684 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
1685 #endif
1686 #ifdef SINGLE_PRECISION_COMPLEX
1687 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
1688 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
1689 #endif
1690 #endif /* VEC_SET == 128 */
1691
1692 #if VEC_SET == AVX_256
1693 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
1694 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
1695 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
1696 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
1697 #endif /* VEC_SET == AVX_256 */
1698
1699 #if VEC_SET == AVX_512
1700 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
1701 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
1702 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
1703 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
1704
1705 #ifdef HAVE_AVX512_XEON_PHI
1706
1707 #ifdef DOUBLE_PRECISION_COMPLEX
1708 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
1709 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
1710 #endif
1711 #ifdef SINGLE_PRECISION_COMPLEX
1712 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
1713 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
1714 #endif
1715
1716 #ifdef DOUBLE_PRECISION_COMPLEX
1717 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
1718 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
1719 #endif
1720 #ifdef SINGLE_PRECISION_COMPLEX
1721 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
1722 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
1723 #endif
1724 #endif /* HAVE_AVX512_XEON_PHI */
1725
1726 #ifdef HAVE_AVX512_XEON
1727 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
1728 h1_real = _SIMD_XOR(h1_real, sign);
1729 h1_imag = _SIMD_XOR(h1_imag, sign);
1730 h2_real = _SIMD_XOR(h2_real, sign);
1731 h2_imag = _SIMD_XOR(h2_imag, sign);
1732 #endif
1733 #endif
1734 #endif /* VEC_SET == AVX_512 */
1735
1736 #if VEC_SET != AVX_512
1737 h1_real = _SIMD_XOR(h1_real, sign);
1738 h1_imag = _SIMD_XOR(h1_imag, sign);
1739 h2_real = _SIMD_XOR(h2_real, sign);
1740 h2_imag = _SIMD_XOR(h2_imag, sign);
1741 #endif /* VEC_SET != AVX_512 */
1742
1743 #if VEC_SET == SSE_128
1744 #ifdef SINGLE_PRECISION_COMPLEX
1745 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
1746 #else
1747 tmp2 = _SIMD_LOADU(s_dbl);
1748 #endif
1749 #endif /* VEC_SET == SSE_128 */
1750
1751 #if VEC_SET == AVX_256
1752 #ifdef DOUBLE_PRECISION_COMPLEX
1753 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1754 #endif
1755 #ifdef SINGLE_PRECISION_COMPLEX
1756 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
1757 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1758 #endif
1759 #endif /* VEC_SET == AVX_256 */
1760
1761 #if VEC_SET == AVX_512
1762 #ifdef DOUBLE_PRECISION_COMPLEX
1763 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
1764 s_dbl[1], s_dbl[0],
1765 s_dbl[1], s_dbl[0],
1766 s_dbl[1], s_dbl[0]);
1767 #endif
1768 #ifdef SINGLE_PRECISION_COMPLEX
1769 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
1770 #endif
1771 #endif /* VEC_SET == AVX_512 */
1772
1773 tmp1 = _SIMD_MUL(h2_imag, tmp2);
1774 #ifdef __ELPA_USE_FMA__
1775 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1776 #else
1777 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1778 #endif
1779
1780 #if VEC_SET == AVX_512
1781 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
1782
1783 h2_real = _SIMD_SET1(s_dbl[0]);
1784 h2_imag = _SIMD_SET1(s_dbl[1]);
1785 #endif /* VEC_SET == AVX_512 */
1786
1787 #if VEC_SET == SSE_128
1788 #ifdef DOUBLE_PRECISION_COMPLEX
1789 h2_real = _mm_movedup_pd(tmp2);
1790 h2_imag = _mm_set1_pd(tmp2[1]);
1791 #endif
1792 #ifdef SINGLE_PRECISION_COMPLEX
1793 h2_real = _mm_moveldup_ps(tmp2);
1794 h2_imag = _mm_movehdup_ps(tmp2);
1795 #endif
1796 #endif /* VEC_SET == SSE_128 */
1797
1798 #if VEC_SET == AVX_256
1799 h2_real = _SIMD_SET1(tmp2[0]);
1800 h2_imag = _SIMD_SET1(tmp2[1]);
1801 #endif /* VEC_SET == AVX_256 */
1802
1803 tmp1 = _SIMD_MUL(h1_imag, y1);
1804 #ifdef __ELPA_USE_FMA__
1805 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1806 #else
1807 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1808 #endif
1809 tmp2 = _SIMD_MUL(h1_imag, y2);
1810 #ifdef __ELPA_USE_FMA__
1811 y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1812 #else
1813 y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1814 #endif
1815
1816 tmp3 = _SIMD_MUL(h1_imag, y3);
1817 #ifdef __ELPA_USE_FMA__
1818 y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1819 #else
1820 y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1821 #endif
1822 tmp4 = _SIMD_MUL(h1_imag, y4);
1823 #ifdef __ELPA_USE_FMA__
1824 y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1825 #else
1826 y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1827 #endif
1828
1829 tmp5 = _SIMD_MUL(h1_imag, y5);
1830 #ifdef __ELPA_USE_FMA__
1831 y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1832 #else
1833 y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1834 #endif
1835 tmp6 = _SIMD_MUL(h1_imag, y6);
1836 #ifdef __ELPA_USE_FMA__
1837 y6 = _SIMD_FMADDSUB(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1838 #else
1839 y6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1840 #endif
1841
1842 tmp1 = _SIMD_MUL(h2_imag, x1);
1843 #ifdef __ELPA_USE_FMA__
1844 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1845 #else
1846 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1847 #endif
1848 tmp2 = _SIMD_MUL(h2_imag, x2);
1849 #ifdef __ELPA_USE_FMA__
1850 y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1851 #else
1852 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1853 #endif
1854
1855 tmp3 = _SIMD_MUL(h2_imag, x3);
1856 #ifdef __ELPA_USE_FMA__
1857 y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1858 #else
1859 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1860 #endif
1861 tmp4 = _SIMD_MUL(h2_imag, x4);
1862 #ifdef __ELPA_USE_FMA__
1863 y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1864 #else
1865 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1866 #endif
1867
1868 tmp5 = _SIMD_MUL(h2_imag, x5);
1869 #ifdef __ELPA_USE_FMA__
1870 y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1871 #else
1872 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1873 #endif
1874 tmp6 = _SIMD_MUL(h2_imag, x6);
1875 #ifdef __ELPA_USE_FMA__
1876 y6 = _SIMD_ADD(y6, _SIMD_FMADDSUB(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1877 #else
1878 y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1879 #endif
1880
1881 #endif /* BLOCK2 */
1882
1883 q1 = _SIMD_LOAD(&q_dbl[0]);
1884 q2 = _SIMD_LOAD(&q_dbl[offset]);
1885 q3 = _SIMD_LOAD(&q_dbl[2*offset]);
1886 q4 = _SIMD_LOAD(&q_dbl[3*offset]);
1887 q5 = _SIMD_LOAD(&q_dbl[4*offset]);
1888 q6 = _SIMD_LOAD(&q_dbl[5*offset]);
1889
1890 #ifdef BLOCK1
1891 q1 = _SIMD_ADD(q1, x1);
1892 q2 = _SIMD_ADD(q2, x2);
1893 q3 = _SIMD_ADD(q3, x3);
1894 q4 = _SIMD_ADD(q4, x4);
1895 q5 = _SIMD_ADD(q5, x5);
1896 q6 = _SIMD_ADD(q6, x6);
1897 #endif
1898
1899
1900 #ifdef BLOCK2
1901 q1 = _SIMD_ADD(q1, y1);
1902 q2 = _SIMD_ADD(q2, y2);
1903 q3 = _SIMD_ADD(q3, y3);
1904 q4 = _SIMD_ADD(q4, y4);
1905 q5 = _SIMD_ADD(q5, y5);
1906 q6 = _SIMD_ADD(q6, y6);
1907 #endif
1908
1909 _SIMD_STORE(&q_dbl[0], q1);
1910 _SIMD_STORE(&q_dbl[offset], q2);
1911 _SIMD_STORE(&q_dbl[2*offset], q3);
1912 _SIMD_STORE(&q_dbl[3*offset], q4);
1913 _SIMD_STORE(&q_dbl[4*offset], q5);
1914 _SIMD_STORE(&q_dbl[5*offset], q6);
1915
1916
1917 #ifdef BLOCK2
1918
1919 #if VEC_SET == SSE_128
1920 #ifdef DOUBLE_PRECISION_COMPLEX
1921 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
1922 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
1923 #endif
1924 #ifdef SINGLE_PRECISION_COMPLEX
1925 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
1926 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
1927 #endif
1928 #endif /* VEC_SET == SSE_128 */
1929
1930 #if VEC_SET == AVX_256
1931 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
1932 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1933 #endif /* VEC_SET == AVX_256 */
1934
1935 #if VEC_SET == AVX_512
1936 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
1937 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
1938 #endif /* VEC_SET == AVX_512 */
1939
1940 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
1941 q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
1942 q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
1943 q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
1944 q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]);
1945 q6 = _SIMD_LOAD(&q_dbl[(ldq*2)+5*offset]);
1946
1947 q1 = _SIMD_ADD(q1, x1);
1948 q2 = _SIMD_ADD(q2, x2);
1949 q3 = _SIMD_ADD(q3, x3);
1950 q4 = _SIMD_ADD(q4, x4);
1951 q5 = _SIMD_ADD(q5, x5);
1952 q6 = _SIMD_ADD(q6, x6);
1953
1954 tmp1 = _SIMD_MUL(h2_imag, y1);
1955 #ifdef __ELPA_USE_FMA__
1956 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1957 #else
1958 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1959 #endif
1960 tmp2 = _SIMD_MUL(h2_imag, y2);
1961 #ifdef __ELPA_USE_FMA__
1962 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1963 #else
1964 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1965 #endif
1966
1967 tmp3 = _SIMD_MUL(h2_imag, y3);
1968 #ifdef __ELPA_USE_FMA__
1969 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1970 #else
1971 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1972 #endif
1973 tmp4 = _SIMD_MUL(h2_imag, y4);
1974 #ifdef __ELPA_USE_FMA__
1975 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1976 #else
1977 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1978 #endif
1979
1980 tmp5 = _SIMD_MUL(h2_imag, y5);
1981 #ifdef __ELPA_USE_FMA__
1982 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1983 #else
1984 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1985 #endif
1986 tmp6 = _SIMD_MUL(h2_imag, y6);
1987 #ifdef __ELPA_USE_FMA__
1988 q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1989 #else
1990 q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1991 #endif
1992
1993 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
1994 _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
1995 _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
1996 _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
1997 _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5);
1998 _SIMD_STORE(&q_dbl[(ldq*2)+5*offset], q6);
1999
2000 #endif /* BLOCK2 */
2001
2002
2003 for (i = BLOCK; i < nb; i++)
2004 {
2005
2006 #if VEC_SET == SSE_128
2007 #ifdef DOUBLE_PRECISION_COMPLEX
2008 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2009 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2010 #endif
2011 #ifdef SINGLE_PRECISION_COMPLEX
2012 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2013 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2014 #endif
2015 #endif /* VEC_SET == SSE_128 */
2016
2017 #if VEC_SET == AVX_256
2018 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2019 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2020 #endif /* VEC_SET == AVX_256 */
2021
2022 #if VEC_SET == AVX_512
2023 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2024 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2025 #endif /* VEC_SET == AVX_512 */
2026
2027 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2028 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2029 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2030 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2031 q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2032 q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
2033
2034 tmp1 = _SIMD_MUL(h1_imag, x1);
2035 #ifdef __ELPA_USE_FMA__
2036 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2037 #else
2038 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2039 #endif
2040 tmp2 = _SIMD_MUL(h1_imag, x2);
2041 #ifdef __ELPA_USE_FMA__
2042 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2043 #else
2044 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2045 #endif
2046 tmp3 = _SIMD_MUL(h1_imag, x3);
2047 #ifdef __ELPA_USE_FMA__
2048 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2049 #else
2050 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2051 #endif
2052
2053 tmp4 = _SIMD_MUL(h1_imag, x4);
2054 #ifdef __ELPA_USE_FMA__
2055 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2056 #else
2057 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2058 #endif
2059 tmp5 = _SIMD_MUL(h1_imag, x5);
2060 #ifdef __ELPA_USE_FMA__
2061 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2062 #else
2063 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2064 #endif
2065 tmp6 = _SIMD_MUL(h1_imag, x6);
2066 #ifdef __ELPA_USE_FMA__
2067 q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2068 #else
2069 q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2070 #endif
2071
2072 #ifdef BLOCK2
2073
2074 #if VEC_SET == SSE_128
2075 #ifdef DOUBLE_PRECISION_COMPLEX
2076 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
2077 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
2078 #endif
2079 #ifdef SINGLE_PRECISION_COMPLEX
2080 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
2081 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
2082 #endif
2083 #endif /* VEC_SET == SSE_128 */
2084
2085 #if VEC_SET == AVX_256
2086 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
2087 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
2088 #endif /* VEC_SET == AVX_256 */
2089
2090 #if VEC_SET == AVX_512
2091 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
2092 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
2093 #endif /* VEC_SET == AVX_512 */
2094
2095 tmp1 = _SIMD_MUL(h2_imag, y1);
2096 #ifdef __ELPA_USE_FMA__
2097 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2098 #else
2099 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2100 #endif
2101 tmp2 = _SIMD_MUL(h2_imag, y2);
2102 #ifdef __ELPA_USE_FMA__
2103 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2104 #else
2105 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2106 #endif
2107
2108 tmp3 = _SIMD_MUL(h2_imag, y3);
2109 #ifdef __ELPA_USE_FMA__
2110 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2111 #else
2112 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2113 #endif
2114 tmp4 = _SIMD_MUL(h2_imag, y4);
2115 #ifdef __ELPA_USE_FMA__
2116 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2117 #else
2118 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2119 #endif
2120
2121 tmp5 = _SIMD_MUL(h2_imag, y5);
2122 #ifdef __ELPA_USE_FMA__
2123 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2124 #else
2125 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2126 #endif
2127 tmp6 = _SIMD_MUL(h2_imag, y6);
2128 #ifdef __ELPA_USE_FMA__
2129 q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2130 #else
2131 q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2132 #endif
2133
2134 #endif /* BLOCK2 */
2135
2136
2137 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
2138 _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
2139 _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
2140 _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
2141 _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
2142 _SIMD_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
2143 }
2144 #ifdef BLOCK2
2145
2146 #if VEC_SET == SSE_128
2147 #ifdef DOUBLE_PRECISION_COMPLEX
2148 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
2149 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
2150 #endif
2151 #ifdef SINGLE_PRECISION_COMPLEX
2152 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
2153 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
2154 #endif
2155 #endif /* VEC_SET == SSE_128 */
2156
2157 #if VEC_SET == AVX_256
2158 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
2159 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
2160 #endif /* VEC_SET == AVX_256 */
2161
2162 #if VEC_SET == AVX_512
2163 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
2164 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
2165 #endif /* VEC_SET == AVX_512 */
2166
2167 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
2168 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
2169 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
2170 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
2171 q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
2172 q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]);
2173
2174 tmp1 = _SIMD_MUL(h1_imag, x1);
2175 #ifdef __ELPA_USE_FMA__
2176 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2177 #else
2178 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2179 #endif
2180 tmp2 = _SIMD_MUL(h1_imag, x2);
2181 #ifdef __ELPA_USE_FMA__
2182 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2183 #else
2184 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2185 #endif
2186
2187 tmp3 = _SIMD_MUL(h1_imag, x3);
2188 #ifdef __ELPA_USE_FMA__
2189 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2190 #else
2191 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2192 #endif
2193 tmp4 = _SIMD_MUL(h1_imag, x4);
2194 #ifdef __ELPA_USE_FMA__
2195 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2196 #else
2197 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2198 #endif
2199
2200 tmp5 = _SIMD_MUL(h1_imag, x5);
2201 #ifdef __ELPA_USE_FMA__
2202 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2203 #else
2204 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2205 #endif
2206 tmp6 = _SIMD_MUL(h1_imag, x6);
2207 #ifdef __ELPA_USE_FMA__
2208 q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2209 #else
2210 q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2211 #endif
2212
2213 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
2214 _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
2215 _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
2216 _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
2217 _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5);
2218 _SIMD_STORE(&q_dbl[(2*nb*ldq)+5*offset], q6);
2219
2220 #endif /* BLOCK2 */
2221
2222 }
2223
2224
2225 #if VEC_SET == SSE_128
2226 #ifdef DOUBLE_PRECISION_COMPLEX
2227 #define ROW_LENGTH 5
2228 #endif
2229 #ifdef SINGLE_PRECISION_COMPLEX
2230 #define ROW_LENGTH 10
2231 #endif
2232 #endif /* VEC_SET == SSE_128 */
2233
2234 #if VEC_SET == AVX_256
2235 #ifdef DOUBLE_PRECISION_COMPLEX
2236 #define ROW_LENGTH 10
2237 #endif
2238 #ifdef SINGLE_PRECISION_COMPLEX
2239 #define ROW_LENGTH 20
2240 #endif
2241 #endif /* VEC_SET == AVX_256 */
2242
2243 #if VEC_SET == AVX_512
2244 #ifdef DOUBLE_PRECISION_COMPLEX
2245 #define ROW_LENGTH 20
2246 #endif
2247 #ifdef SINGLE_PRECISION_COMPLEX
2248 #define ROW_LENGTH 40
2249 #endif
2250 #endif /* VEC_SET == AVX_512 */
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)2251 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
2252 #ifdef BLOCK1
2253 )
2254 #endif
2255 #ifdef BLOCK2
2256 ,int ldh, DATA_TYPE s)
2257 #endif
2258 {
2259
2260 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
2261 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
2262 #ifdef BLOCK2
2263 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
2264 #endif
2265
2266 __SIMD_DATATYPE x1, x2, x3, x4, x5;
2267 __SIMD_DATATYPE q1, q2, q3, q4, q5;
2268 #ifdef BLOCK2
2269 __SIMD_DATATYPE y1, y2, y3, y4, y5;
2270 __SIMD_DATATYPE h2_real, h2_imag;
2271 #endif
2272 __SIMD_DATATYPE h1_real, h1_imag;
2273 __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5;
2274 int i=0;
2275
2276 #if VEC_SET == SSE_128
2277 #ifdef DOUBLE_PRECISION_COMPLEX
2278 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
2279 #endif
2280 #ifdef SINGLE_PRECISION_COMPLEX
2281 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
2282 #endif
2283 #endif /* VEC_SET == SSE_128 */
2284
2285 #if VEC_SET == AVX_256
2286 #ifdef DOUBLE_PRECISION_COMPLEX
2287 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
2288 #endif
2289 #ifdef SINGLE_PRECISION_COMPLEX
2290 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
2291 #endif
2292 #endif /* VEC_SET == AVX_256 */
2293
2294 #if VEC_SET == AVX_512
2295 #ifdef DOUBLE_PRECISION_COMPLEX
2296 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
2297 #endif
2298 #ifdef SINGLE_PRECISION_COMPLEX
2299 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
2300 #endif
2301 #endif /* VEC_SET == AVX_512 */
2302
2303 #ifdef BLOCK2
2304 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
2305 x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
2306 x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
2307 x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
2308 x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
2309
2310 #if VEC_SET == SSE_128
2311 #ifdef DOUBLE_PRECISION_COMPLEX
2312 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
2313 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
2314 #endif
2315 #ifdef SINGLE_PRECISION_COMPLEX
2316 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
2317 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
2318 #endif
2319 #endif /* VEC_SET == SSE_128 */
2320
2321 #if VEC_SET == AVX_256
2322 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
2323 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
2324 #endif /* VEC_SET == AVX_256 */
2325
2326 #if VEC_SET == AVX_512
2327 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
2328 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
2329 #endif /* VEC_SET == AVX_512 */
2330
2331 #ifndef __ELPA_USE_FMA__
2332 // conjugate
2333 h2_imag = _SIMD_XOR(h2_imag, sign);
2334 #endif
2335
2336 y1 = _SIMD_LOAD(&q_dbl[0]);
2337 y2 = _SIMD_LOAD(&q_dbl[offset]);
2338 y3 = _SIMD_LOAD(&q_dbl[2*offset]);
2339 y4 = _SIMD_LOAD(&q_dbl[3*offset]);
2340 y5 = _SIMD_LOAD(&q_dbl[4*offset]);
2341
2342 tmp1 = _SIMD_MUL(h2_imag, x1);
2343 #ifdef __ELPA_USE_FMA__
2344 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2345 #else
2346 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2347 #endif
2348 tmp2 = _SIMD_MUL(h2_imag, x2);
2349 #ifdef __ELPA_USE_FMA__
2350 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2351 #else
2352 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2353 #endif
2354
2355 tmp3 = _SIMD_MUL(h2_imag, x3);
2356 #ifdef __ELPA_USE_FMA__
2357 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2358 #else
2359 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2360 #endif
2361 tmp4 = _SIMD_MUL(h2_imag, x4);
2362 #ifdef __ELPA_USE_FMA__
2363 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2364 #else
2365 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2366 #endif
2367
2368 tmp5 = _SIMD_MUL(h2_imag, x5);
2369 #ifdef __ELPA_USE_FMA__
2370 y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2371 #else
2372 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2373 #endif
2374
2375 #endif /* BLOCK2 */
2376
2377 #ifdef BLOCK1
2378 x1 = _SIMD_LOAD(&q_dbl[0]);
2379 x2 = _SIMD_LOAD(&q_dbl[offset]);
2380 x3 = _SIMD_LOAD(&q_dbl[2*offset]);
2381 x4 = _SIMD_LOAD(&q_dbl[3*offset]);
2382 x5 = _SIMD_LOAD(&q_dbl[4*offset]);
2383 #endif
2384
2385 for (i = BLOCK; i < nb; i++)
2386 {
2387
2388 #if VEC_SET == SSE_128
2389 #ifdef DOUBLE_PRECISION_COMPLEX
2390 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2391 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2392 #endif
2393 #ifdef SINGLE_PRECISION_COMPLEX
2394 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2395 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2396 #endif
2397 #endif /* VEC_SET == SSE_128 */
2398
2399 #if VEC_SET == AVX_256
2400 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2401 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2402 #endif /* VEC_SET == AVX_256 */
2403
2404 #if VEC_SET == AVX_512
2405 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2406 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2407 #endif /* VEC_SET == AVX_512 */
2408
2409 #ifndef __ELPA_USE_FMA__
2410 // conjugate
2411 h1_imag = _SIMD_XOR(h1_imag, sign);
2412 #endif
2413
2414 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2415 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2416 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2417 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2418 q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2419
2420 tmp1 = _SIMD_MUL(h1_imag, q1);
2421
2422 #ifdef __ELPA_USE_FMA__
2423 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2424 #else
2425 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2426 #endif
2427 tmp2 = _SIMD_MUL(h1_imag, q2);
2428 #ifdef __ELPA_USE_FMA__
2429 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2430 #else
2431 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2432 #endif
2433 tmp3 = _SIMD_MUL(h1_imag, q3);
2434 #ifdef __ELPA_USE_FMA__
2435 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2436 #else
2437 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2438 #endif
2439
2440 tmp4 = _SIMD_MUL(h1_imag, q4);
2441 #ifdef __ELPA_USE_FMA__
2442 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2443 #else
2444 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2445 #endif
2446 tmp5 = _SIMD_MUL(h1_imag, q5);
2447 #ifdef __ELPA_USE_FMA__
2448 x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2449 #else
2450 x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2451 #endif
2452
2453 #ifdef BLOCK2
2454
2455 #if VEC_SET == SSE_128
2456 #ifdef DOUBLE_PRECISION_COMPLEX
2457 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
2458 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
2459 #endif
2460 #ifdef SINGLE_PRECISION_COMPLEX
2461 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
2462 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
2463 #endif
2464 #endif /* VEC_SET == SSE_128 */
2465
2466 #if VEC_SET == AVX_256
2467 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
2468 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
2469 #endif /* VEC_SET == AVX_256 */
2470
2471 #if VEC_SET == AVX_512
2472 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
2473 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
2474 #endif /* VEC_SET == AVX_512 */
2475
2476 #ifndef __ELPA_USE_FMA__
2477 // conjugate
2478 h2_imag = _SIMD_XOR(h2_imag, sign);
2479 #endif
2480
2481 tmp1 = _SIMD_MUL(h2_imag, q1);
2482 #ifdef __ELPA_USE_FMA__
2483 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2484 #else
2485 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2486 #endif
2487 tmp2 = _SIMD_MUL(h2_imag, q2);
2488 #ifdef __ELPA_USE_FMA__
2489 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2490 #else
2491 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2492 #endif
2493
2494 tmp3 = _SIMD_MUL(h2_imag, q3);
2495 #ifdef __ELPA_USE_FMA__
2496 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2497 #else
2498 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2499 #endif
2500 tmp4 = _SIMD_MUL(h2_imag, q4);
2501 #ifdef __ELPA_USE_FMA__
2502 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2503 #else
2504 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2505 #endif
2506
2507 tmp5 = _SIMD_MUL(h2_imag, q5);
2508 #ifdef __ELPA_USE_FMA__
2509 y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2510 #else
2511 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2512 #endif
2513
2514 #endif /* BLOCK2 */
2515
2516 }
2517
2518 #ifdef BLOCK2
2519
2520 #if VEC_SET == SSE_128
2521 #ifdef DOUBLE_PRECISION_COMPLEX
2522 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
2523 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
2524 #endif
2525 #ifdef SINGLE_PRECISION_COMPLEX
2526 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
2527 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
2528 #endif
2529 #endif /* VEC_SET == SSE_128 */
2530
2531 #if VEC_SET == AVX_256
2532 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
2533 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
2534 #endif /* VEC_SET == AVX_256 */
2535
2536 #if VEC_SET == AVX_512
2537 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
2538 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
2539 #endif /* VEC_SET == AVX_512 */
2540
2541 #ifndef __ELPA_USE_FMA__
2542 // conjugate
2543 h1_imag = _SIMD_XOR(h1_imag, sign);
2544 #endif
2545
2546 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
2547 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
2548 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
2549 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
2550 q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
2551
2552 tmp1 = _SIMD_MUL(h1_imag, q1);
2553 #ifdef __ELPA_USE_FMA__
2554 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2555 #else
2556 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2557 #endif
2558 tmp2 = _SIMD_MUL(h1_imag, q2);
2559 #ifdef __ELPA_USE_FMA__
2560 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2561 #else
2562 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2563 #endif
2564
2565 tmp3 = _SIMD_MUL(h1_imag, q3);
2566 #ifdef __ELPA_USE_FMA__
2567 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2568 #else
2569 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2570 #endif
2571 tmp4 = _SIMD_MUL(h1_imag, q4);
2572 #ifdef __ELPA_USE_FMA__
2573 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2574 #else
2575 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2576 #endif
2577
2578 tmp5 = _SIMD_MUL(h1_imag, q5);
2579 #ifdef __ELPA_USE_FMA__
2580 x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2581 #else
2582 x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2583 #endif
2584
2585 #endif /* BLOCK2 */
2586
2587 #if VEC_SET == SSE_128
2588 #ifdef DOUBLE_PRECISION_COMPLEX
2589 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
2590 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
2591 #endif
2592 #ifdef SINGLE_PRECISION_COMPLEX
2593 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
2594 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
2595 #endif
2596 #endif /* VEC_SET == SSE_128 */
2597
2598 #if VEC_SET == AVX_256
2599 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
2600 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
2601 #endif /* AVX_256 */
2602
2603 #if VEC_SET == AVX_512
2604 h1_real = _SIMD_SET1(hh_dbl[0]);
2605 h1_imag = _SIMD_SET1(hh_dbl[1]);
2606
2607 #ifdef HAVE_AVX512_XEON_PHI
2608 #ifdef DOUBLE_PRECISION_COMPLEX
2609 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
2610 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
2611 #endif
2612 #ifdef SINGLE_PRECISION_COMPLEX
2613 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
2614 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
2615 #endif
2616 #endif
2617 #ifdef HAVE_AVX512_XEON
2618 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
2619 h1_real = _SIMD_XOR(h1_real, sign);
2620 h1_imag = _SIMD_XOR(h1_imag, sign);
2621 #endif
2622 #endif
2623
2624 #endif /* VEC_SET == AVX_512 */
2625
2626 #if VEC_SET != AVX_512
2627 h1_real = _SIMD_XOR(h1_real, sign);
2628 h1_imag = _SIMD_XOR(h1_imag, sign);
2629 #endif /* VEC_SET != AVX_512 */
2630
2631 tmp1 = _SIMD_MUL(h1_imag, x1);
2632 #ifdef __ELPA_USE_FMA__
2633 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2634 #else
2635 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2636 #endif
2637 tmp2 = _SIMD_MUL(h1_imag, x2);
2638 #ifdef __ELPA_USE_FMA__
2639 x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2640 #else
2641 x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2642 #endif
2643 tmp3 = _SIMD_MUL(h1_imag, x3);
2644 #ifdef __ELPA_USE_FMA__
2645 x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2646 #else
2647 x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2648 #endif
2649
2650 tmp4 = _SIMD_MUL(h1_imag, x4);
2651 #ifdef __ELPA_USE_FMA__
2652 x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2653 #else
2654 x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2655 #endif
2656 tmp5 = _SIMD_MUL(h1_imag, x5);
2657 #ifdef __ELPA_USE_FMA__
2658 x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2659 #else
2660 x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2661 #endif
2662
2663 #ifdef BLOCK2
2664
2665 #if VEC_SET == SSE_128
2666 #ifdef DOUBLE_PRECISION_COMPLEX
2667 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
2668 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
2669 #endif
2670 #ifdef SINGLE_PRECISION_COMPLEX
2671 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
2672 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
2673 #endif
2674
2675 #ifdef DOUBLE_PRECISION_COMPLEX
2676 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
2677 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
2678 #endif
2679 #ifdef SINGLE_PRECISION_COMPLEX
2680 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
2681 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
2682 #endif
2683 #endif /* VEC_SET == SSE_128 */
2684
2685 #if VEC_SET == AVX_256
2686 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
2687 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
2688 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
2689 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
2690 #endif /* VEC_SET == AVX_256 */
2691
2692 #if VEC_SET == AVX_512
2693 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
2694 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
2695 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
2696 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
2697
2698 #ifdef HAVE_AVX512_XEON_PHI
2699
2700 #ifdef DOUBLE_PRECISION_COMPLEX
2701 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
2702 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
2703 #endif
2704 #ifdef SINGLE_PRECISION_COMPLEX
2705 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
2706 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
2707 #endif
2708
2709 #ifdef DOUBLE_PRECISION_COMPLEX
2710 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
2711 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
2712 #endif
2713 #ifdef SINGLE_PRECISION_COMPLEX
2714 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
2715 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
2716 #endif
2717 #endif /* HAVE_AVX512_XEON_PHI */
2718
2719 #ifdef HAVE_AVX512_XEON
2720 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
2721 h1_real = _SIMD_XOR(h1_real, sign);
2722 h1_imag = _SIMD_XOR(h1_imag, sign);
2723 h2_real = _SIMD_XOR(h2_real, sign);
2724 h2_imag = _SIMD_XOR(h2_imag, sign);
2725 #endif
2726 #endif
2727 #endif /* VEC_SET == AVX_512 */
2728
2729 #if VEC_SET != AVX_512
2730 h1_real = _SIMD_XOR(h1_real, sign);
2731 h1_imag = _SIMD_XOR(h1_imag, sign);
2732 h2_real = _SIMD_XOR(h2_real, sign);
2733 h2_imag = _SIMD_XOR(h2_imag, sign);
2734 #endif /* VEC_SET != AVX_512 */
2735
2736 #if VEC_SET == SSE_128
2737 #ifdef SINGLE_PRECISION_COMPLEX
2738 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
2739 #else
2740 tmp2 = _SIMD_LOADU(s_dbl);
2741 #endif
2742 #endif /* VEC_SET == SSE_128 */
2743
2744 #if VEC_SET == AVX_256
2745 #ifdef DOUBLE_PRECISION_COMPLEX
2746 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
2747 #endif
2748 #ifdef SINGLE_PRECISION_COMPLEX
2749 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
2750 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
2751 #endif
2752 #endif /* VEC_SET == AVX_256 */
2753
2754 #if VEC_SET == AVX_512
2755 #ifdef DOUBLE_PRECISION_COMPLEX
2756 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
2757 s_dbl[1], s_dbl[0],
2758 s_dbl[1], s_dbl[0],
2759 s_dbl[1], s_dbl[0]);
2760 #endif
2761 #ifdef SINGLE_PRECISION_COMPLEX
2762 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
2763 #endif
2764 #endif /* VEC_SET == AVX_512 */
2765
2766 tmp1 = _SIMD_MUL(h2_imag, tmp2);
2767 #ifdef __ELPA_USE_FMA__
2768 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2769 #else
2770 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2771 #endif
2772
2773 #if VEC_SET == AVX_512
2774 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
2775
2776 h2_real = _SIMD_SET1(s_dbl[0]);
2777 h2_imag = _SIMD_SET1(s_dbl[1]);
2778 #endif /* VEC_SET == AVX_512 */
2779
2780 #if VEC_SET == SSE_128
2781 #ifdef DOUBLE_PRECISION_COMPLEX
2782 h2_real = _mm_movedup_pd(tmp2);
2783 h2_imag = _mm_set1_pd(tmp2[1]);
2784 #endif
2785 #ifdef SINGLE_PRECISION_COMPLEX
2786 h2_real = _mm_moveldup_ps(tmp2);
2787 h2_imag = _mm_movehdup_ps(tmp2);
2788 #endif
2789 #endif /* VEC_SET == SSE_128 */
2790
2791 #if VEC_SET == AVX_256
2792 h2_real = _SIMD_SET1(tmp2[0]);
2793 h2_imag = _SIMD_SET1(tmp2[1]);
2794 #endif /* VEC_SET == AVX_256 */
2795 tmp1 = _SIMD_MUL(h1_imag, y1);
2796 #ifdef __ELPA_USE_FMA__
2797 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2798 #else
2799 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2800 #endif
2801 tmp2 = _SIMD_MUL(h1_imag, y2);
2802 #ifdef __ELPA_USE_FMA__
2803 y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2804 #else
2805 y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2806 #endif
2807
2808 tmp3 = _SIMD_MUL(h1_imag, y3);
2809 #ifdef __ELPA_USE_FMA__
2810 y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2811 #else
2812 y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2813 #endif
2814 tmp4 = _SIMD_MUL(h1_imag, y4);
2815 #ifdef __ELPA_USE_FMA__
2816 y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2817 #else
2818 y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2819 #endif
2820
2821 tmp5 = _SIMD_MUL(h1_imag, y5);
2822 #ifdef __ELPA_USE_FMA__
2823 y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2824 #else
2825 y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2826 #endif
2827
2828 tmp1 = _SIMD_MUL(h2_imag, x1);
2829 #ifdef __ELPA_USE_FMA__
2830 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2831 #else
2832 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2833 #endif
2834 tmp2 = _SIMD_MUL(h2_imag, x2);
2835 #ifdef __ELPA_USE_FMA__
2836 y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2837 #else
2838 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2839 #endif
2840
2841 tmp3 = _SIMD_MUL(h2_imag, x3);
2842 #ifdef __ELPA_USE_FMA__
2843 y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2844 #else
2845 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2846 #endif
2847 tmp4 = _SIMD_MUL(h2_imag, x4);
2848 #ifdef __ELPA_USE_FMA__
2849 y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2850 #else
2851 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2852 #endif
2853
2854 tmp5 = _SIMD_MUL(h2_imag, x5);
2855 #ifdef __ELPA_USE_FMA__
2856 y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2857 #else
2858 y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2859 #endif
2860
2861 #endif /* BLOCK2 */
2862
2863 q1 = _SIMD_LOAD(&q_dbl[0]);
2864 q2 = _SIMD_LOAD(&q_dbl[offset]);
2865 q3 = _SIMD_LOAD(&q_dbl[2*offset]);
2866 q4 = _SIMD_LOAD(&q_dbl[3*offset]);
2867 q5 = _SIMD_LOAD(&q_dbl[4*offset]);
2868
2869 #ifdef BLOCK1
2870 q1 = _SIMD_ADD(q1, x1);
2871 q2 = _SIMD_ADD(q2, x2);
2872 q3 = _SIMD_ADD(q3, x3);
2873 q4 = _SIMD_ADD(q4, x4);
2874 q5 = _SIMD_ADD(q5, x5);
2875 #endif
2876
2877
2878 #ifdef BLOCK2
2879 q1 = _SIMD_ADD(q1, y1);
2880 q2 = _SIMD_ADD(q2, y2);
2881 q3 = _SIMD_ADD(q3, y3);
2882 q4 = _SIMD_ADD(q4, y4);
2883 q5 = _SIMD_ADD(q5, y5);
2884 #endif
2885 _SIMD_STORE(&q_dbl[0], q1);
2886 _SIMD_STORE(&q_dbl[offset], q2);
2887 _SIMD_STORE(&q_dbl[2*offset], q3);
2888 _SIMD_STORE(&q_dbl[3*offset], q4);
2889 _SIMD_STORE(&q_dbl[4*offset], q5);
2890
2891
2892 #ifdef BLOCK2
2893
2894 #if VEC_SET == SSE_128
2895 #ifdef DOUBLE_PRECISION_COMPLEX
2896 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
2897 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
2898 #endif
2899 #ifdef SINGLE_PRECISION_COMPLEX
2900 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
2901 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
2902 #endif
2903 #endif /* VEC_SET == SSE_128 */
2904
2905 #if VEC_SET == AVX_256
2906 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
2907 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
2908 #endif /* VEC_SET == AVX_256 */
2909
2910 #if VEC_SET == AVX_512
2911 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
2912 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
2913 #endif /* VEC_SET == AVX_512 */
2914
2915 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
2916 q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
2917 q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
2918 q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
2919 q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]);
2920
2921 q1 = _SIMD_ADD(q1, x1);
2922 q2 = _SIMD_ADD(q2, x2);
2923 q3 = _SIMD_ADD(q3, x3);
2924 q4 = _SIMD_ADD(q4, x4);
2925 q5 = _SIMD_ADD(q5, x5);
2926
2927 tmp1 = _SIMD_MUL(h2_imag, y1);
2928
2929 #ifdef __ELPA_USE_FMA__
2930 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2931 #else
2932 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2933 #endif
2934 tmp2 = _SIMD_MUL(h2_imag, y2);
2935 #ifdef __ELPA_USE_FMA__
2936 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2937 #else
2938 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2939 #endif
2940
2941 tmp3 = _SIMD_MUL(h2_imag, y3);
2942 #ifdef __ELPA_USE_FMA__
2943 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2944 #else
2945 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2946 #endif
2947 tmp4 = _SIMD_MUL(h2_imag, y4);
2948 #ifdef __ELPA_USE_FMA__
2949 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2950 #else
2951 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2952 #endif
2953
2954 tmp5 = _SIMD_MUL(h2_imag, y5);
2955 #ifdef __ELPA_USE_FMA__
2956 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2957 #else
2958 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2959 #endif
2960
2961 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
2962 _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
2963 _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
2964 _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
2965 _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5);
2966
2967 #endif /* BLOCK2 */
2968
2969
2970 for (i = BLOCK; i < nb; i++)
2971 {
2972
2973 #if VEC_SET == SSE_128
2974 #ifdef DOUBLE_PRECISION_COMPLEX
2975 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2976 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2977 #endif
2978 #ifdef SINGLE_PRECISION_COMPLEX
2979 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2980 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2981 #endif
2982 #endif /* VEC_SET == SSE_128 */
2983
2984 #if VEC_SET == AVX_256
2985 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2986 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2987 #endif /* VEC_SET == AVX_256 */
2988
2989 #if VEC_SET == AVX_512
2990 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2991 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2992 #endif /* VEC_SET == AVX_512 */
2993
2994 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2995 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2996 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2997 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2998 q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2999
3000 tmp1 = _SIMD_MUL(h1_imag, x1);
3001 #ifdef __ELPA_USE_FMA__
3002 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3003 #else
3004 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3005 #endif
3006 tmp2 = _SIMD_MUL(h1_imag, x2);
3007 #ifdef __ELPA_USE_FMA__
3008 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3009 #else
3010 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3011 #endif
3012 tmp3 = _SIMD_MUL(h1_imag, x3);
3013 #ifdef __ELPA_USE_FMA__
3014 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3015 #else
3016 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3017 #endif
3018
3019 tmp4 = _SIMD_MUL(h1_imag, x4);
3020 #ifdef __ELPA_USE_FMA__
3021 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3022 #else
3023 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3024 #endif
3025 tmp5 = _SIMD_MUL(h1_imag, x5);
3026 #ifdef __ELPA_USE_FMA__
3027 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3028 #else
3029 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3030 #endif
3031
3032 #ifdef BLOCK2
3033
3034 #if VEC_SET == SSE_128
3035 #ifdef DOUBLE_PRECISION_COMPLEX
3036 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3037 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3038 #endif
3039 #ifdef SINGLE_PRECISION_COMPLEX
3040 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3041 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3042 #endif
3043 #endif /* VEC_SET == SSE_128 */
3044
3045 #if VEC_SET == AVX_256
3046 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3047 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3048 #endif /* VEC_SET == AVX_256 */
3049
3050 #if VEC_SET == AVX_512
3051 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3052 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3053 #endif /* VEC_SET == AVX_512 */
3054
3055 tmp1 = _SIMD_MUL(h2_imag, y1);
3056 #ifdef __ELPA_USE_FMA__
3057 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3058 #else
3059 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3060 #endif
3061 tmp2 = _SIMD_MUL(h2_imag, y2);
3062 #ifdef __ELPA_USE_FMA__
3063 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3064 #else
3065 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3066 #endif
3067
3068 tmp3 = _SIMD_MUL(h2_imag, y3);
3069 #ifdef __ELPA_USE_FMA__
3070 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3071 #else
3072 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3073 #endif
3074 tmp4 = _SIMD_MUL(h2_imag, y4);
3075 #ifdef __ELPA_USE_FMA__
3076 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3077 #else
3078 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3079 #endif
3080
3081 tmp5 = _SIMD_MUL(h2_imag, y5);
3082 #ifdef __ELPA_USE_FMA__
3083 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3084 #else
3085 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3086 #endif
3087
3088 #endif /* BLOCK2 */
3089
3090 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
3091 _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
3092 _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
3093 _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
3094 _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
3095 }
3096 #ifdef BLOCK2
3097
3098 #if VEC_SET == SSE_128
3099 #ifdef DOUBLE_PRECISION_COMPLEX
3100 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3101 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3102 #endif
3103 #ifdef SINGLE_PRECISION_COMPLEX
3104 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3105 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3106 #endif
3107 #endif /* VEC_SET == SSE_128 */
3108
3109 #if VEC_SET == AVX_256
3110 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3111 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3112 #endif /* VEC_SET == AVX_256 */
3113
3114 #if VEC_SET == AVX_512
3115 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3116 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3117 #endif /* VEC_SET == AVX_512 */
3118
3119 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3120 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3121 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3122 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3123 q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
3124
3125 tmp1 = _SIMD_MUL(h1_imag, x1);
3126 #ifdef __ELPA_USE_FMA__
3127 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3128 #else
3129 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3130 #endif
3131 tmp2 = _SIMD_MUL(h1_imag, x2);
3132 #ifdef __ELPA_USE_FMA__
3133 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3134 #else
3135 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3136 #endif
3137
3138 tmp3 = _SIMD_MUL(h1_imag, x3);
3139 #ifdef __ELPA_USE_FMA__
3140 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3141 #else
3142 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3143 #endif
3144 tmp4 = _SIMD_MUL(h1_imag, x4);
3145 #ifdef __ELPA_USE_FMA__
3146 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3147 #else
3148 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3149 #endif
3150
3151 tmp5 = _SIMD_MUL(h1_imag, x5);
3152 #ifdef __ELPA_USE_FMA__
3153 q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3154 #else
3155 q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3156 #endif
3157
3158 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
3159 _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
3160 _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
3161 _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
3162 _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5);
3163
3164 #endif /* BLOCK2 */
3165
3166 }
3167
3168 #if VEC_SET == SSE_128
3169 #ifdef DOUBLE_PRECISION_COMPLEX
3170 #define ROW_LENGTH 4
3171 #endif
3172 #ifdef SINGLE_PRECISION_COMPLEX
3173 #define ROW_LENGTH 8
3174 #endif
3175 #endif /* VEC_SET == SSE_128 */
3176
3177 #if VEC_SET == AVX_256
3178 #ifdef DOUBLE_PRECISION_COMPLEX
3179 #define ROW_LENGTH 8
3180 #endif
3181 #ifdef SINGLE_PRECISION_COMPLEX
3182 #define ROW_LENGTH 16
3183 #endif
3184 #endif /* VEC_SET == AVX_256 */
3185
3186 #if VEC_SET == AVX_512
3187 #ifdef DOUBLE_PRECISION_COMPLEX
3188 #define ROW_LENGTH 16
3189 #endif
3190 #ifdef SINGLE_PRECISION_COMPLEX
3191 #define ROW_LENGTH 32
3192 #endif
3193 #endif /* VEC_SET == AVX_512 */
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)3194 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
3195 #ifdef BLOCK1
3196 )
3197 #endif
3198 #ifdef BLOCK2
3199 ,int ldh, DATA_TYPE s)
3200 #endif
3201 {
3202 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
3203 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
3204 #ifdef BLOCK2
3205 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
3206 #endif
3207
3208 __SIMD_DATATYPE x1, x2, x3, x4;
3209 __SIMD_DATATYPE q1, q2, q3, q4;
3210 #ifdef BLOCK2
3211 __SIMD_DATATYPE y1, y2, y3, y4;
3212 __SIMD_DATATYPE h2_real, h2_imag;
3213 #endif
3214 __SIMD_DATATYPE h1_real, h1_imag;
3215 __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4;
3216 int i=0;
3217
3218 #if VEC_SET == SSE_128
3219 #ifdef DOUBLE_PRECISION_COMPLEX
3220 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
3221 #endif
3222 #ifdef SINGLE_PRECISION_COMPLEX
3223 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
3224 #endif
3225 #endif /* VEC_SET == SSE_128 */
3226
3227 #if VEC_SET == AVX_256
3228 #ifdef DOUBLE_PRECISION_COMPLEX
3229 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
3230 #endif
3231 #ifdef SINGLE_PRECISION_COMPLEX
3232 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
3233 #endif
3234 #endif /* VEC_SET == AVX_256 */
3235
3236 #if VEC_SET == AVX_512
3237 #ifdef DOUBLE_PRECISION_COMPLEX
3238 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
3239 #endif
3240 #ifdef SINGLE_PRECISION_COMPLEX
3241 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
3242 #endif
3243 #endif /* VEC_SET == AVX_512 */
3244
3245 #ifdef BLOCK2
3246 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
3247 x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
3248 x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
3249 x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
3250
3251 #if VEC_SET == SSE_128
3252 #ifdef DOUBLE_PRECISION_COMPLEX
3253 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
3254 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
3255 #endif
3256 #ifdef SINGLE_PRECISION_COMPLEX
3257 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
3258 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
3259 #endif
3260 #endif /* VEC_SET == SSE_128 */
3261
3262 #if VEC_SET == AVX_256
3263 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
3264 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
3265 #endif /* VEC_SET == AVX_256 */
3266
3267 #if VEC_SET == AVX_512
3268 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
3269 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
3270 #endif /* VEC_SET == AVX_512 */
3271
3272 #ifndef __ELPA_USE_FMA__
3273 // conjugate
3274 h2_imag = _SIMD_XOR(h2_imag, sign);
3275 #endif
3276
3277 y1 = _SIMD_LOAD(&q_dbl[0]);
3278 y2 = _SIMD_LOAD(&q_dbl[offset]);
3279 y3 = _SIMD_LOAD(&q_dbl[2*offset]);
3280 y4 = _SIMD_LOAD(&q_dbl[3*offset]);
3281
3282 tmp1 = _SIMD_MUL(h2_imag, x1);
3283 #ifdef __ELPA_USE_FMA__
3284 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3285 #else
3286 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3287 #endif
3288
3289 tmp2 = _SIMD_MUL(h2_imag, x2);
3290 #ifdef __ELPA_USE_FMA__
3291 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3292 #else
3293 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3294 #endif
3295
3296 tmp3 = _SIMD_MUL(h2_imag, x3);
3297 #ifdef __ELPA_USE_FMA__
3298 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3299 #else
3300 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3301 #endif
3302
3303 tmp4 = _SIMD_MUL(h2_imag, x4);
3304 #ifdef __ELPA_USE_FMA__
3305 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3306 #else
3307 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3308 #endif
3309
3310 #endif /* BLOCK2 */
3311
3312 #ifdef BLOCK1
3313 x1 = _SIMD_LOAD(&q_dbl[0]);
3314 x2 = _SIMD_LOAD(&q_dbl[offset]);
3315 x3 = _SIMD_LOAD(&q_dbl[2*offset]);
3316 x4 = _SIMD_LOAD(&q_dbl[3*offset]);
3317 #endif
3318
3319 for (i = BLOCK; i < nb; i++)
3320 {
3321 #if VEC_SET == SSE_128
3322 #ifdef DOUBLE_PRECISION_COMPLEX
3323 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
3324 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
3325 #endif
3326 #ifdef SINGLE_PRECISION_COMPLEX
3327 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
3328 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
3329 #endif
3330 #endif /* VEC_SET == SSE_128 */
3331
3332 #if VEC_SET == AVX_256
3333 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
3334 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
3335 #endif /* VEC_SET == AVX_256 */
3336
3337 #if VEC_SET == AVX_512
3338 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
3339 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
3340 #endif /* VEC_SET == AVX_512 */
3341
3342 #ifndef __ELPA_USE_FMA__
3343 // conjugate
3344 h1_imag = _SIMD_XOR(h1_imag, sign);
3345 #endif
3346
3347 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
3348 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
3349 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
3350 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
3351
3352 tmp1 = _SIMD_MUL(h1_imag, q1);
3353
3354 #ifdef __ELPA_USE_FMA__
3355 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3356 #else
3357 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3358 #endif
3359
3360 tmp2 = _SIMD_MUL(h1_imag, q2);
3361 #ifdef __ELPA_USE_FMA__
3362 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3363 #else
3364 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3365 #endif
3366
3367 tmp3 = _SIMD_MUL(h1_imag, q3);
3368 #ifdef __ELPA_USE_FMA__
3369 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3370 #else
3371 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3372 #endif
3373 tmp4 = _SIMD_MUL(h1_imag, q4);
3374 #ifdef __ELPA_USE_FMA__
3375 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3376 #else
3377 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3378 #endif
3379
3380 #ifdef BLOCK2
3381
3382 #if VEC_SET == SSE_128
3383 #ifdef DOUBLE_PRECISION_COMPLEX
3384 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3385 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3386 #endif
3387 #ifdef SINGLE_PRECISION_COMPLEX
3388 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3389 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3390 #endif
3391 #endif /* VEC_SET == SSE_128 */
3392
3393 #if VEC_SET == AVX_256
3394 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3395 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3396 #endif /* VEC_SET == AVX_256 */
3397
3398 #if VEC_SET == AVX_512
3399 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3400 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3401 #endif /* VEC_SET == AVX_512 */
3402
3403 #ifndef __ELPA_USE_FMA__
3404 // conjugate
3405 h2_imag = _SIMD_XOR(h2_imag, sign);
3406 #endif
3407
3408 tmp1 = _SIMD_MUL(h2_imag, q1);
3409 #ifdef __ELPA_USE_FMA__
3410 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3411 #else
3412 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3413 #endif
3414 tmp2 = _SIMD_MUL(h2_imag, q2);
3415 #ifdef __ELPA_USE_FMA__
3416 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3417 #else
3418 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3419 #endif
3420
3421 tmp3 = _SIMD_MUL(h2_imag, q3);
3422 #ifdef __ELPA_USE_FMA__
3423 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3424 #else
3425 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3426 #endif
3427 tmp4 = _SIMD_MUL(h2_imag, q4);
3428 #ifdef __ELPA_USE_FMA__
3429 y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3430 #else
3431 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3432 #endif
3433 #endif /* BLOCK2 */
3434 }
3435
3436 #ifdef BLOCK2
3437
3438 #if VEC_SET == SSE_128
3439 #ifdef DOUBLE_PRECISION_COMPLEX
3440 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3441 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3442 #endif
3443 #ifdef SINGLE_PRECISION_COMPLEX
3444 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3445 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3446 #endif
3447 #endif /* VEC_SET == SSE_128 */
3448
3449 #if VEC_SET == AVX_256
3450 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3451 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3452 #endif /* VEC_SET == AVX_256 */
3453
3454 #if VEC_SET == AVX_512
3455 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3456 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3457 #endif /* VEC_SET == AVX_512 */
3458
3459 #ifndef __ELPA_USE_FMA__
3460 // conjugate
3461 h1_imag = _SIMD_XOR(h1_imag, sign);
3462 #endif
3463
3464 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3465 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3466 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3467 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3468
3469 tmp1 = _SIMD_MUL(h1_imag, q1);
3470 #ifdef __ELPA_USE_FMA__
3471 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3472 #else
3473 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3474 #endif
3475
3476 tmp2 = _SIMD_MUL(h1_imag, q2);
3477 #ifdef __ELPA_USE_FMA__
3478 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3479 #else
3480 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3481 #endif
3482
3483 tmp3 = _SIMD_MUL(h1_imag, q3);
3484 #ifdef __ELPA_USE_FMA__
3485 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3486 #else
3487 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3488 #endif
3489
3490 tmp4 = _SIMD_MUL(h1_imag, q4);
3491 #ifdef __ELPA_USE_FMA__
3492 x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3493 #else
3494 x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3495 #endif
3496
3497 #endif /* BLOCK2 */
3498
3499 #if VEC_SET == SSE_128
3500 #ifdef DOUBLE_PRECISION_COMPLEX
3501 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
3502 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
3503 #endif
3504 #ifdef SINGLE_PRECISION_COMPLEX
3505 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
3506 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
3507 #endif
3508 #endif /* VEC_SET == SSE_128 */
3509
3510 #if VEC_SET == AVX_256
3511 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
3512 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
3513 #endif /* AVX_256 */
3514
3515 #if VEC_SET == AVX_512
3516 h1_real = _SIMD_SET1(hh_dbl[0]);
3517 h1_imag = _SIMD_SET1(hh_dbl[1]);
3518
3519 #ifdef HAVE_AVX512_XEON_PHI
3520 #ifdef DOUBLE_PRECISION_COMPLEX
3521 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
3522 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
3523 #endif
3524 #ifdef SINGLE_PRECISION_COMPLEX
3525 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
3526 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
3527 #endif
3528 #endif
3529 #ifdef HAVE_AVX512_XEON
3530 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
3531 h1_real = _SIMD_XOR(h1_real, sign);
3532 h1_imag = _SIMD_XOR(h1_imag, sign);
3533 #endif
3534 #endif
3535
3536 #endif /* VEC_SET == AVX_512 */
3537
3538 #if VEC_SET != AVX_512
3539 h1_real = _SIMD_XOR(h1_real, sign);
3540 h1_imag = _SIMD_XOR(h1_imag, sign);
3541 #endif /* VEC_SET != AVX_512 */
3542
3543 tmp1 = _SIMD_MUL(h1_imag, x1);
3544 #ifdef __ELPA_USE_FMA__
3545 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3546 #else
3547 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3548 #endif
3549
3550 tmp2 = _SIMD_MUL(h1_imag, x2);
3551 #ifdef __ELPA_USE_FMA__
3552 x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3553 #else
3554 x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3555 #endif
3556
3557 tmp3 = _SIMD_MUL(h1_imag, x3);
3558 #ifdef __ELPA_USE_FMA__
3559 x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3560 #else
3561 x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3562 #endif
3563
3564 tmp4 = _SIMD_MUL(h1_imag, x4);
3565 #ifdef __ELPA_USE_FMA__
3566 x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3567 #else
3568 x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3569 #endif
3570
3571 #ifdef BLOCK2
3572
3573 #if VEC_SET == SSE_128
3574 #ifdef DOUBLE_PRECISION_COMPLEX
3575 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
3576 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
3577 #endif
3578 #ifdef SINGLE_PRECISION_COMPLEX
3579 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
3580 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
3581 #endif
3582
3583 #ifdef DOUBLE_PRECISION_COMPLEX
3584 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
3585 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
3586 #endif
3587 #ifdef SINGLE_PRECISION_COMPLEX
3588 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
3589 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
3590 #endif
3591 #endif /* VEC_SET == 128 */
3592
3593 #if VEC_SET == AVX_256
3594 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
3595 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
3596 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
3597 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
3598 #endif /* VEC_SET == AVX_256 */
3599
3600 #if VEC_SET == AVX_512
3601 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
3602 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
3603 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
3604 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
3605
3606 #ifdef HAVE_AVX512_XEON_PHI
3607
3608 #ifdef DOUBLE_PRECISION_COMPLEX
3609 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
3610 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
3611 #endif
3612 #ifdef SINGLE_PRECISION_COMPLEX
3613 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
3614 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
3615 #endif
3616
3617 #ifdef DOUBLE_PRECISION_COMPLEX
3618 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
3619 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
3620 #endif
3621 #ifdef SINGLE_PRECISION_COMPLEX
3622 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
3623 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
3624 #endif
3625 #endif /* HAVE_AVX512_XEON_PHI */
3626
3627 #ifdef HAVE_AVX512_XEON
3628 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
3629 h1_real = _SIMD_XOR(h1_real, sign);
3630 h1_imag = _SIMD_XOR(h1_imag, sign);
3631 h2_real = _SIMD_XOR(h2_real, sign);
3632 h2_imag = _SIMD_XOR(h2_imag, sign);
3633 #endif
3634 #endif
3635 #endif /* VEC_SET == AVX_512 */
3636
3637 #if VEC_SET != AVX_512
3638 h1_real = _SIMD_XOR(h1_real, sign);
3639 h1_imag = _SIMD_XOR(h1_imag, sign);
3640 h2_real = _SIMD_XOR(h2_real, sign);
3641 h2_imag = _SIMD_XOR(h2_imag, sign);
3642 #endif /* VEC_SET != AVX_512 */
3643
3644 #if VEC_SET == SSE_128
3645 #ifdef SINGLE_PRECISION_COMPLEX
3646 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
3647 #else
3648 tmp2 = _SIMD_LOADU(s_dbl);
3649 #endif
3650 #endif /* VEC_SET == SSE_128 */
3651
3652 #if VEC_SET == AVX_256
3653 #ifdef DOUBLE_PRECISION_COMPLEX
3654 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
3655 #endif
3656 #ifdef SINGLE_PRECISION_COMPLEX
3657 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
3658 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
3659 #endif
3660 #endif /* VEC_SET == AVX_256 */
3661
3662 #if VEC_SET == AVX_512
3663 #ifdef DOUBLE_PRECISION_COMPLEX
3664 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
3665 s_dbl[1], s_dbl[0],
3666 s_dbl[1], s_dbl[0],
3667 s_dbl[1], s_dbl[0]);
3668 #endif
3669 #ifdef SINGLE_PRECISION_COMPLEX
3670 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
3671 #endif
3672 #endif /* VEC_SET == AVX_512 */
3673
3674 tmp1 = _SIMD_MUL(h2_imag, tmp2);
3675 #ifdef __ELPA_USE_FMA__
3676 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3677 #else
3678 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3679 #endif
3680
3681 #if VEC_SET == AVX_512
3682 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
3683
3684 h2_real = _SIMD_SET1(s_dbl[0]);
3685 h2_imag = _SIMD_SET1(s_dbl[1]);
3686 #endif /* VEC_SET == AVX_512 */
3687
3688 #if VEC_SET == SSE_128
3689 #ifdef DOUBLE_PRECISION_COMPLEX
3690 h2_real = _mm_movedup_pd(tmp2);
3691 h2_imag = _mm_set1_pd(tmp2[1]);
3692 #endif
3693 #ifdef SINGLE_PRECISION_COMPLEX
3694 h2_real = _mm_moveldup_ps(tmp2);
3695 h2_imag = _mm_movehdup_ps(tmp2);
3696 #endif
3697 #endif /* VEC_SET == SSE_128 */
3698
3699 #if VEC_SET == AVX_256
3700 h2_real = _SIMD_SET1(tmp2[0]);
3701 h2_imag = _SIMD_SET1(tmp2[1]);
3702 #endif /* VEC_SET == AVX_256 */
3703 tmp1 = _SIMD_MUL(h1_imag, y1);
3704 #ifdef __ELPA_USE_FMA__
3705 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3706 #else
3707 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3708 #endif
3709
3710 tmp2 = _SIMD_MUL(h1_imag, y2);
3711 #ifdef __ELPA_USE_FMA__
3712 y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3713 #else
3714 y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3715 #endif
3716
3717 tmp3 = _SIMD_MUL(h1_imag, y3);
3718 #ifdef __ELPA_USE_FMA__
3719 y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3720 #else
3721 y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3722 #endif
3723
3724 tmp4 = _SIMD_MUL(h1_imag, y4);
3725 #ifdef __ELPA_USE_FMA__
3726 y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3727 #else
3728 y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3729 #endif
3730
3731 tmp1 = _SIMD_MUL(h2_imag, x1);
3732 #ifdef __ELPA_USE_FMA__
3733 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3734 #else
3735 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3736 #endif
3737
3738 tmp2 = _SIMD_MUL(h2_imag, x2);
3739 #ifdef __ELPA_USE_FMA__
3740 y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3741 #else
3742 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3743 #endif
3744
3745 tmp3 = _SIMD_MUL(h2_imag, x3);
3746 #ifdef __ELPA_USE_FMA__
3747 y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3748 #else
3749 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3750 #endif
3751
3752 tmp4 = _SIMD_MUL(h2_imag, x4);
3753 #ifdef __ELPA_USE_FMA__
3754 y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3755 #else
3756 y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3757 #endif
3758
3759 #endif /* BLOCK2 */
3760
3761 q1 = _SIMD_LOAD(&q_dbl[0]);
3762 q2 = _SIMD_LOAD(&q_dbl[offset]);
3763 q3 = _SIMD_LOAD(&q_dbl[2*offset]);
3764 q4 = _SIMD_LOAD(&q_dbl[3*offset]);
3765
3766 #ifdef BLOCK1
3767 q1 = _SIMD_ADD(q1, x1);
3768 q2 = _SIMD_ADD(q2, x2);
3769 q3 = _SIMD_ADD(q3, x3);
3770 q4 = _SIMD_ADD(q4, x4);
3771 #endif
3772
3773 #ifdef BLOCK2
3774 q1 = _SIMD_ADD(q1, y1);
3775 q2 = _SIMD_ADD(q2, y2);
3776 q3 = _SIMD_ADD(q3, y3);
3777 q4 = _SIMD_ADD(q4, y4);
3778 #endif
3779
3780 _SIMD_STORE(&q_dbl[0], q1);
3781 _SIMD_STORE(&q_dbl[offset], q2);
3782 _SIMD_STORE(&q_dbl[2*offset], q3);
3783 _SIMD_STORE(&q_dbl[3*offset], q4);
3784
3785 #ifdef BLOCK2
3786
3787 #if VEC_SET == SSE_128
3788 #ifdef DOUBLE_PRECISION_COMPLEX
3789 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
3790 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
3791 #endif
3792 #ifdef SINGLE_PRECISION_COMPLEX
3793 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
3794 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
3795 #endif
3796 #endif /* VEC_SET == SSE_128 */
3797
3798 #if VEC_SET == AVX_256
3799 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
3800 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
3801 #endif /* VEC_SET == AVX_256 */
3802
3803 #if VEC_SET == AVX_512
3804 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
3805 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
3806 #endif /* VEC_SET == AVX_512 */
3807
3808 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
3809 q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
3810 q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
3811 q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
3812
3813 q1 = _SIMD_ADD(q1, x1);
3814 q2 = _SIMD_ADD(q2, x2);
3815 q3 = _SIMD_ADD(q3, x3);
3816 q4 = _SIMD_ADD(q4, x4);
3817
3818 tmp1 = _SIMD_MUL(h2_imag, y1);
3819 #ifdef __ELPA_USE_FMA__
3820 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3821 #else
3822 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3823 #endif
3824
3825 tmp2 = _SIMD_MUL(h2_imag, y2);
3826 #ifdef __ELPA_USE_FMA__
3827 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3828 #else
3829 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3830 #endif
3831
3832 tmp3 = _SIMD_MUL(h2_imag, y3);
3833 #ifdef __ELPA_USE_FMA__
3834 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3835 #else
3836 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3837 #endif
3838
3839 tmp4 = _SIMD_MUL(h2_imag, y4);
3840 #ifdef __ELPA_USE_FMA__
3841 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3842 #else
3843 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3844 #endif
3845
3846 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
3847 _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
3848 _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
3849 _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
3850
3851 #endif /* BLOCK2 */
3852
3853 for (i = BLOCK; i < nb; i++)
3854 {
3855
3856 #if VEC_SET == SSE_128
3857 #ifdef DOUBLE_PRECISION_COMPLEX
3858 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
3859 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
3860 #endif
3861 #ifdef SINGLE_PRECISION_COMPLEX
3862 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
3863 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
3864 #endif
3865 #endif /* VEC_SET == SSE_128 */
3866
3867 #if VEC_SET == AVX_256
3868 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
3869 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
3870 #endif /* VEC_SET == AVX_256 */
3871
3872 #if VEC_SET == AVX_512
3873 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
3874 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
3875 #endif /* VEC_SET == AVX_512 */
3876
3877 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
3878 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
3879 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
3880 q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
3881
3882 tmp1 = _SIMD_MUL(h1_imag, x1);
3883
3884 #ifdef __ELPA_USE_FMA__
3885 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3886 #else
3887 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3888 #endif
3889 tmp2 = _SIMD_MUL(h1_imag, x2);
3890 #ifdef __ELPA_USE_FMA__
3891 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3892 #else
3893 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3894 #endif
3895
3896 tmp3 = _SIMD_MUL(h1_imag, x3);
3897 #ifdef __ELPA_USE_FMA__
3898 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3899 #else
3900 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3901 #endif
3902 tmp4 = _SIMD_MUL(h1_imag, x4);
3903 #ifdef __ELPA_USE_FMA__
3904 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3905 #else
3906 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3907 #endif
3908
3909 #ifdef BLOCK2
3910
3911 #if VEC_SET == SSE_128
3912 #ifdef DOUBLE_PRECISION_COMPLEX
3913 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3914 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3915 #endif
3916 #ifdef SINGLE_PRECISION_COMPLEX
3917 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3918 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3919 #endif
3920 #endif /* VEC_SET == SSE_128 */
3921
3922 #if VEC_SET == AVX_256
3923 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3924 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3925 #endif /* VEC_SET == AVX_256 */
3926
3927 #if VEC_SET == AVX_512
3928 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3929 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3930 #endif /* VEC_SET == AVX_512 */
3931
3932 tmp1 = _SIMD_MUL(h2_imag, y1);
3933 #ifdef __ELPA_USE_FMA__
3934 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3935 #else
3936 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3937 #endif
3938 tmp2 = _SIMD_MUL(h2_imag, y2);
3939 #ifdef __ELPA_USE_FMA__
3940 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3941 #else
3942 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3943 #endif
3944
3945 tmp3 = _SIMD_MUL(h2_imag, y3);
3946 #ifdef __ELPA_USE_FMA__
3947 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3948 #else
3949 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3950 #endif
3951 tmp4 = _SIMD_MUL(h2_imag, y4);
3952 #ifdef __ELPA_USE_FMA__
3953 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3954 #else
3955 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3956 #endif
3957
3958 #endif /* BLOCK2 */
3959
3960 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
3961 _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
3962 _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
3963 _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
3964
3965 }
3966 #ifdef BLOCK2
3967
3968 #if VEC_SET == SSE_128
3969 #ifdef DOUBLE_PRECISION_COMPLEX
3970 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3971 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3972 #endif
3973 #ifdef SINGLE_PRECISION_COMPLEX
3974 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3975 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3976 #endif
3977 #endif /* VEC_SET == SSE_128 */
3978
3979 #if VEC_SET == AVX_256
3980 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3981 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3982 #endif /* VEC_SET == AVX_256 */
3983
3984 #if VEC_SET == AVX_512
3985 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3986 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3987 #endif /* VEC_SET == AVX_512 */
3988
3989 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3990 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3991 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3992 q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3993
3994 tmp1 = _SIMD_MUL(h1_imag, x1);
3995 #ifdef __ELPA_USE_FMA__
3996 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3997 #else
3998 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3999 #endif
4000 tmp2 = _SIMD_MUL(h1_imag, x2);
4001 #ifdef __ELPA_USE_FMA__
4002 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4003 #else
4004 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4005 #endif
4006
4007 tmp3 = _SIMD_MUL(h1_imag, x3);
4008 #ifdef __ELPA_USE_FMA__
4009 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4010 #else
4011 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4012 #endif
4013 tmp4 = _SIMD_MUL(h1_imag, x4);
4014 #ifdef __ELPA_USE_FMA__
4015 q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
4016 #else
4017 q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
4018 #endif
4019
4020 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
4021 _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
4022 _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
4023 _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
4024
4025 #endif /* BLOCK2 */
4026 }
4027
4028
4029 #if VEC_SET == SSE_128
4030 #ifdef DOUBLE_PRECISION_COMPLEX
4031 #define ROW_LENGTH 3
4032 #endif
4033 #ifdef SINGLE_PRECISION_COMPLEX
4034 #define ROW_LENGTH 6
4035 #endif
4036 #endif /* VEC_SET == SSE_128 */
4037
4038 #if VEC_SET == AVX_256
4039 #ifdef DOUBLE_PRECISION_COMPLEX
4040 #define ROW_LENGTH 6
4041 #endif
4042 #ifdef SINGLE_PRECISION_COMPLEX
4043 #define ROW_LENGTH 12
4044 #endif
4045 #endif /* VEC_SET == AVX_256 */
4046
4047 #if VEC_SET == AVX_512
4048 #ifdef DOUBLE_PRECISION_COMPLEX
4049 #define ROW_LENGTH 12
4050 #endif
4051 #ifdef SINGLE_PRECISION_COMPLEX
4052 #define ROW_LENGTH 24
4053 #endif
4054 #endif /* VEC_SET == AVX_512 */
4055
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)4056 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
4057 #ifdef BLOCK1
4058 )
4059 #endif
4060 #ifdef BLOCK2
4061 ,int ldh, DATA_TYPE s)
4062 #endif
4063 {
4064 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
4065 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
4066 #ifdef BLOCK2
4067 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
4068 #endif
4069
4070 __SIMD_DATATYPE x1, x2, x3;
4071 __SIMD_DATATYPE q1, q2, q3;
4072 #ifdef BLOCK2
4073 __SIMD_DATATYPE y1, y2, y3;
4074 __SIMD_DATATYPE h2_real, h2_imag;
4075 #endif
4076 __SIMD_DATATYPE h1_real, h1_imag;
4077 __SIMD_DATATYPE tmp1, tmp2, tmp3;
4078 int i=0;
4079
4080 #if VEC_SET == SSE_128
4081 #ifdef DOUBLE_PRECISION_COMPLEX
4082 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
4083 #endif
4084 #ifdef SINGLE_PRECISION_COMPLEX
4085 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
4086 #endif
4087 #endif /* VEC_SET == SSE_128 */
4088
4089 #if VEC_SET == AVX_256
4090 #ifdef DOUBLE_PRECISION_COMPLEX
4091 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
4092 #endif
4093 #ifdef SINGLE_PRECISION_COMPLEX
4094 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
4095 #endif
4096 #endif /* VEC_SET == AVX_256 */
4097
4098 #if VEC_SET == AVX_512
4099 #ifdef DOUBLE_PRECISION_COMPLEX
4100 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
4101 #endif
4102 #ifdef SINGLE_PRECISION_COMPLEX
4103 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
4104 #endif
4105 #endif /* VEC_SET == AVX_512 */
4106
4107 #ifdef BLOCK2
4108 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
4109 x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
4110 x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
4111
4112 #if VEC_SET == SSE_128
4113 #ifdef DOUBLE_PRECISION_COMPLEX
4114 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4115 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4116 #endif
4117 #ifdef SINGLE_PRECISION_COMPLEX
4118 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4119 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4120 #endif
4121 #endif /* VEC_SET == SSE_128 */
4122
4123 #if VEC_SET == AVX_256
4124 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4125 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4126 #endif /* VEC_SET == AVX_256 */
4127
4128 #if VEC_SET == AVX_512
4129 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4130 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4131 #endif /* VEC_SET == AVX_512 */
4132
4133 #ifndef __ELPA_USE_FMA__
4134 // conjugate
4135 h2_imag = _SIMD_XOR(h2_imag, sign);
4136 #endif
4137
4138 y1 = _SIMD_LOAD(&q_dbl[0]);
4139 y2 = _SIMD_LOAD(&q_dbl[offset]);
4140 y3 = _SIMD_LOAD(&q_dbl[2*offset]);
4141
4142 tmp1 = _SIMD_MUL(h2_imag, x1);
4143 #ifdef __ELPA_USE_FMA__
4144 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4145 #else
4146 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4147 #endif
4148
4149 tmp2 = _SIMD_MUL(h2_imag, x2);
4150 #ifdef __ELPA_USE_FMA__
4151 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4152 #else
4153 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4154 #endif
4155
4156 tmp3 = _SIMD_MUL(h2_imag, x3);
4157 #ifdef __ELPA_USE_FMA__
4158 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4159 #else
4160 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4161 #endif
4162
4163 #endif /* BLOCK2 */
4164
4165 #ifdef BLOCK1
4166 x1 = _SIMD_LOAD(&q_dbl[0]);
4167 x2 = _SIMD_LOAD(&q_dbl[offset]);
4168 x3 = _SIMD_LOAD(&q_dbl[2*offset]);
4169 #endif
4170
4171 for (i = BLOCK; i < nb; i++)
4172 {
4173 #if VEC_SET == SSE_128
4174 #ifdef DOUBLE_PRECISION_COMPLEX
4175 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4176 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4177 #endif
4178 #ifdef SINGLE_PRECISION_COMPLEX
4179 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4180 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4181 #endif
4182 #endif /* VEC_SET == SSE_128 */
4183
4184 #if VEC_SET == AVX_256
4185 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4186 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4187 #endif /* VEC_SET == AVX_256 */
4188
4189 #if VEC_SET == AVX_512
4190 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4191 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4192 #endif /* VEC_SET == AVX_512 */
4193
4194 #ifndef __ELPA_USE_FMA__
4195 // conjugate
4196 h1_imag = _SIMD_XOR(h1_imag, sign);
4197 #endif
4198
4199 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4200 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4201 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
4202
4203 tmp1 = _SIMD_MUL(h1_imag, q1);
4204
4205 #ifdef __ELPA_USE_FMA__
4206 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4207 #else
4208 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4209 #endif
4210
4211 tmp2 = _SIMD_MUL(h1_imag, q2);
4212 #ifdef __ELPA_USE_FMA__
4213 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4214 #else
4215 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4216 #endif
4217
4218 tmp3 = _SIMD_MUL(h1_imag, q3);
4219 #ifdef __ELPA_USE_FMA__
4220 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4221 #else
4222 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4223 #endif
4224
4225 #ifdef BLOCK2
4226
4227 #if VEC_SET == SSE_128
4228 #ifdef DOUBLE_PRECISION_COMPLEX
4229 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4230 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4231 #endif
4232 #ifdef SINGLE_PRECISION_COMPLEX
4233 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4234 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4235 #endif
4236 #endif /* VEC_SET == SSE_128 */
4237
4238 #if VEC_SET == AVX_256
4239 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4240 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4241 #endif /* VEC_SET == AVX_256 */
4242
4243 #if VEC_SET == AVX_512
4244 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4245 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
4246 #endif /* VEC_SET == AVX_512 */
4247
4248 #ifndef __ELPA_USE_FMA__
4249 // conjugate
4250 h2_imag = _SIMD_XOR(h2_imag, sign);
4251 #endif
4252
4253 tmp1 = _SIMD_MUL(h2_imag, q1);
4254 #ifdef __ELPA_USE_FMA__
4255 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4256 #else
4257 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4258 #endif
4259 tmp2 = _SIMD_MUL(h2_imag, q2);
4260 #ifdef __ELPA_USE_FMA__
4261 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4262 #else
4263 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4264 #endif
4265
4266 tmp3 = _SIMD_MUL(h2_imag, q3);
4267 #ifdef __ELPA_USE_FMA__
4268 y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4269 #else
4270 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4271 #endif
4272 #endif /* BLOCK2 */
4273 }
4274
4275 #ifdef BLOCK2
4276
4277 #if VEC_SET == SSE_128
4278 #ifdef DOUBLE_PRECISION_COMPLEX
4279 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
4280 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
4281 #endif
4282 #ifdef SINGLE_PRECISION_COMPLEX
4283 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
4284 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
4285 #endif
4286 #endif /* VEC_SET == SSE_128 */
4287
4288 #if VEC_SET == AVX_256
4289 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
4290 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
4291 #endif /* VEC_SET == AVX_256 */
4292
4293 #if VEC_SET == AVX_512
4294 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
4295 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
4296 #endif /* VEC_SET == AVX_512 */
4297
4298 #ifndef __ELPA_USE_FMA__
4299 // conjugate
4300 h1_imag = _SIMD_XOR(h1_imag, sign);
4301 #endif
4302
4303 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
4304 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
4305 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
4306
4307 tmp1 = _SIMD_MUL(h1_imag, q1);
4308 #ifdef __ELPA_USE_FMA__
4309 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4310 #else
4311 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4312 #endif
4313
4314 tmp2 = _SIMD_MUL(h1_imag, q2);
4315 #ifdef __ELPA_USE_FMA__
4316 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4317 #else
4318 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4319 #endif
4320
4321 tmp3 = _SIMD_MUL(h1_imag, q3);
4322 #ifdef __ELPA_USE_FMA__
4323 x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4324 #else
4325 x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4326 #endif
4327
4328 #endif /* BLOCK2 */
4329
4330 #if VEC_SET == SSE_128
4331 #ifdef DOUBLE_PRECISION_COMPLEX
4332 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
4333 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
4334 #endif
4335 #ifdef SINGLE_PRECISION_COMPLEX
4336 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
4337 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
4338 #endif
4339 #endif /* VEC_SET == SSE_128 */
4340
4341 #if VEC_SET == AVX_256
4342 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
4343 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
4344 #endif /* VEC_SET == AVX_256 */
4345
4346 #if VEC_SET == AVX_512
4347 h1_real = _SIMD_SET1(hh_dbl[0]);
4348 h1_imag = _SIMD_SET1(hh_dbl[1]);
4349
4350 #ifdef HAVE_AVX512_XEON_PHI
4351 #ifdef DOUBLE_PRECISION_COMPLEX
4352 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
4353 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
4354 #endif
4355 #ifdef SINGLE_PRECISION_COMPLEX
4356 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
4357 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
4358 #endif
4359 #endif /* HAVE_AVX512_XEON_PHI */
4360
4361 #ifdef HAVE_AVX512_XEON
4362 h1_real = _SIMD_XOR(h1_real, sign);
4363 h1_imag = _SIMD_XOR(h1_imag, sign);
4364 #endif
4365
4366 #endif /* VEC_SET == AVX_512 */
4367
4368 #if VEC_SET != AVX_512
4369 h1_real = _SIMD_XOR(h1_real, sign);
4370 h1_imag = _SIMD_XOR(h1_imag, sign);
4371 #endif /* VEC_SET != AVX_512 */
4372
4373 tmp1 = _SIMD_MUL(h1_imag, x1);
4374 #ifdef __ELPA_USE_FMA__
4375 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4376 #else
4377 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4378 #endif
4379
4380 tmp2 = _SIMD_MUL(h1_imag, x2);
4381 #ifdef __ELPA_USE_FMA__
4382 x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4383 #else
4384 x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4385 #endif
4386
4387 tmp3 = _SIMD_MUL(h1_imag, x3);
4388 #ifdef __ELPA_USE_FMA__
4389 x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4390 #else
4391 x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4392 #endif
4393
4394 #ifdef BLOCK2
4395
4396 #if VEC_SET == SSE_128
4397 #ifdef DOUBLE_PRECISION_COMPLEX
4398 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
4399 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
4400 #endif
4401 #ifdef SINGLE_PRECISION_COMPLEX
4402 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
4403 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
4404 #endif
4405
4406 #ifdef DOUBLE_PRECISION_COMPLEX
4407 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
4408 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
4409 #endif
4410 #ifdef SINGLE_PRECISION_COMPLEX
4411 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
4412 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
4413 #endif
4414 #endif /* VEC_SET == 128 */
4415
4416 #if VEC_SET == AVX_256
4417 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
4418 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
4419 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
4420 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
4421 #endif /* VEC_SET == AVX_256 */
4422
4423 #if VEC_SET == AVX_512
4424 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
4425 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
4426 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
4427 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
4428
4429 #ifdef HAVE_AVX512_XEON_PHI
4430
4431 #ifdef DOUBLE_PRECISION_COMPLEX
4432 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
4433 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
4434 #endif
4435 #ifdef SINGLE_PRECISION_COMPLEX
4436 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
4437 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
4438 #endif
4439
4440 #ifdef DOUBLE_PRECISION_COMPLEX
4441 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
4442 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
4443 #endif
4444 #ifdef SINGLE_PRECISION_COMPLEX
4445 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
4446 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
4447 #endif
4448 #endif /* HAVE_AVX512_XEON_PHI */
4449
4450 #ifdef HAVE_AVX512_XEON
4451 h1_real = _SIMD_XOR(h1_real, sign);
4452 h1_imag = _SIMD_XOR(h1_imag, sign);
4453 h2_real = _SIMD_XOR(h2_real, sign);
4454 h2_imag = _SIMD_XOR(h2_imag, sign);
4455 #endif
4456 #endif /* VEC_SET == AVX_512 */
4457
4458 #if VEC_SET != AVX_512
4459 h1_real = _SIMD_XOR(h1_real, sign);
4460 h1_imag = _SIMD_XOR(h1_imag, sign);
4461 h2_real = _SIMD_XOR(h2_real, sign);
4462 h2_imag = _SIMD_XOR(h2_imag, sign);
4463 #endif /* VEC_SET != AVX_512 */
4464
4465 #if VEC_SET == SSE_128
4466 #ifdef SINGLE_PRECISION_COMPLEX
4467 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
4468 #else
4469 tmp2 = _SIMD_LOADU(s_dbl);
4470 #endif
4471 #endif /* VEC_SET == SSE_128 */
4472
4473 #if VEC_SET == AVX_256
4474 #ifdef DOUBLE_PRECISION_COMPLEX
4475 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
4476 #endif
4477 #ifdef SINGLE_PRECISION_COMPLEX
4478 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
4479 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
4480 #endif
4481 #endif /* VEC_SET == AVX_256 */
4482
4483 #if VEC_SET == AVX_512
4484 #ifdef DOUBLE_PRECISION_COMPLEX
4485 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
4486 s_dbl[1], s_dbl[0],
4487 s_dbl[1], s_dbl[0],
4488 s_dbl[1], s_dbl[0]);
4489 #endif
4490 #ifdef SINGLE_PRECISION_COMPLEX
4491 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
4492 #endif
4493 #endif /* VEC_SET == AVX_512 */
4494
4495
4496 tmp1 = _SIMD_MUL(h2_imag, tmp2);
4497 #ifdef __ELPA_USE_FMA__
4498 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4499 #else
4500 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4501 #endif
4502
4503 #if VEC_SET == AVX_512
4504 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
4505
4506 h2_real = _SIMD_SET1(s_dbl[0]);
4507 h2_imag = _SIMD_SET1(s_dbl[1]);
4508 #endif
4509
4510 #if VEC_SET == SSE_128
4511 #ifdef DOUBLE_PRECISION_COMPLEX
4512 h2_real = _mm_movedup_pd(tmp2);
4513 h2_imag = _mm_set1_pd(tmp2[1]);
4514 #endif
4515 #ifdef SINGLE_PRECISION_COMPLEX
4516 h2_real = _mm_moveldup_ps(tmp2);
4517 h2_imag = _mm_movehdup_ps(tmp2);
4518 #endif
4519 #endif /* VEC_SET == SSE_128 */
4520
4521 #if VEC_SET == AVX_256
4522 h2_real = _SIMD_SET1(tmp2[0]);
4523 h2_imag = _SIMD_SET1(tmp2[1]);
4524 #endif /* VEC_SET == AVX_256 */
4525
4526 tmp1 = _SIMD_MUL(h1_imag, y1);
4527 #ifdef __ELPA_USE_FMA__
4528 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4529 #else
4530 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4531 #endif
4532
4533 tmp2 = _SIMD_MUL(h1_imag, y2);
4534 #ifdef __ELPA_USE_FMA__
4535 y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4536 #else
4537 y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4538 #endif
4539
4540 tmp3 = _SIMD_MUL(h1_imag, y3);
4541 #ifdef __ELPA_USE_FMA__
4542 y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4543 #else
4544 y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4545 #endif
4546
4547 tmp1 = _SIMD_MUL(h2_imag, x1);
4548 #ifdef __ELPA_USE_FMA__
4549 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4550 #else
4551 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4552 #endif
4553
4554 tmp2 = _SIMD_MUL(h2_imag, x2);
4555 #ifdef __ELPA_USE_FMA__
4556 y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4557 #else
4558 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4559 #endif
4560
4561 tmp3 = _SIMD_MUL(h2_imag, x3);
4562 #ifdef __ELPA_USE_FMA__
4563 y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4564 #else
4565 y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4566 #endif
4567
4568 #endif /* BLOCK2 */
4569
4570 q1 = _SIMD_LOAD(&q_dbl[0]);
4571 q2 = _SIMD_LOAD(&q_dbl[offset]);
4572 q3 = _SIMD_LOAD(&q_dbl[2*offset]);
4573
4574 #ifdef BLOCK1
4575 q1 = _SIMD_ADD(q1, x1);
4576 q2 = _SIMD_ADD(q2, x2);
4577 q3 = _SIMD_ADD(q3, x3);
4578 #endif
4579
4580 #ifdef BLOCK2
4581 q1 = _SIMD_ADD(q1, y1);
4582 q2 = _SIMD_ADD(q2, y2);
4583 q3 = _SIMD_ADD(q3, y3);
4584 #endif
4585
4586 _SIMD_STORE(&q_dbl[0], q1);
4587 _SIMD_STORE(&q_dbl[offset], q2);
4588 _SIMD_STORE(&q_dbl[2*offset], q3);
4589
4590 #ifdef BLOCK2
4591
4592 #if VEC_SET == SSE_128
4593 #ifdef DOUBLE_PRECISION_COMPLEX
4594 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4595 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4596 #endif
4597 #ifdef SINGLE_PRECISION_COMPLEX
4598 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4599 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4600 #endif
4601 #endif /* VEC_SET == SSE_128 */
4602
4603 #if VEC_SET == AVX_256
4604 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4605 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4606 #endif /* VEC_SET == AVX_256 */
4607
4608 #if VEC_SET == AVX_512
4609 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4610 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4611 #endif /* VEC_SET == AVX_512 */
4612
4613 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
4614 q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
4615 q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
4616
4617 q1 = _SIMD_ADD(q1, x1);
4618 q2 = _SIMD_ADD(q2, x2);
4619 q3 = _SIMD_ADD(q3, x3);
4620
4621 tmp1 = _SIMD_MUL(h2_imag, y1);
4622
4623 #ifdef __ELPA_USE_FMA__
4624 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4625 #else
4626 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4627 #endif
4628
4629 tmp2 = _SIMD_MUL(h2_imag, y2);
4630 #ifdef __ELPA_USE_FMA__
4631 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4632 #else
4633 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4634 #endif
4635
4636 tmp3 = _SIMD_MUL(h2_imag, y3);
4637 #ifdef __ELPA_USE_FMA__
4638 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4639 #else
4640 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4641 #endif
4642
4643 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
4644 _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
4645 _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
4646
4647 #endif /* BLOCK2 */
4648
4649 for (i = BLOCK; i < nb; i++)
4650 {
4651
4652 #if VEC_SET == SSE_128
4653 #ifdef DOUBLE_PRECISION_COMPLEX
4654 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4655 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4656 #endif
4657 #ifdef SINGLE_PRECISION_COMPLEX
4658 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4659 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4660 #endif
4661 #endif /* VEC_SET == SSE_128 */
4662
4663 #if VEC_SET == AVX_256
4664 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4665 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4666 #endif /* VEC_SET == AVX_256 */
4667
4668 #if VEC_SET == AVX_512
4669 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4670 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4671 #endif /* VEC_SET == AVX_512 */
4672
4673 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4674 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4675 q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
4676
4677 tmp1 = _SIMD_MUL(h1_imag, x1);
4678 #ifdef __ELPA_USE_FMA__
4679 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4680 #else
4681 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4682 #endif
4683 tmp2 = _SIMD_MUL(h1_imag, x2);
4684 #ifdef __ELPA_USE_FMA__
4685 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4686 #else
4687 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4688 #endif
4689
4690 tmp3 = _SIMD_MUL(h1_imag, x3);
4691 #ifdef __ELPA_USE_FMA__
4692 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4693 #else
4694 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4695 #endif
4696
4697 #ifdef BLOCK2
4698
4699 #if VEC_SET == SSE_128
4700 #ifdef DOUBLE_PRECISION_COMPLEX
4701 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4702 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4703 #endif
4704 #ifdef SINGLE_PRECISION_COMPLEX
4705 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4706 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4707 #endif
4708 #endif /* VEC_SET == SSE_128 */
4709
4710 #if VEC_SET == AVX_256
4711 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4712 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4713 #endif /* VEC_SET == AVX_256 */
4714
4715 #if VEC_SET == AVX_512
4716 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4717 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
4718 #endif /* VEC_SET == AVX_512 */
4719
4720 tmp1 = _SIMD_MUL(h2_imag, y1);
4721 #ifdef __ELPA_USE_FMA__
4722 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4723 #else
4724 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4725 #endif
4726 tmp2 = _SIMD_MUL(h2_imag, y2);
4727 #ifdef __ELPA_USE_FMA__
4728 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4729 #else
4730 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4731 #endif
4732
4733 tmp3 = _SIMD_MUL(h2_imag, y3);
4734 #ifdef __ELPA_USE_FMA__
4735 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4736 #else
4737 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4738 #endif
4739
4740 #endif /* BLOCK2 */
4741
4742 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
4743 _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
4744 _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
4745
4746 }
4747 #ifdef BLOCK2
4748
4749 #if VEC_SET == SSE_128
4750 #ifdef DOUBLE_PRECISION_COMPLEX
4751 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
4752 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
4753 #endif
4754 #ifdef SINGLE_PRECISION_COMPLEX
4755 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
4756 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
4757 #endif
4758 #endif /* VEC_SET == SSE_128 */
4759
4760 #if VEC_SET == AVX_256
4761 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
4762 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
4763 #endif /* VEC_SET == AVX_256 */
4764
4765 #if VEC_SET == AVX_512
4766 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
4767 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
4768 #endif /* VEC_SET == AVX_512 */
4769
4770 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
4771 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
4772 q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
4773
4774 tmp1 = _SIMD_MUL(h1_imag, x1);
4775 #ifdef __ELPA_USE_FMA__
4776 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4777 #else
4778 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4779 #endif
4780 tmp2 = _SIMD_MUL(h1_imag, x2);
4781 #ifdef __ELPA_USE_FMA__
4782 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4783 #else
4784 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4785 #endif
4786
4787 tmp3 = _SIMD_MUL(h1_imag, x3);
4788 #ifdef __ELPA_USE_FMA__
4789 q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4790 #else
4791 q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4792 #endif
4793
4794 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
4795 _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
4796 _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
4797
4798 #endif /* BLOCK2 */
4799 }
4800
4801
4802 #if VEC_SET == SSE_128
4803 #ifdef DOUBLE_PRECISION_COMPLEX
4804 #define ROW_LENGTH 2
4805 #endif
4806 #ifdef SINGLE_PRECISION_COMPLEX
4807 #define ROW_LENGTH 4
4808 #endif
4809 #endif /* VEC_SET == SSE_128 */
4810
4811 #if VEC_SET == AVX_256
4812 #ifdef DOUBLE_PRECISION_COMPLEX
4813 #define ROW_LENGTH 4
4814 #endif
4815 #ifdef SINGLE_PRECISION_COMPLEX
4816 #define ROW_LENGTH 8
4817 #endif
4818 #endif /* VEC_SET == AVX_256 */
4819
4820 #if VEC_SET == AVX_512
4821 #ifdef DOUBLE_PRECISION_COMPLEX
4822 #define ROW_LENGTH 8
4823 #endif
4824 #ifdef SINGLE_PRECISION_COMPLEX
4825 #define ROW_LENGTH 16
4826 #endif
4827 #endif /* VEC_SET == AVX_512 */
4828
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)4829 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
4830 #ifdef BLOCK1
4831 )
4832 #endif
4833 #ifdef BLOCK2
4834 ,int ldh, DATA_TYPE s)
4835 #endif
4836 {
4837
4838 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
4839 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
4840 #ifdef BLOCK2
4841 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
4842 #endif
4843
4844 __SIMD_DATATYPE x1, x2;
4845 __SIMD_DATATYPE q1, q2;
4846 #ifdef BLOCK2
4847 __SIMD_DATATYPE y1, y2;
4848 __SIMD_DATATYPE h2_real, h2_imag;
4849 #endif
4850 __SIMD_DATATYPE h1_real, h1_imag;
4851 __SIMD_DATATYPE tmp1, tmp2;
4852 int i=0;
4853
4854 #if VEC_SET == SSE_128
4855 #ifdef DOUBLE_PRECISION_COMPLEX
4856 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
4857 #endif
4858 #ifdef SINGLE_PRECISION_COMPLEX
4859 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
4860 #endif
4861 #endif /* VEC_SET == SSE_128 */
4862
4863 #if VEC_SET == AVX_256
4864 #ifdef DOUBLE_PRECISION_COMPLEX
4865 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
4866 #endif
4867 #ifdef SINGLE_PRECISION_COMPLEX
4868 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
4869 #endif
4870 #endif /* VEC_SET == AVX_256 */
4871
4872 #if VEC_SET == AVX_512
4873 #ifdef DOUBLE_PRECISION_COMPLEX
4874 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
4875 #endif
4876 #ifdef SINGLE_PRECISION_COMPLEX
4877 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
4878 #endif
4879 #endif /* VEC_SET == AVX_512 */
4880
4881 #ifdef BLOCK2
4882 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
4883 x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
4884
4885 #if VEC_SET == SSE_128
4886 #ifdef DOUBLE_PRECISION_COMPLEX
4887 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4888 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4889 #endif
4890 #ifdef SINGLE_PRECISION_COMPLEX
4891 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4892 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4893 #endif
4894 #endif /* VEC_SET == SSE_128 */
4895
4896 #if VEC_SET == AVX_256
4897 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4898 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4899 #endif /* VEC_SET == AVX_256 */
4900
4901 #if VEC_SET == AVX_512
4902 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4903 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4904 #endif
4905
4906 #ifndef __ELPA_USE_FMA__
4907 // conjugate
4908 h2_imag = _SIMD_XOR(h2_imag, sign);
4909 #endif
4910
4911 y1 = _SIMD_LOAD(&q_dbl[0]);
4912 y2 = _SIMD_LOAD(&q_dbl[offset]);
4913
4914 tmp1 = _SIMD_MUL(h2_imag, x1);
4915 #ifdef __ELPA_USE_FMA__
4916 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4917 #else
4918 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4919 #endif
4920 tmp2 = _SIMD_MUL(h2_imag, x2);
4921 #ifdef __ELPA_USE_FMA__
4922 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4923 #else
4924 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4925 #endif
4926
4927 #endif /* BLOCK2 */
4928
4929 #ifdef BLOCK1
4930 x1 = _SIMD_LOAD(&q_dbl[0]);
4931 x2 = _SIMD_LOAD(&q_dbl[offset]);
4932 #endif
4933
4934 for (i = BLOCK; i < nb; i++)
4935 {
4936 #if VEC_SET == SSE_128
4937 #ifdef DOUBLE_PRECISION_COMPLEX
4938 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4939 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4940 #endif
4941 #ifdef SINGLE_PRECISION_COMPLEX
4942 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4943 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4944 #endif
4945 #endif /* VEC_SET == SSE_128 */
4946
4947 #if VEC_SET == AVX_256
4948 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4949 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4950 #endif /* VEC_SET == AVX_256 */
4951
4952 #if VEC_SET == AVX_512
4953 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4954 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4955 #endif /* VEC_SET == AVX_512 */
4956
4957 #ifndef __ELPA_USE_FMA__
4958 // conjugate
4959 h1_imag = _SIMD_XOR(h1_imag, sign);
4960 #endif
4961
4962 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4963 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4964 tmp1 = _SIMD_MUL(h1_imag, q1);
4965
4966 #ifdef __ELPA_USE_FMA__
4967 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4968 #else
4969 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4970 #endif
4971
4972 tmp2 = _SIMD_MUL(h1_imag, q2);
4973 #ifdef __ELPA_USE_FMA__
4974 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4975 #else
4976 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4977 #endif
4978
4979 #ifdef BLOCK2
4980
4981 #if VEC_SET == SSE_128
4982 #ifdef DOUBLE_PRECISION_COMPLEX
4983 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4984 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4985 #endif
4986 #ifdef SINGLE_PRECISION_COMPLEX
4987 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4988 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4989 #endif
4990 #endif /* VEC_SET == SSE_128 */
4991
4992 #if VEC_SET == AVX_256
4993 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4994 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4995 #endif /* VEC_SET == AVX_256 */
4996
4997 #if VEC_SET == AVX_512
4998 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4999 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5000 #endif /* VEC_SET == AVX_512 */
5001
5002 #ifndef __ELPA_USE_FMA__
5003 // conjugate
5004 h2_imag = _SIMD_XOR(h2_imag, sign);
5005 #endif
5006
5007 tmp1 = _SIMD_MUL(h2_imag, q1);
5008 #ifdef __ELPA_USE_FMA__
5009 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5010 #else
5011 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5012 #endif
5013 tmp2 = _SIMD_MUL(h2_imag, q2);
5014 #ifdef __ELPA_USE_FMA__
5015 y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5016 #else
5017 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5018 #endif
5019
5020 #endif /* BLOCK2 */
5021
5022 }
5023
5024 #ifdef BLOCK2
5025
5026 #if VEC_SET == SSE_128
5027 #ifdef DOUBLE_PRECISION_COMPLEX
5028 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5029 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5030 #endif
5031 #ifdef SINGLE_PRECISION_COMPLEX
5032 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5033 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5034 #endif
5035 #endif /* VEC_SET == SSE_128 */
5036
5037 #if VEC_SET == AVX_256
5038 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5039 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5040 #endif /* VEC_SET == AVX_256 */
5041
5042 #if VEC_SET == AVX_512
5043 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5044 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5045 #endif /* VEC_SET == AVX_512 */
5046
5047 #ifndef __ELPA_USE_FMA__
5048 // conjugate
5049 h1_imag = _SIMD_XOR(h1_imag, sign);
5050 #endif
5051
5052 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5053 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
5054
5055 tmp1 = _SIMD_MUL(h1_imag, q1);
5056 #ifdef __ELPA_USE_FMA__
5057 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5058 #else
5059 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5060 #endif
5061 tmp2 = _SIMD_MUL(h1_imag, q2);
5062 #ifdef __ELPA_USE_FMA__
5063 x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5064 #else
5065 x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5066 #endif
5067
5068 #endif /* BLOCK2 */
5069
5070 #if VEC_SET == SSE_128
5071 #ifdef DOUBLE_PRECISION_COMPLEX
5072 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
5073 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
5074 #endif
5075 #ifdef SINGLE_PRECISION_COMPLEX
5076 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
5077 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
5078 #endif
5079 #endif /* VEC_SET == SSE_128 */
5080
5081 #if VEC_SET == AVX_256
5082 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
5083 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
5084 #endif /* VEC_SET == AVX_256 */
5085
5086 #if VEC_SET == AVX_512
5087 h1_real = _SIMD_SET1(hh_dbl[0]);
5088 h1_imag = _SIMD_SET1(hh_dbl[1]);
5089
5090 #ifdef HAVE_AVX512_XEON_PHI
5091 #ifdef DOUBLE_PRECISION_COMPLEX
5092 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5093 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5094 #endif
5095 #ifdef SINGLE_PRECISION_COMPLEX
5096 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5097 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5098 #endif
5099 #endif
5100 #ifdef HAVE_AVX512_XEON
5101 h1_real = _SIMD_XOR(h1_real, sign);
5102 h1_imag = _SIMD_XOR(h1_imag, sign);
5103 #endif
5104
5105 #endif /* VEC_SET == AVX_512 */
5106
5107 #if VEC_SET != AVX_512
5108 h1_real = _SIMD_XOR(h1_real, sign);
5109 h1_imag = _SIMD_XOR(h1_imag, sign);
5110 #endif /* VEC_SET != AVX_512 */
5111
5112 tmp1 = _SIMD_MUL(h1_imag, x1);
5113 #ifdef __ELPA_USE_FMA__
5114 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5115 #else
5116 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5117 #endif
5118
5119 tmp2 = _SIMD_MUL(h1_imag, x2);
5120 #ifdef __ELPA_USE_FMA__
5121 x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5122 #else
5123 x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5124 #endif
5125
5126 #ifdef BLOCK2
5127
5128 #if VEC_SET == SSE_128
5129 #ifdef DOUBLE_PRECISION_COMPLEX
5130 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5131 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5132 #endif
5133 #ifdef SINGLE_PRECISION_COMPLEX
5134 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5135 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5136 #endif
5137
5138 #ifdef DOUBLE_PRECISION_COMPLEX
5139 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5140 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5141 #endif
5142 #ifdef SINGLE_PRECISION_COMPLEX
5143 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5144 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5145 #endif
5146 #endif /* VEC_SET == 128 */
5147
5148 #if VEC_SET == AVX_256
5149 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5150 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5151 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5152 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5153 #endif /* VEC_SET == AVX_256 */
5154
5155 #if VEC_SET == AVX_512
5156 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
5157 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5158 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
5159 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5160
5161 #ifdef HAVE_AVX512_XEON_PHI
5162 #ifdef DOUBLE_PRECISION_COMPLEX
5163 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
5164 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
5165 #endif
5166 #ifdef SINGLE_PRECISION_COMPLEX
5167 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
5168 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
5169 #endif
5170
5171 #ifdef DOUBLE_PRECISION_COMPLEX
5172 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
5173 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
5174 #endif
5175 #ifdef SINGLE_PRECISION_COMPLEX
5176 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
5177 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
5178 #endif
5179 #endif
5180 #ifdef HAVE_AVX512_XEON
5181 h1_real = _SIMD_XOR(h1_real, sign);
5182 h1_imag = _SIMD_XOR(h1_imag, sign);
5183 h2_real = _SIMD_XOR(h2_real, sign);
5184 h2_imag = _SIMD_XOR(h2_imag, sign);
5185 #endif
5186
5187 #endif /* VEC_SET == AVX_512 */
5188
5189 #if VEC_SET != AVX_512
5190 h1_real = _SIMD_XOR(h1_real, sign);
5191 h1_imag = _SIMD_XOR(h1_imag, sign);
5192 h2_real = _SIMD_XOR(h2_real, sign);
5193 h2_imag = _SIMD_XOR(h2_imag, sign);
5194 #endif /* VEC_SET != AVX_512 */
5195
5196 #if VEC_SET == SSE_128
5197 #ifdef SINGLE_PRECISION_COMPLEX
5198 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
5199 #else
5200 tmp2 = _SIMD_LOADU(s_dbl);
5201 #endif
5202 #endif /* VEC_SET == SSE_128 */
5203
5204 #if VEC_SET == AVX_256
5205 #ifdef DOUBLE_PRECISION_COMPLEX
5206 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5207 #endif
5208 #ifdef SINGLE_PRECISION_COMPLEX
5209 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
5210 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5211 #endif
5212 #endif /* VEC_SET == AVX_256 */
5213
5214 #if VEC_SET == AVX_512
5215 #ifdef DOUBLE_PRECISION_COMPLEX
5216 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
5217 s_dbl[1], s_dbl[0],
5218 s_dbl[1], s_dbl[0],
5219 s_dbl[1], s_dbl[0]);
5220 #endif
5221 #ifdef SINGLE_PRECISION_COMPLEX
5222 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
5223 #endif
5224
5225 #endif /* VEC_SET == AVX_512 */
5226
5227 tmp1 = _SIMD_MUL(h2_imag, tmp2);
5228 #ifdef __ELPA_USE_FMA__
5229 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5230 #else
5231 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5232 #endif
5233
5234 #if VEC_SET == AVX_512
5235 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
5236
5237 h2_real = _SIMD_SET1(s_dbl[0]);
5238 h2_imag = _SIMD_SET1(s_dbl[1]);
5239 #endif
5240
5241 #if VEC_SET == SSE_128
5242 #ifdef DOUBLE_PRECISION_COMPLEX
5243 h2_real = _mm_movedup_pd(tmp2);
5244 h2_imag = _mm_set1_pd(tmp2[1]);
5245 #endif
5246 #ifdef SINGLE_PRECISION_COMPLEX
5247 h2_real = _mm_moveldup_ps(tmp2);
5248 h2_imag = _mm_movehdup_ps(tmp2);
5249 #endif
5250 #endif /* VEC_SET == SSE_128 */
5251
5252 #if VEC_SET == AVX_256
5253 h2_real = _SIMD_SET1(tmp2[0]);
5254 h2_imag = _SIMD_SET1(tmp2[1]);
5255 #endif /* VEC_SET == AVX_256 */
5256
5257 tmp1 = _SIMD_MUL(h1_imag, y1);
5258 #ifdef __ELPA_USE_FMA__
5259 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5260 #else
5261 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5262 #endif
5263 tmp2 = _SIMD_MUL(h1_imag, y2);
5264 #ifdef __ELPA_USE_FMA__
5265 y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5266 #else
5267 y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5268 #endif
5269
5270 tmp1 = _SIMD_MUL(h2_imag, x1);
5271 #ifdef __ELPA_USE_FMA__
5272 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5273 #else
5274 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5275 #endif
5276 tmp2 = _SIMD_MUL(h2_imag, x2);
5277 #ifdef __ELPA_USE_FMA__
5278 y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5279 #else
5280 y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5281 #endif
5282
5283 #endif /* BLOCK2 */
5284
5285 q1 = _SIMD_LOAD(&q_dbl[0]);
5286 q2 = _SIMD_LOAD(&q_dbl[offset]);
5287
5288 #ifdef BLOCK1
5289 q1 = _SIMD_ADD(q1, x1);
5290 q2 = _SIMD_ADD(q2, x2);
5291 #endif
5292
5293 #ifdef BLOCK2
5294 q1 = _SIMD_ADD(q1, y1);
5295 q2 = _SIMD_ADD(q2, y2);
5296 #endif
5297 _SIMD_STORE(&q_dbl[0], q1);
5298 _SIMD_STORE(&q_dbl[offset], q2);
5299
5300 #ifdef BLOCK2
5301
5302 #if VEC_SET == SSE_128
5303 #ifdef DOUBLE_PRECISION_COMPLEX
5304 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5305 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5306 #endif
5307 #ifdef SINGLE_PRECISION_COMPLEX
5308 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5309 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5310 #endif
5311 #endif /* VEC_SET == SSE_128 */
5312
5313 #if VEC_SET == AVX_256
5314 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5315 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5316 #endif /* VEC_SET == AVX_256 */
5317
5318 #if VEC_SET == AVX_512
5319 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5320 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5321 #endif /* VEC_SET == AVX_512 */
5322
5323 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
5324 q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
5325
5326 q1 = _SIMD_ADD(q1, x1);
5327 q2 = _SIMD_ADD(q2, x2);
5328
5329 tmp1 = _SIMD_MUL(h2_imag, y1);
5330
5331 #ifdef __ELPA_USE_FMA__
5332 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5333 #else
5334 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5335 #endif
5336 tmp2 = _SIMD_MUL(h2_imag, y2);
5337 #ifdef __ELPA_USE_FMA__
5338 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5339 #else
5340 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5341 #endif
5342
5343 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
5344 _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
5345
5346 #endif /* BLOCK2 */
5347
5348 for (i = BLOCK; i < nb; i++)
5349 {
5350 #if VEC_SET == SSE_128
5351 #ifdef DOUBLE_PRECISION_COMPLEX
5352 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5353 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5354 #endif
5355 #ifdef SINGLE_PRECISION_COMPLEX
5356 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5357 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5358 #endif
5359 #endif /* VEC_SET == SSE_128 */
5360
5361 #if VEC_SET == AVX_256
5362 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5363 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5364 #endif /* VEC_SET == AVX_256 */
5365
5366 #if VEC_SET == AVX_512
5367 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5368 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5369 #endif /* VEC_SET == AVX_512 */
5370
5371 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5372 q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
5373 tmp1 = _SIMD_MUL(h1_imag, x1);
5374
5375 #ifdef __ELPA_USE_FMA__
5376 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5377 #else
5378 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5379 #endif
5380
5381 tmp2 = _SIMD_MUL(h1_imag, x2);
5382 #ifdef __ELPA_USE_FMA__
5383 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5384 #else
5385 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5386 #endif
5387
5388
5389 #ifdef BLOCK2
5390
5391 #if VEC_SET == SSE_128
5392 #ifdef DOUBLE_PRECISION_COMPLEX
5393 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5394 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5395 #endif
5396 #ifdef SINGLE_PRECISION_COMPLEX
5397 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
5398 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
5399 #endif
5400 #endif /* VEC_SET == SSE_128 */
5401
5402 #if VEC_SET == AVX_256
5403 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
5404 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
5405 #endif /* VEC_SET == AVX_256 */
5406
5407 #if VEC_SET == AVX_512
5408 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
5409 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5410 #endif /* VEC_SET == AVX_512 */
5411
5412 tmp1 = _SIMD_MUL(h2_imag, y1);
5413 #ifdef __ELPA_USE_FMA__
5414 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5415 #else
5416 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5417 #endif
5418 tmp2 = _SIMD_MUL(h2_imag, y2);
5419 #ifdef __ELPA_USE_FMA__
5420 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5421 #else
5422 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5423 #endif
5424
5425 #endif /* BLOCK2 */
5426
5427 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
5428 _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
5429 }
5430 #ifdef BLOCK2
5431
5432 #if VEC_SET == SSE_128
5433 #ifdef DOUBLE_PRECISION_COMPLEX
5434 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5435 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5436 #endif
5437 #ifdef SINGLE_PRECISION_COMPLEX
5438 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5439 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5440 #endif
5441 #endif /* VEC_SET == SSE_128 */
5442
5443 #if VEC_SET == AVX_256
5444 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5445 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5446 #endif /* VEC_SET == AVX_256 */
5447
5448 #if VEC_SET == AVX_512
5449 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5450 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5451 #endif /* VEC_SET == AVX_512 */
5452
5453 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5454 q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
5455
5456 tmp1 = _SIMD_MUL(h1_imag, x1);
5457 #ifdef __ELPA_USE_FMA__
5458 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5459 #else
5460 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5461 #endif
5462 tmp2 = _SIMD_MUL(h1_imag, x2);
5463 #ifdef __ELPA_USE_FMA__
5464 q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5465 #else
5466 q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5467 #endif
5468
5469 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
5470 _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
5471
5472 #endif /* BLOCK2 */
5473
5474 }
5475
5476 #if VEC_SET == SSE_128
5477 #ifdef DOUBLE_PRECISION_COMPLEX
5478 #define ROW_LENGTH 1
5479 #endif
5480 #ifdef SINGLE_PRECISION_COMPLEX
5481 #define ROW_LENGTH 2
5482 #endif
5483 #endif /* VEC_SET == SSE_128 */
5484
5485 #if VEC_SET == AVX_256
5486 #ifdef DOUBLE_PRECISION_COMPLEX
5487 #define ROW_LENGTH 2
5488 #endif
5489 #ifdef SINGLE_PRECISION_COMPLEX
5490 #define ROW_LENGTH 4
5491 #endif
5492 #endif /* VEC_SET == AVX_256 */
5493
5494 #if VEC_SET == AVX_512
5495 #ifdef DOUBLE_PRECISION_COMPLEX
5496 #define ROW_LENGTH 4
5497 #endif
5498 #ifdef SINGLE_PRECISION_COMPLEX
5499 #define ROW_LENGTH 8
5500 #endif
5501 #endif /* VEC_SET == AVX_512 */
5502
5503
CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)5504 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
5505 #ifdef BLOCK1
5506 )
5507 #endif
5508 #ifdef BLOCK2
5509 ,int ldh, DATA_TYPE s)
5510 #endif
5511 {
5512
5513 DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
5514 DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
5515 #ifdef BLOCK2
5516 DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
5517 #endif
5518
5519 __SIMD_DATATYPE x1;
5520 __SIMD_DATATYPE q1;
5521 #ifdef BLOCK2
5522 __SIMD_DATATYPE y1;
5523 __SIMD_DATATYPE h2_real, h2_imag;
5524 #endif
5525 __SIMD_DATATYPE h1_real, h1_imag;
5526 __SIMD_DATATYPE tmp1, tmp2;
5527 int i=0;
5528
5529 #if VEC_SET == SSE_128
5530 #ifdef DOUBLE_PRECISION_COMPLEX
5531 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
5532 #endif
5533 #ifdef SINGLE_PRECISION_COMPLEX
5534 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
5535 #endif
5536 #endif /* VEC_SET == SSE_128 */
5537
5538 #if VEC_SET == AVX_256
5539 #ifdef DOUBLE_PRECISION_COMPLEX
5540 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
5541 #endif
5542 #ifdef SINGLE_PRECISION_COMPLEX
5543 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
5544 #endif
5545 #endif /* VEC_SET == AVX_256 */
5546
5547 #if VEC_SET == AVX_512
5548 #ifdef DOUBLE_PRECISION_COMPLEX
5549 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
5550 #endif
5551 #ifdef SINGLE_PRECISION_COMPLEX
5552 __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
5553 #endif
5554 #endif /* VEC_SET == AVX_512 */
5555
5556 #ifdef BLOCK2
5557 x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
5558
5559 #if VEC_SET == SSE_128
5560 #ifdef DOUBLE_PRECISION_COMPLEX
5561 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5562 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5563 #endif
5564 #ifdef SINGLE_PRECISION_COMPLEX
5565 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5566 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5567 #endif
5568 #endif /* VEC_SET == SSE_128 */
5569
5570 #if VEC_SET == AVX_256
5571 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5572 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5573 #endif /* VEC_SET == AVX_256 */
5574
5575 #if VEC_SET == AVX_512
5576 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5577 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5578 #endif
5579
5580 #ifndef __ELPA_USE_FMA__
5581 // conjugate
5582 h2_imag = _SIMD_XOR(h2_imag, sign);
5583 #endif
5584
5585 y1 = _SIMD_LOAD(&q_dbl[0]);
5586
5587 tmp1 = _SIMD_MUL(h2_imag, x1);
5588 #ifdef __ELPA_USE_FMA__
5589 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5590 #else
5591 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5592 #endif
5593
5594 #endif /* BLOCK2 */
5595
5596 #ifdef BLOCK1
5597 x1 = _SIMD_LOAD(&q_dbl[0]);
5598 #endif
5599
5600 for (i = BLOCK; i < nb; i++)
5601 {
5602 #if VEC_SET == SSE_128
5603 #ifdef DOUBLE_PRECISION_COMPLEX
5604 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5605 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5606 #endif
5607 #ifdef SINGLE_PRECISION_COMPLEX
5608 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5609 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5610 #endif
5611 #endif /* VEC_SET == SSE_128 */
5612
5613 #if VEC_SET == AVX_256
5614 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5615 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5616 #endif /* VEC_SET == AVX_256 */
5617
5618 #if VEC_SET == AVX_512
5619 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5620 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5621 #endif /* VEC_SET == AVX_512 */
5622
5623 #ifndef __ELPA_USE_FMA__
5624 // conjugate
5625 h1_imag = _SIMD_XOR(h1_imag, sign);
5626 #endif
5627
5628 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5629
5630 tmp1 = _SIMD_MUL(h1_imag, q1);
5631 #ifdef __ELPA_USE_FMA__
5632 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5633 #else
5634 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5635 #endif
5636
5637 #ifdef BLOCK2
5638
5639 #if VEC_SET == SSE_128
5640 #ifdef DOUBLE_PRECISION_COMPLEX
5641 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5642 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5643 #endif
5644 #ifdef SINGLE_PRECISION_COMPLEX
5645 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
5646 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
5647 #endif
5648 #endif /* VEC_SET == SSE_128 */
5649
5650 #if VEC_SET == AVX_256
5651 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
5652 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
5653 #endif /* VEC_SET == AVX_256 */
5654
5655 #if VEC_SET == AVX_512
5656 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
5657 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5658 #endif /* AVX_512 */
5659
5660 #ifndef __ELPA_USE_FMA__
5661 // conjugate
5662 h2_imag = _SIMD_XOR(h2_imag, sign);
5663 #endif
5664
5665 tmp1 = _SIMD_MUL(h2_imag, q1);
5666 #ifdef __ELPA_USE_FMA__
5667 y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5668 #else
5669 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5670 #endif
5671
5672 #endif /* BLOCK2 */
5673
5674 }
5675
5676 #ifdef BLOCK2
5677
5678 #if VEC_SET == SSE_128
5679 #ifdef DOUBLE_PRECISION_COMPLEX
5680 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5681 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5682 #endif
5683 #ifdef SINGLE_PRECISION_COMPLEX
5684 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5685 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5686 #endif
5687 #endif /* VEC_SET == SSE_128 */
5688
5689 #if VEC_SET == AVX_256
5690 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5691 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5692 #endif /* VEC_SET == AVX_256 */
5693
5694 #if VEC_SET == AVX_512
5695 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5696 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5697 #endif /* VEC_SET == AVX_512 */
5698
5699 #ifndef __ELPA_USE_FMA__
5700 // conjugate
5701 h1_imag = _SIMD_XOR(h1_imag, sign);
5702 #endif
5703
5704 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5705
5706 tmp1 = _SIMD_MUL(h1_imag, q1);
5707 #ifdef __ELPA_USE_FMA__
5708 x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5709 #else
5710 x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5711 #endif
5712
5713 #endif /* BLOCK2 */
5714
5715 #if VEC_SET == SSE_128
5716 #ifdef DOUBLE_PRECISION_COMPLEX
5717 h1_real = _mm_loaddup_pd(&hh_dbl[0]);
5718 h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
5719 #endif
5720 #ifdef SINGLE_PRECISION_COMPLEX
5721 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
5722 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
5723 #endif
5724 #endif /* VEC_SET == SSE_128 */
5725
5726 #if VEC_SET == AVX_256
5727 h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
5728 h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
5729 #endif /* VEC_SET == AVX_256 */
5730
5731 #if VEC_SET == AVX_512
5732 h1_real = _SIMD_SET1(hh_dbl[0]);
5733 h1_imag = _SIMD_SET1(hh_dbl[1]);
5734
5735 #ifdef HAVE_AVX512_XEON_PHI
5736 #ifdef DOUBLE_PRECISION_COMPLEX
5737 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5738 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5739 #endif
5740 #ifdef SINGLE_PRECISION_COMPLEX
5741 h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5742 h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5743 #endif
5744 #endif
5745 #ifdef HAVE_AVX512_XEON
5746 h1_real = _SIMD_XOR(h1_real, sign);
5747 h1_imag = _SIMD_XOR(h1_imag, sign);
5748 #endif
5749
5750 #endif /* VEC_SET == AVX_512 */
5751
5752 #if VEC_SET != AVX_512
5753 h1_real = _SIMD_XOR(h1_real, sign);
5754 h1_imag = _SIMD_XOR(h1_imag, sign);
5755 #endif /* VEC_SET != AVX_512 */
5756
5757 tmp1 = _SIMD_MUL(h1_imag, x1);
5758 #ifdef __ELPA_USE_FMA__
5759 x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5760 #else
5761 x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5762 #endif
5763
5764 #ifdef BLOCK2
5765
5766 #if VEC_SET == SSE_128
5767 #ifdef DOUBLE_PRECISION_COMPLEX
5768 h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5769 h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5770 #endif
5771 #ifdef SINGLE_PRECISION_COMPLEX
5772 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5773 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5774 #endif
5775
5776 #ifdef DOUBLE_PRECISION_COMPLEX
5777 h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5778 h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5779 #endif
5780 #ifdef SINGLE_PRECISION_COMPLEX
5781 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5782 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5783 #endif
5784 #endif /* VEC_SET == 128 */
5785
5786 #if VEC_SET == AVX_256
5787 h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5788 h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5789 h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5790 h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5791 #endif /* VEC_SET == AVX_256 */
5792
5793 #if VEC_SET == AVX_512
5794 h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
5795 h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5796 h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
5797 h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5798
5799 #ifdef HAVE_AVX512_XEON_PHI
5800 #ifdef DOUBLE_PRECISION_COMPLEX
5801 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
5802 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
5803 #endif
5804 #ifdef SINGLE_PRECISION_COMPLEX
5805 h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
5806 h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
5807 #endif
5808
5809 #ifdef DOUBLE_PRECISION_COMPLEX
5810 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
5811 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
5812 #endif
5813 #ifdef SINGLE_PRECISION_COMPLEX
5814 h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
5815 h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
5816 #endif
5817 #endif
5818 #ifdef HAVE_AVX512_XEON
5819 h1_real = _SIMD_XOR(h1_real, sign);
5820 h1_imag = _SIMD_XOR(h1_imag, sign);
5821 h2_real = _SIMD_XOR(h2_real, sign);
5822 h2_imag = _SIMD_XOR(h2_imag, sign);
5823 #endif
5824
5825 #endif /* VEC_SET == AVX_512 */
5826
5827 #if VEC_SET != AVX_512
5828 h1_real = _SIMD_XOR(h1_real, sign);
5829 h1_imag = _SIMD_XOR(h1_imag, sign);
5830 h2_real = _SIMD_XOR(h2_real, sign);
5831 h2_imag = _SIMD_XOR(h2_imag, sign);
5832 #endif
5833
5834 #if VEC_SET == SSE_128
5835 #ifdef SINGLE_PRECISION_COMPLEX
5836 tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
5837 #else
5838 tmp2 = _SIMD_LOADU(s_dbl);
5839 #endif
5840 #endif /* VEC_SET == SSE_128 */
5841
5842 #if VEC_SET == AVX_256
5843 #ifdef DOUBLE_PRECISION_COMPLEX
5844 tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5845 #endif
5846 #ifdef SINGLE_PRECISION_COMPLEX
5847 tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
5848 s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5849 #endif
5850 #endif /* VEC_SET == AVX_256 */
5851
5852 #if VEC_SET == AVX_512
5853 #ifdef DOUBLE_PRECISION_COMPLEX
5854 tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
5855 s_dbl[1], s_dbl[0],
5856 s_dbl[1], s_dbl[0],
5857 s_dbl[1], s_dbl[0]);
5858 #endif
5859 #ifdef SINGLE_PRECISION_COMPLEX
5860 tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
5861 #endif
5862
5863 #endif /* VEC_SET == AVX_512 */
5864
5865 tmp1 = _SIMD_MUL(h2_imag, tmp2);
5866 #ifdef __ELPA_USE_FMA__
5867 tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5868 #else
5869 tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5870 #endif
5871
5872 #if VEC_SET == AVX_512
5873 _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
5874
5875 h2_real = _SIMD_SET1(s_dbl[0]);
5876 h2_imag = _SIMD_SET1(s_dbl[1]);
5877 #endif
5878
5879 #if VEC_SET == SSE_128
5880 #ifdef DOUBLE_PRECISION_COMPLEX
5881 h2_real = _mm_movedup_pd(tmp2);
5882 h2_imag = _mm_set1_pd(tmp2[1]);
5883 #endif
5884 #ifdef SINGLE_PRECISION_COMPLEX
5885 h2_real = _mm_moveldup_ps(tmp2);
5886 h2_imag = _mm_movehdup_ps(tmp2);
5887 #endif
5888 #endif /* VEC_SET == SSE_128 */
5889
5890 #if VEC_SET == AVX_256
5891 h2_real = _SIMD_SET1(tmp2[0]);
5892 h2_imag = _SIMD_SET1(tmp2[1]);
5893 #endif /* VEC_SET == AVX_256 */
5894
5895 tmp1 = _SIMD_MUL(h1_imag, y1);
5896 #ifdef __ELPA_USE_FMA__
5897 y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5898 #else
5899 y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5900 #endif
5901
5902 tmp1 = _SIMD_MUL(h2_imag, x1);
5903 #ifdef __ELPA_USE_FMA__
5904 y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5905 #else
5906 y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5907 #endif
5908
5909 #endif /* BLOCK2 */
5910
5911 q1 = _SIMD_LOAD(&q_dbl[0]);
5912
5913 #ifdef BLOCK1
5914 q1 = _SIMD_ADD(q1, x1);
5915 #endif
5916
5917 #ifdef BLOCK2
5918 q1 = _SIMD_ADD(q1, y1);
5919 #endif
5920 _SIMD_STORE(&q_dbl[0], q1);
5921
5922 #ifdef BLOCK2
5923
5924 #if VEC_SET == SSE_128
5925 #ifdef DOUBLE_PRECISION_COMPLEX
5926 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5927 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5928 #endif
5929 #ifdef SINGLE_PRECISION_COMPLEX
5930 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5931 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5932 #endif
5933 #endif /* VEC_SET == SSE_128 */
5934
5935 #if VEC_SET == AVX_256
5936 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5937 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5938 #endif /* VEC_SET == AVX_256 */
5939
5940 #if VEC_SET == AVX_512
5941 h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5942 h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5943 #endif /* VEC_SET == AVX_512 */
5944
5945 q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
5946
5947 q1 = _SIMD_ADD(q1, x1);
5948
5949 tmp1 = _SIMD_MUL(h2_imag, y1);
5950 #ifdef __ELPA_USE_FMA__
5951 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5952 #else
5953 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5954 #endif
5955
5956 _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
5957
5958 #endif /* BLOCK2 */
5959
5960 for (i = BLOCK; i < nb; i++)
5961 {
5962 #if VEC_SET == SSE_128
5963 #ifdef DOUBLE_PRECISION_COMPLEX
5964 h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5965 h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5966 #endif
5967 #ifdef SINGLE_PRECISION_COMPLEX
5968 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5969 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5970 #endif
5971 #endif /* VEC_SET == SSE_128 */
5972
5973 #if VEC_SET == AVX_256
5974 h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5975 h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5976 #endif /* VEC_SET == AVX_256 */
5977
5978 #if VEC_SET == AVX_512
5979 h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5980 h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5981 #endif /* VEC_SET == AVX_512 */
5982
5983 q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5984
5985 tmp1 = _SIMD_MUL(h1_imag, x1);
5986 #ifdef __ELPA_USE_FMA__
5987 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5988 #else
5989 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5990 #endif
5991
5992 #ifdef BLOCK2
5993
5994 #if VEC_SET == SSE_128
5995 #ifdef DOUBLE_PRECISION_COMPLEX
5996 h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5997 h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5998 #endif
5999 #ifdef SINGLE_PRECISION_COMPLEX
6000 h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
6001 h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
6002 #endif
6003 #endif /* VEC_SET == SSE_128 */
6004
6005 #if VEC_SET == AVX_256
6006 h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
6007 h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
6008 #endif /* VEC_SET == AVX_256 */
6009
6010 #if VEC_SET == AVX_512
6011 h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
6012 h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
6013 #endif /* VEC_SET == AVX_512 */
6014
6015 tmp1 = _SIMD_MUL(h2_imag, y1);
6016 #ifdef __ELPA_USE_FMA__
6017 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6018 #else
6019 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6020 #endif
6021 #endif /* BLOCK2 */
6022
6023 _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
6024 }
6025 #ifdef BLOCK2
6026
6027 #if VEC_SET == SSE_128
6028 #ifdef DOUBLE_PRECISION_COMPLEX
6029 h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
6030 h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
6031 #endif
6032 #ifdef SINGLE_PRECISION_COMPLEX
6033 h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
6034 h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
6035 #endif
6036 #endif /* VEC_SET == SSE_128 */
6037
6038 #if VEC_SET == AVX_256
6039 h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
6040 h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
6041 #endif /* VEC_SET == AVX_256 */
6042
6043 #if VEC_SET == AVX_512
6044 h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
6045 h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
6046 #endif /* VEC_SET == AVX_512 */
6047
6048 q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
6049
6050 tmp1 = _SIMD_MUL(h1_imag, x1);
6051 #ifdef __ELPA_USE_FMA__
6052 q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6053 #else
6054 q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6055 #endif
6056
6057 _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
6058
6059 #endif /* BLOCK2 */
6060
6061 }
6062