1 //    This file is part of ELPA.
2 //
3 //    The ELPA library was originally created by the ELPA consortium,
4 //    consisting of the following organizations:
5 //
6 //    - Max Planck Computing and Data Facility (MPCDF), formerly known as
7 //      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 //    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
9 //      Informatik,
10 //    - Technische Universität München, Lehrstuhl für Informatik mit
11 //      Schwerpunkt Wissenschaftliches Rechnen,
12 //    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13 //    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14 //      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
15 //      and
16 //    - IBM Deutschland GmbH
17 //
18 //
19 //    This particular source code file contains additions, changes and
20 //    enhancements authored by Intel Corporation, which is not part of
21 //    the ELPA consortium.
22 //
23 //    More information can be found here:
24 //    http://elpa.mpcdf.mpg.de/
25 //
26 //    ELPA is free software: you can redistribute it and/or modify
27 //    it under the terms of version 3 of the
28 //    GNU Lesser General Public License as published by the Free
29 //    Software Foundation.
30 //
31 //    ELPA is distributed in the hope that it will be useful,
32 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
33 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34 //    GNU Lesser General Public License for more details.
35 //
36 //    You should have received a copy of the GNU Lesser General Public License
37 //    along with ELPA. If not, see <http://www.gnu.org/licenses/>
38 //
39 //    ELPA reflects a substantial effort on the part of the original
40 //    ELPA consortium, and we ask you to respect the spirit of the
41 //    license that we chose: i.e., please contribute any changes you
42 //    may have back to the original ELPA library distribution, and keep
43 //    any derivatives of ELPA under the same license that we chose for
44 //    the original distribution, the GNU Lesser General Public License.
45 //
46 // Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
47 //
48 #include "config-f90.h"
49 
50 #define CONCAT_8ARGS(a, b, c, d, e, f, g, h) CONCAT2_8ARGS(a, b, c, d, e, f, g, h)
51 #define CONCAT2_8ARGS(a, b, c, d, e, f, g, h) a ## b ## c ## d ## e ## f ## g ## h
52 
53 #define CONCAT_7ARGS(a, b, c, d, e, f, g) CONCAT2_7ARGS(a, b, c, d, e, f, g)
54 #define CONCAT2_7ARGS(a, b, c, d, e, f, g) a ## b ## c ## d ## e ## f ## g
55 
56 #define CONCAT_6ARGS(a, b, c, d, e, f) CONCAT2_6ARGS(a, b, c, d, e, f)
57 #define CONCAT2_6ARGS(a, b, c, d, e, f) a ## b ## c ## d ## e ## f
58 
59 #define CONCAT_5ARGS(a, b, c, d, e) CONCAT2_5ARGS(a, b, c, d, e)
60 #define CONCAT2_5ARGS(a, b, c, d, e) a ## b ## c ## d ## e
61 
62 #define CONCAT_4ARGS(a, b, c, d) CONCAT2_4ARGS(a, b, c, d)
63 #define CONCAT2_4ARGS(a, b, c, d) a ## b ## c ## d
64 
65 #define CONCAT_3ARGS(a, b, c) CONCAT2_3ARGS(a, b, c)
66 #define CONCAT2_3ARGS(a, b, c) a ## b ## c
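// Example: with the two-level macros above, arguments are expanded before token
// pasting, so for ROW_LENGTH=6, SIMD_SET=SSE, BLOCK=1 and WORD_LENGTH=double
//   CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)
// yields the identifier hh_trafo_complex_kernel_6_SSE_1hv_double.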
67 
68 // define instruction set numbers
69 #define SSE_128 128
70 #define AVX_256 256
71 #define AVX_512 512
72 #define NEON_ARCH64_128 1285
73 
74 #if VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512
75 #include <x86intrin.h>
76 #ifdef BLOCK2
77 #if VEC_SET == SSE_128
78 #include <pmmintrin.h>
79 #endif
80 #endif
81 
82 #define __forceinline __attribute__((always_inline))
83 
84 #endif
85 
86 
87 #include <complex.h>
88 
89 #include <stdio.h>
90 #include <stdlib.h>
91 
92 #ifdef BLOCK2
93 #define PREFIX double
94 #define BLOCK 2
95 #endif
96 
97 #ifdef BLOCK1
98 #define PREFIX single
99 #define BLOCK 1
100 #endif
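// BLOCK1/BLOCK2 select the one- and two-Householder-vector kernels (1hv/2hv);
// PREFIX is pasted by CONCAT_7ARGS further down, so the exported entry points
// are named single_hh_trafo_complex_* (BLOCK1) and double_hh_trafo_complex_*
// (BLOCK2), matching the Fortran interface comments below.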
101 
102 #if VEC_SET == SSE_128
103 #define SIMD_SET SSE
104 #endif
105 
106 #if VEC_SET == AVX_256
107 #define SIMD_SET AVX_AVX2
108 #endif
109 
110 #if VEC_SET == AVX_512
111 #define SIMD_SET AVX512
112 #endif
113 
114 
115 #if VEC_SET == SSE_128
116 
117 #ifdef DOUBLE_PRECISION_COMPLEX
118 #define offset 2
119 #define __SIMD_DATATYPE __m128d
120 #define _SIMD_LOAD _mm_load_pd
121 #define _SIMD_LOADU _mm_loadu_pd
122 #define _SIMD_STORE _mm_store_pd
123 #define _SIMD_STOREU _mm_storeu_pd
124 #define _SIMD_MUL _mm_mul_pd
125 #define _SIMD_ADD _mm_add_pd
126 #define _SIMD_XOR _mm_xor_pd
127 #define _SIMD_ADDSUB _mm_addsub_pd
128 #define _SIMD_SHUFFLE _mm_shuffle_pd
129 #define _SHUFFLE _MM_SHUFFLE2(0,1)
130 
131 #ifdef __ELPA_USE_FMA__
132 #define _SIMD_FMSUBADD _mm_maddsub_pd
133 #endif
134 #endif /* DOUBLE_PRECISION_COMPLEX */
135 
136 #ifdef SINGLE_PRECISION_COMPLEX
137 #define offset 4
138 #define __SIMD_DATATYPE __m128
139 #define _SIMD_LOAD _mm_load_ps
140 #define _SIMD_LOADU _mm_loadu_ps
141 #define _SIMD_STORE _mm_store_ps
142 #define _SIMD_STOREU _mm_storeu_ps
143 #define _SIMD_MUL _mm_mul_ps
144 #define _SIMD_ADD _mm_add_ps
145 #define _SIMD_XOR _mm_xor_ps
146 #define _SIMD_ADDSUB _mm_addsub_ps
147 #define _SIMD_SHUFFLE _mm_shuffle_ps
148 #define _SHUFFLE 0xb1
149 
150 #ifdef __ELPA_USE_FMA__
151 #define _SIMD_FMSUBADD _mm_maddsub_ps
152 #endif
153 
154 #endif /* SINGLE_PRECISION_COMPLEX */
155 
156 #endif /* VEC_SET == SSE_128 */
157 
158 #if VEC_SET == AVX_256
159 
160 #ifdef DOUBLE_PRECISION_COMPLEX
161 #define offset 4
162 #define __SIMD_DATATYPE __m256d
163 #define _SIMD_LOAD _mm256_load_pd
164 #define _SIMD_LOADU 1
165 #define _SIMD_STORE _mm256_store_pd
166 #define _SIMD_STOREU 1
167 #define _SIMD_MUL _mm256_mul_pd
168 #define _SIMD_ADD _mm256_add_pd
169 #define _SIMD_XOR _mm256_xor_pd
170 #define _SIMD_BROADCAST _mm256_broadcast_sd
171 #define _SIMD_SET1 _mm256_set1_pd
172 #define _SIMD_ADDSUB _mm256_addsub_pd
173 #define _SIMD_SHUFFLE _mm256_shuffle_pd
174 #define _SHUFFLE 0x5
175 
176 #ifdef HAVE_AVX2
177 
178 #ifdef __FMA4__
179 #define __ELPA_USE_FMA__
180 #define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
181 #define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
182 #endif
183 
184 #ifdef __AVX2__
185 #define __ELPA_USE_FMA__
186 #define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
187 #define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
188 #endif
189 
190 #define _SIMD_FMADDSUB _mm256_FMADDSUB_pd
191 #define _SIMD_FMSUBADD _mm256_FMSUBADD_pd
192 #endif /* HAVE_AVX2 */
193 
194 #endif /* DOUBLE_PRECISION_COMPLEX */
195 
196 #ifdef SINGLE_PRECISION_COMPLEX
197 #define offset 8
198 #define __SIMD_DATATYPE __m256
199 #define _SIMD_LOAD _mm256_load_ps
200 #define _SIMD_LOADU 1
201 #define _SIMD_STORE _mm256_store_ps
202 #define _SIMD_STOREU 1
203 #define _SIMD_MUL _mm256_mul_ps
204 #define _SIMD_ADD _mm256_add_ps
205 #define _SIMD_XOR _mm256_xor_ps
206 #define _SIMD_BROADCAST  _mm256_broadcast_ss
207 #define _SIMD_SET1 _mm256_set1_ps
208 #define _SIMD_ADDSUB _mm256_addsub_ps
209 #define _SIMD_SHUFFLE _mm256_shuffle_ps
210 #define _SHUFFLE 0xb1
211 
212 #ifdef HAVE_AVX2
213 
214 #ifdef __FMA4__
215 #define __ELPA_USE_FMA__
216 #define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
217 #define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
218 #endif
219 
220 #ifdef __AVX2__
221 #define __ELPA_USE_FMA__
222 #define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
223 #define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
224 #endif
225 
226 #define _SIMD_FMADDSUB _mm256_FMADDSUB_ps
227 #define _SIMD_FMSUBADD _mm256_FMSUBADD_ps
228 #endif /* HAVE_AVX2 */
229 
230 #endif /* SINGLE_PRECISION_COMPLEX */
231 
232 #endif /* VEC_SET == AVX_256 */
233 
234 #if VEC_SET == AVX_512
235 
236 #ifdef DOUBLE_PRECISION_COMPLEX
237 #define offset 8
238 #define __SIMD_DATATYPE __m512d
239 #define _SIMD_LOAD _mm512_load_pd
240 #define _SIMD_LOADU 1
241 #define _SIMD_STORE _mm512_store_pd
242 #define _SIMD_STOREU 1
243 #define _SIMD_MUL _mm512_mul_pd
244 #define _SIMD_ADD _mm512_add_pd
245 #ifdef HAVE_AVX512_XEON
246 #define _SIMD_XOR _mm512_xor_pd
247 #endif
248 #define _SIMD_BROADCAST 1
249 #define _SIMD_SET1 _mm512_set1_pd
250 #define _SIMD_SET _mm512_set_pd
251 #define _SIMD_XOR_EPI _mm512_xor_epi64
252 #define _SIMD_ADDSUB 1
253 #define _SIMD_SHUFFLE _mm512_shuffle_pd
254 #define _SIMD_MASK_STOREU _mm512_mask_storeu_pd
255 #define _SHUFFLE 0x55
256 
257 #ifdef HAVE_AVX512
258 #define __ELPA_USE_FMA__
259 #define _mm512_FMADDSUB_pd(a,b,c) _mm512_fmaddsub_pd(a,b,c)
260 #define _mm512_FMSUBADD_pd(a,b,c) _mm512_fmsubadd_pd(a,b,c)
261 
262 #define _SIMD_FMADDSUB _mm512_FMADDSUB_pd
263 #define _SIMD_FMSUBADD _mm512_FMSUBADD_pd
264 #endif /* HAVE_AVX512 */
265 
266 #endif /* DOUBLE_PRECISION_COMPLEX */
267 
268 #ifdef SINGLE_PRECISION_COMPLEX
269 #define offset 16
270 #define __SIMD_DATATYPE __m512
271 #define _SIMD_LOAD _mm512_load_ps
272 #define _SIMD_LOADU 1
273 #define _SIMD_STORE _mm512_store_ps
274 #define _SIMD_STOREU 1
275 #define _SIMD_MUL _mm512_mul_ps
276 #define _SIMD_ADD _mm512_add_ps
277 #ifdef HAVE_AVX512_XEON
278 #define _SIMD_XOR _mm512_xor_ps
279 #endif
280 #define _SIMD_BROADCAST 1
281 #define _SIMD_SET1 _mm512_set1_ps
282 #define _SIMD_SET _mm512_set_ps
283 #define _SIMD_ADDSUB 1
284 #define _SIMD_SHUFFLE _mm512_shuffle_ps
285 #define _SIMD_MASK_STOREU _mm512_mask_storeu_ps
286 #define _SIMD_XOR_EPI _mm512_xor_epi32
287 #define _SHUFFLE 0xb1
288 
289 #ifdef HAVE_AVX512
290 
291 #define __ELPA_USE_FMA__
292 #define _mm512_FMADDSUB_ps(a,b,c) _mm512_fmaddsub_ps(a,b,c)
293 #define _mm512_FMSUBADD_ps(a,b,c) _mm512_fmsubadd_ps(a,b,c)
294 
295 #define _SIMD_FMADDSUB _mm512_FMADDSUB_ps
296 #define _SIMD_FMSUBADD _mm512_FMSUBADD_ps
297 #endif /* HAVE_AVX512 */
298 
299 #endif /* SINGLE_PRECISION_COMPLEX */
300 
301 #endif /* VEC_SET == AVX_512 */
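// Background on the complex arithmetic used throughout the kernels below
// (an explanatory sketch, not generated code): complex numbers are stored as
// interleaved (re,im) pairs. For h = hr + i*hi and q = qr + i*qi,
//   h*q       = (hr*qr - hi*qi) + i*(hr*qi + hi*qr)
//   conj(h)*q = (hr*qr + hi*qi) + i*(hr*qi - hi*qr)
// The kernels compute tmp = hi*(qr,qi), swap re/im within each pair via
// _SIMD_SHUFFLE with _SHUFFLE, and combine with _SIMD_ADDSUB (subtract in the
// real slots, add in the imaginary slots) or, when FMA is available, with
// _SIMD_FMADDSUB for h*q and _SIMD_FMSUBADD for conj(h)*q. On the non-FMA
// paths the conjugate is obtained by flipping the sign bits of hi with
// _SIMD_XOR before the ADDSUB.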
302 
303 
304 
305 
306 #define __forceinline __attribute__((always_inline))
307 
308 #ifdef HAVE_SSE_INTRINSICS
309 #undef __AVX__
310 #endif
311 
312 #ifdef DOUBLE_PRECISION_COMPLEX
313 #define WORD_LENGTH double
314 #define DATA_TYPE double complex
315 #define DATA_TYPE_PTR double complex*
316 #define DATA_TYPE_REAL double
317 #define DATA_TYPE_REAL_PTR double*
318 #endif
319 
320 #ifdef SINGLE_PRECISION_COMPLEX
321 #define WORD_LENGTH single
322 #define DATA_TYPE float complex
323 #define DATA_TYPE_PTR float complex*
324 #define DATA_TYPE_REAL float
325 #define DATA_TYPE_REAL_PTR float*
326 #endif
327 
328 
329 //Forward declaration
330 
331 #if VEC_SET  == SSE_128
332 #ifdef DOUBLE_PRECISION_COMPLEX
333 #undef ROW_LENGTH
334 #define ROW_LENGTH 6
335 #endif
336 #ifdef SINGLE_PRECISION_COMPLEX
337 #undef ROW_LENGTH
338 #define ROW_LENGTH 12
339 #endif
340 #endif /* VEC_SET  == SSE_128 */
341 
342 #if VEC_SET  == AVX_256
343 #ifdef DOUBLE_PRECISION_COMPLEX
344 #undef ROW_LENGTH
345 #define ROW_LENGTH 12
346 #endif
347 #ifdef SINGLE_PRECISION_COMPLEX
348 #undef ROW_LENGTH
349 #define ROW_LENGTH 24
350 #endif
351 #endif /* VEC_SET  == AVX_256 */
352 
353 #if VEC_SET  == AVX_512
354 #ifdef DOUBLE_PRECISION_COMPLEX
355 #undef ROW_LENGTH
356 #define ROW_LENGTH 24
357 #endif
358 #ifdef SINGLE_PRECISION_COMPLEX
359 #undef ROW_LENGTH
360 #define ROW_LENGTH 48
361 #endif
362 #endif /* VEC_SET  == AVX_512 */
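// Note: ROW_LENGTH is the number of complex columns of q handled per kernel
// call; each successive forward declaration below is one SIMD register of
// complex values (offset/2 of them) narrower, down to the smallest remainder
// kernel.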
363 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
364 #ifdef BLOCK1
365 		                       );
366 #endif
367 #ifdef BLOCK2
368                                        ,int ldh, DATA_TYPE s);
369 #endif
370 
371 #if VEC_SET  == SSE_128
372 #ifdef DOUBLE_PRECISION_COMPLEX
373 #undef ROW_LENGTH
374 #define ROW_LENGTH 5
375 #endif
376 #ifdef SINGLE_PRECISION_COMPLEX
377 #undef ROW_LENGTH
378 #define ROW_LENGTH 10
379 #endif
380 #endif /* VEC_SET  == SSE_128 */
381 
382 #if VEC_SET  == AVX_256
383 #ifdef DOUBLE_PRECISION_COMPLEX
384 #undef ROW_LENGTH
385 #define ROW_LENGTH 10
386 #endif
387 #ifdef SINGLE_PRECISION_COMPLEX
388 #undef ROW_LENGTH
389 #define ROW_LENGTH 20
390 #endif
391 #endif /* VEC_SET  == AVX_256 */
392 
393 #if VEC_SET  == AVX_512
394 #ifdef DOUBLE_PRECISION_COMPLEX
395 #undef ROW_LENGTH
396 #define ROW_LENGTH 20
397 #endif
398 #ifdef SINGLE_PRECISION_COMPLEX
399 #undef ROW_LENGTH
400 #define ROW_LENGTH 40
401 #endif
402 #endif /* VEC_SET  == AVX_512 */
403 
404 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
405 #ifdef BLOCK1
406 		                       );
407 #endif
408 #ifdef BLOCK2
409                                        ,int ldh, DATA_TYPE s);
410 #endif
411 
412 
413 #if VEC_SET  == SSE_128
414 #ifdef DOUBLE_PRECISION_COMPLEX
415 #undef ROW_LENGTH
416 #define ROW_LENGTH 4
417 #endif
418 #ifdef SINGLE_PRECISION_COMPLEX
419 #undef ROW_LENGTH
420 #define ROW_LENGTH 8
421 #endif
422 #endif /* VEC_SET  == SSE_128 */
423 
424 #if VEC_SET  == AVX_256
425 #ifdef DOUBLE_PRECISION_COMPLEX
426 #undef ROW_LENGTH
427 #define ROW_LENGTH 8
428 #endif
429 #ifdef SINGLE_PRECISION_COMPLEX
430 #undef ROW_LENGTH
431 #define ROW_LENGTH 16
432 #endif
433 #endif /* VEC_SET  == AVX_256 */
434 
435 #if VEC_SET  == AVX_512
436 #ifdef DOUBLE_PRECISION_COMPLEX
437 #undef ROW_LENGTH
438 #define ROW_LENGTH 16
439 #endif
440 #ifdef SINGLE_PRECISION_COMPLEX
441 #undef ROW_LENGTH
442 #define ROW_LENGTH 32
443 #endif
444 #endif /* VEC_SET  == AVX_512 */
445 
446 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
447 #ifdef BLOCK1
448 		                       );
449 #endif
450 #ifdef BLOCK2
451                                        ,int ldh, DATA_TYPE s);
452 #endif
453 
454 #if VEC_SET  == SSE_128
455 #ifdef DOUBLE_PRECISION_COMPLEX
456 #undef ROW_LENGTH
457 #define ROW_LENGTH 3
458 #endif
459 #ifdef SINGLE_PRECISION_COMPLEX
460 #undef ROW_LENGTH
461 #define ROW_LENGTH 6
462 #endif
463 #endif /* VEC_SET  == SSE_128 */
464 
465 #if VEC_SET  == AVX_256
466 #ifdef DOUBLE_PRECISION_COMPLEX
467 #undef ROW_LENGTH
468 #define ROW_LENGTH 6
469 #endif
470 #ifdef SINGLE_PRECISION_COMPLEX
471 #undef ROW_LENGTH
472 #define ROW_LENGTH 12
473 #endif
474 #endif /* VEC_SET  == AVX_256 */
475 
476 #if VEC_SET  == AVX_512
477 #ifdef DOUBLE_PRECISION_COMPLEX
478 #undef ROW_LENGTH
479 #define ROW_LENGTH 12
480 #endif
481 #ifdef SINGLE_PRECISION_COMPLEX
482 #undef ROW_LENGTH
483 #define ROW_LENGTH 24
484 #endif
485 #endif /* VEC_SET  == AVX_512 */
486 
487 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
488 #ifdef BLOCK1
489 		                       );
490 #endif
491 #ifdef BLOCK2
492                                        ,int ldh, DATA_TYPE s);
493 #endif
494 
495 #if VEC_SET  == SSE_128
496 #ifdef DOUBLE_PRECISION_COMPLEX
497 #undef ROW_LENGTH
498 #define ROW_LENGTH 2
499 #endif
500 #ifdef SINGLE_PRECISION_COMPLEX
501 #undef ROW_LENGTH
502 #define ROW_LENGTH 4
503 #endif
504 #endif /* VEC_SET  == SSE_128 */
505 
506 #if VEC_SET  == AVX_256
507 #ifdef DOUBLE_PRECISION_COMPLEX
508 #undef ROW_LENGTH
509 #define ROW_LENGTH 4
510 #endif
511 #ifdef SINGLE_PRECISION_COMPLEX
512 #undef ROW_LENGTH
513 #define ROW_LENGTH 8
514 #endif
515 #endif /* VEC_SET  == AVX_256 */
516 
517 #if VEC_SET  == AVX_512
518 #ifdef DOUBLE_PRECISION_COMPLEX
519 #undef ROW_LENGTH
520 #define ROW_LENGTH 8
521 #endif
522 #ifdef SINGLE_PRECISION_COMPLEX
523 #undef ROW_LENGTH
524 #define ROW_LENGTH 16
525 #endif
526 #endif /* VEC_SET  == AVX_512 */
527 
528 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
529 #ifdef BLOCK1
530 		                       );
531 #endif
532 #ifdef BLOCK2
533                                        ,int ldh, DATA_TYPE s);
534 #endif
535 
536 #if VEC_SET  == SSE_128
537 #ifdef DOUBLE_PRECISION_COMPLEX
538 #undef ROW_LENGTH
539 #define ROW_LENGTH 1
540 #endif
541 #ifdef SINGLE_PRECISION_COMPLEX
542 #undef ROW_LENGTH
543 #define ROW_LENGTH 2
544 #endif
545 #endif /* VEC_SET  == SSE_128 */
546 
547 #if VEC_SET  == AVX_256
548 #ifdef DOUBLE_PRECISION_COMPLEX
549 #undef ROW_LENGTH
550 #define ROW_LENGTH 2
551 #endif
552 #ifdef SINGLE_PRECISION_COMPLEX
553 #undef ROW_LENGTH
554 #define ROW_LENGTH 4
555 #endif
556 #endif /* VEC_SET  == AVX_256 */
557 
558 #if VEC_SET  == AVX_512
559 #ifdef DOUBLE_PRECISION_COMPLEX
560 #undef ROW_LENGTH
561 #define ROW_LENGTH 4
562 #endif
563 #ifdef SINGLE_PRECISION_COMPLEX
564 #undef ROW_LENGTH
565 #define ROW_LENGTH 8
566 #endif
567 #endif /* VEC_SET  == AVX_512 */
568 
569 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH)(DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
570 #ifdef BLOCK1
571 		                       );
572 #endif
573 #ifdef BLOCK2
574                                        ,int ldh, DATA_TYPE s);
575 #endif
576 
577 
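// Note: the "!f>" blocks below are, presumably, scanned by the ELPA build
// scripts to generate the corresponding Fortran interface declarations for the
// C entry points defined in this template; to the C compiler they are ordinary
// comments.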
578 /*
579 !f>#ifdef HAVE_SSE_INTRINSICS
580 !f> interface
581 !f>   subroutine single_hh_trafo_complex_SSE_1hv_double(q, hh, pnb, pnq, pldq) &
582 !f>                             bind(C, name="single_hh_trafo_complex_SSE_1hv_double")
583 !f>     use, intrinsic :: iso_c_binding
584 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
585 !f>     ! complex(kind=c_double_complex)     :: q(*)
586 !f>     type(c_ptr), value                   :: q
587 !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
588 !f>   end subroutine
589 !f> end interface
590 !f>#endif
591 */
592 
593 /*
594 !f>#ifdef HAVE_SSE_INTRINSICS
595 !f> interface
596 !f>   subroutine single_hh_trafo_complex_SSE_1hv_single(q, hh, pnb, pnq, pldq) &
597 !f>                             bind(C, name="single_hh_trafo_complex_SSE_1hv_single")
598 !f>     use, intrinsic :: iso_c_binding
599 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
600 !f>     ! complex(kind=c_float_complex)   :: q(*)
601 !f>     type(c_ptr), value                :: q
602 !f>     complex(kind=c_float_complex)   :: hh(pnb,2)
603 !f>   end subroutine
604 !f> end interface
605 !f>#endif
606 */
607 
608 
609 /*
610 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
611 !f> interface
612 !f>   subroutine single_hh_trafo_complex_AVX_AVX2_1hv_double(q, hh, pnb, pnq, pldq) &
613 !f>                             bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_double")
614 !f>     use, intrinsic :: iso_c_binding
615 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
616 !f>     ! complex(kind=c_double_complex)     :: q(*)
617 !f>     type(c_ptr), value                   :: q
618 !f>     complex(kind=c_double_complex)       :: hh(pnb,2)
619 !f>   end subroutine
620 !f> end interface
621 !f>#endif
622 */
623 
624 /*
625 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
626 !f> interface
627 !f>   subroutine single_hh_trafo_complex_AVX_AVX2_1hv_single(q, hh, pnb, pnq, pldq) &
628 !f>                             bind(C, name="single_hh_trafo_complex_AVX_AVX2_1hv_single")
629 !f>     use, intrinsic :: iso_c_binding
630 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
631 !f>     ! complex(kind=c_float_complex)   :: q(*)
632 !f>     type(c_ptr), value              :: q
633 !f>     complex(kind=c_float_complex)   :: hh(pnb,2)
634 !f>   end subroutine
635 !f> end interface
636 !f>#endif
637 */
638 
639 /*
640 !f>#if defined(HAVE_AVX512)
641 !f> interface
642 !f>   subroutine single_hh_trafo_complex_AVX512_1hv_double(q, hh, pnb, pnq, pldq) &
643 !f>                             bind(C, name="single_hh_trafo_complex_AVX512_1hv_double")
644 !f>     use, intrinsic :: iso_c_binding
645 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
646 !f>     ! complex(kind=c_double_complex)     :: q(*)
647 !f>     type(c_ptr), value                 :: q
648 !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
649 !f>   end subroutine
650 !f> end interface
651 !f>#endif
652 */
653 
654 /*
655 !f>#if defined(HAVE_AVX512)
656 !f> interface
657 !f>   subroutine single_hh_trafo_complex_AVX512_1hv_single(q, hh, pnb, pnq, pldq) &
658 !f>                             bind(C, name="single_hh_trafo_complex_AVX512_1hv_single")
659 !f>     use, intrinsic :: iso_c_binding
660 !f>     integer(kind=c_int)     :: pnb, pnq, pldq
661 !f>     ! complex(kind=c_float_complex)     :: q(*)
662 !f>     type(c_ptr), value                  :: q
663 !f>     complex(kind=c_float_complex)     :: hh(pnb,2)
664 !f>   end subroutine
665 !f> end interface
666 !f>#endif
667 */
668 
669 
670 /*
671 !f>#ifdef HAVE_SSE_INTRINSICS
672 !f> interface
673 !f>   subroutine double_hh_trafo_complex_SSE_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
674 !f>                             bind(C, name="double_hh_trafo_complex_SSE_2hv_double")
675 !f>     use, intrinsic :: iso_c_binding
676 !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
677 !f>     ! complex(kind=c_double_complex)     :: q(*)
678 !f>     type(c_ptr), value                   :: q
679 !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
680 !f>   end subroutine
681 !f> end interface
682 !f>#endif
683 */
684 
685 /*
686 !f>#ifdef HAVE_SSE_INTRINSICS
687 !f> interface
688 !f>   subroutine double_hh_trafo_complex_SSE_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
689 !f>                             bind(C, name="double_hh_trafo_complex_SSE_2hv_single")
690 !f>     use, intrinsic :: iso_c_binding
691 !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
692 !f>     ! complex(kind=c_float_complex)   :: q(*)
693 !f>     type(c_ptr), value                :: q
694 !f>     complex(kind=c_float_complex)   :: hh(pnb,2)
695 !f>   end subroutine
696 !f> end interface
697 !f>#endif
698 */
699 
700 /*
701 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
702 !f> interface
703 !f>   subroutine double_hh_trafo_complex_AVX_AVX2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
704 !f>                                bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_double")
705 !f>        use, intrinsic :: iso_c_binding
706 !f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
707 !f>        ! complex(kind=c_double_complex)     :: q(*)
708 !f>        type(c_ptr), value                     :: q
709 !f>        complex(kind=c_double_complex)           :: hh(pnb,2)
710 !f>   end subroutine
711 !f> end interface
712 !f>#endif
713 */
714 
715 /*
716 !f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
717 !f> interface
718 !f>   subroutine double_hh_trafo_complex_AVX_AVX2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
719 !f>                                bind(C, name="double_hh_trafo_complex_AVX_AVX2_2hv_single")
720 !f>        use, intrinsic :: iso_c_binding
721 !f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
722 !f>        ! complex(kind=c_float_complex)   :: q(*)
723 !f>        type(c_ptr), value                  :: q
724 !f>        complex(kind=c_float_complex)        :: hh(pnb,2)
725 !f>   end subroutine
726 !f> end interface
727 !f>#endif
728 */
729 
730 /*
731 !f>#if defined(HAVE_AVX512)
732 !f> interface
733 !f>   subroutine double_hh_trafo_complex_AVX512_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
734 !f>                             bind(C, name="double_hh_trafo_complex_AVX512_2hv_double")
735 !f>     use, intrinsic :: iso_c_binding
736 !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
737 !f>     ! complex(kind=c_double_complex)     :: q(*)
738 !f>     type(c_ptr), value                   :: q
739 !f>     complex(kind=c_double_complex)     :: hh(pnb,2)
740 !f>   end subroutine
741 !f> end interface
742 !f>#endif
743 */
744 
745 /*
746 !f>#if defined(HAVE_AVX512)
747 !f> interface
748 !f>   subroutine double_hh_trafo_complex_AVX512_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
749 !f>                             bind(C, name="double_hh_trafo_complex_AVX512_2hv_single")
750 !f>     use, intrinsic :: iso_c_binding
751 !f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
752 !f>     ! complex(kind=c_float_complex)     :: q(*)
753 !f>     type(c_ptr), value                  :: q
754 !f>     complex(kind=c_float_complex)     :: hh(pnb,2)
755 !f>   end subroutine
756 !f> end interface
757 !f>#endif
758 */
759 
760 
761 void CONCAT_7ARGS(PREFIX,_hh_trafo_complex_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int* pnb, int* pnq, int* pldq
762 #ifdef BLOCK1
763 		  )
764 #endif
765 #ifdef BLOCK2
766                   ,int* pldh)
767 #endif
768 {
769 
770      int i, worked_on;
771      int nb = *pnb;
772      int nq = *pldq;
773      int ldq = *pldq;
774 #ifdef BLOCK2
775      int ldh = *pldh;
776 
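     // For the 2hv case, s below accumulates what is, in effect, the scalar
     // product of the two Householder vectors stored in hh (second vector at
     // column offset ldh); the per-width kernels use it to couple the two
     // rank-1 updates.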
777      DATA_TYPE s = conj(hh[(ldh)+1])*1.0;
778 
779      for (i = 2; i < nb; i++)
780      {
781              s += hh[i-1] * conj(hh[(i+ldh)]);
782      }
783 #endif
784 
785      worked_on = 0;
786 
787 #ifdef BLOCK1
788 
789 #if VEC_SET == SSE_128
790 #ifdef DOUBLE_PRECISION_COMPLEX
791 #define ROW_LENGTH 6
792 #define STEP_SIZE 6
793 #define UPPER_BOUND 5
794 #endif
795 #ifdef SINGLE_PRECISION_COMPLEX
796 #define ROW_LENGTH 12
797 #define STEP_SIZE 12
798 #define UPPER_BOUND 10
799 #endif
800 #endif /* VEC_SET == SSE_128 */
801 
802 #if VEC_SET == AVX_256
803 #ifdef DOUBLE_PRECISION_COMPLEX
804 #define ROW_LENGTH 12
805 #define STEP_SIZE 12
806 #define UPPER_BOUND 10
807 #endif
808 #ifdef SINGLE_PRECISION_COMPLEX
809 #define ROW_LENGTH 24
810 #define STEP_SIZE 24
811 #define UPPER_BOUND 20
812 #endif
813 #endif /* VEC_SET == AVX_256 */
814 
815 #if VEC_SET == AVX_512
816 #ifdef DOUBLE_PRECISION_COMPLEX
817 #define ROW_LENGTH 24
818 #define STEP_SIZE 24
819 #define UPPER_BOUND 20
820 #endif
821 #ifdef SINGLE_PRECISION_COMPLEX
822 #define ROW_LENGTH 48
823 #define STEP_SIZE 48
824 #define UPPER_BOUND 40
825 #endif
826 #endif /* VEC_SET == AVX_512 */
827 
828 
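        // 1hv main loop: each iteration hands STEP_SIZE (== ROW_LENGTH) complex
        // columns of q to the widest kernel of this VEC_SET; the loop runs while
        // more than UPPER_BOUND columns remain, and the "if (nq-i == ROW_LENGTH)"
        // cascade below handles the leftover columns with successively narrower
        // kernels.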
829         for (i = 0; i < nq - UPPER_BOUND; i+= STEP_SIZE)
830         {
831 
832             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
833 	    worked_on += ROW_LENGTH;
834         }
835 
836         if (nq == i) {
837           return;
838         }
839 
840 #if VEC_SET == SSE_128
841 #undef ROW_LENGTH
842 #ifdef DOUBLE_PRECISION_COMPLEX
843 #define ROW_LENGTH 5
844 #endif
845 #ifdef SINGLE_PRECISION_COMPLEX
846 #define ROW_LENGTH 10
847 #endif
848 #endif /* VEC_SET == SSE_128 */
849 
850 #if VEC_SET == AVX_256
851 #undef ROW_LENGTH
852 #ifdef DOUBLE_PRECISION_COMPLEX
853 #define ROW_LENGTH 10
854 #endif
855 #ifdef SINGLE_PRECISION_COMPLEX
856 #define ROW_LENGTH 20
857 #endif
858 #endif /* VEC_SET == AVX_256 */
859 
860 #if VEC_SET == AVX_512
861 #undef ROW_LENGTH
862 #ifdef DOUBLE_PRECISION_COMPLEX
863 #define ROW_LENGTH 20
864 #endif
865 #ifdef SINGLE_PRECISION_COMPLEX
866 #define ROW_LENGTH 40
867 #endif
868 #endif /* VEC_SET == AVX_512 */
869 
870         if (nq-i == ROW_LENGTH)
871         {
872             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
873 	    worked_on += ROW_LENGTH;
874         }
875 
876 #if VEC_SET == SSE_128
877 #undef ROW_LENGTH
878 #ifdef DOUBLE_PRECISION_COMPLEX
879 #define ROW_LENGTH 4
880 #endif
881 #ifdef SINGLE_PRECISION_COMPLEX
882 #define ROW_LENGTH 8
883 #endif
884 #endif /* VEC_SET == SSE_128 */
885 
886 #if VEC_SET == AVX_256
887 #undef ROW_LENGTH
888 #ifdef DOUBLE_PRECISION_COMPLEX
889 #define ROW_LENGTH 8
890 #endif
891 #ifdef SINGLE_PRECISION_COMPLEX
892 #define ROW_LENGTH 16
893 #endif
894 #endif /* VEC_SET == AVX_256 */
895 
896 #if VEC_SET == AVX_512
897 #undef ROW_LENGTH
898 #ifdef DOUBLE_PRECISION_COMPLEX
899 #define ROW_LENGTH 16
900 #endif
901 #ifdef SINGLE_PRECISION_COMPLEX
902 #define ROW_LENGTH 32
903 #endif
904 #endif /* VEC_SET == AVX_512 */
905 
906         if (nq-i == ROW_LENGTH)
907         {
908             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
909 	    worked_on += ROW_LENGTH;
910         }
911 
912 #if VEC_SET == SSE_128
913 #undef ROW_LENGTH
914 #ifdef DOUBLE_PRECISION_COMPLEX
915 #define ROW_LENGTH 3
916 #endif
917 #ifdef SINGLE_PRECISION_COMPLEX
918 #define ROW_LENGTH 6
919 #endif
920 #endif /* VEC_SET == SSE_128 */
921 
922 #if VEC_SET == AVX_256
923 #undef ROW_LENGTH
924 #ifdef DOUBLE_PRECISION_COMPLEX
925 #define ROW_LENGTH 6
926 #endif
927 #ifdef SINGLE_PRECISION_COMPLEX
928 #define ROW_LENGTH 12
929 #endif
930 #endif /* VEC_SET == AVX_256 */
931 
932 #if VEC_SET == AVX_512
933 #undef ROW_LENGTH
934 #ifdef DOUBLE_PRECISION_COMPLEX
935 #define ROW_LENGTH 12
936 #endif
937 #ifdef SINGLE_PRECISION_COMPLEX
938 #define ROW_LENGTH 24
939 #endif
940 #endif /* VEC_SET == AVX_512 */
941 
942         if (nq-i == ROW_LENGTH)
943         {
944             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
945 	    worked_on += ROW_LENGTH;
946         }
947 
948 #if VEC_SET == SSE_128
949 #undef ROW_LENGTH
950 #ifdef DOUBLE_PRECISION_COMPLEX
951 #define ROW_LENGTH 2
952 #endif
953 #ifdef SINGLE_PRECISION_COMPLEX
954 #define ROW_LENGTH 4
955 #endif
956 #endif /* VEC_SET == SSE_128 */
957 
958 #if VEC_SET == AVX_256
959 #undef ROW_LENGTH
960 #ifdef DOUBLE_PRECISION_COMPLEX
961 #define ROW_LENGTH 4
962 #endif
963 #ifdef SINGLE_PRECISION_COMPLEX
964 #define ROW_LENGTH 8
965 #endif
966 #endif /* VEC_SET == AVX_256 */
967 
968 #if VEC_SET == AVX_512
969 #undef ROW_LENGTH
970 #ifdef DOUBLE_PRECISION_COMPLEX
971 #define ROW_LENGTH 8
972 #endif
973 #ifdef SINGLE_PRECISION_COMPLEX
974 #define ROW_LENGTH 16
975 #endif
976 #endif /* VEC_SET == AVX_512 */
977 
978         if (nq-i == ROW_LENGTH)
979         {
980             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
981 	    worked_on += ROW_LENGTH;
982         }
983 
984 #if VEC_SET == SSE_128
985 #undef ROW_LENGTH
986 #ifdef DOUBLE_PRECISION_COMPLEX
987 #define ROW_LENGTH 1
988 #endif
989 #ifdef SINGLE_PRECISION_COMPLEX
990 #define ROW_LENGTH 2
991 #endif
992 #endif /* VEC_SET == SSE_128 */
993 
994 #if VEC_SET == AVX_256
995 #undef ROW_LENGTH
996 #ifdef DOUBLE_PRECISION_COMPLEX
997 #define ROW_LENGTH 2
998 #endif
999 #ifdef SINGLE_PRECISION_COMPLEX
1000 #define ROW_LENGTH 4
1001 #endif
1002 #endif /* VEC_SET == AVX_256 */
1003 
1004 #if VEC_SET == AVX_512
1005 #undef ROW_LENGTH
1006 #ifdef DOUBLE_PRECISION_COMPLEX
1007 #define ROW_LENGTH 4
1008 #endif
1009 #ifdef SINGLE_PRECISION_COMPLEX
1010 #define ROW_LENGTH 8
1011 #endif
1012 #endif /* VEC_SET == AVX_512 */
1013 
1014         if (nq-i == ROW_LENGTH)
1015         {
1016             CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq);
1017 	    worked_on += ROW_LENGTH;
1018         }
1019 
1020 #endif /* BLOCK1 */
1021 
1022 #ifdef BLOCK2
1023 
1024 #if VEC_SET == SSE_128
1025 #undef ROW_LENGTH
1026 #ifdef DOUBLE_PRECISION_COMPLEX
1027 #define ROW_LENGTH 4
1028 #define STEP_SIZE 4
1029 #define UPPER_BOUND 3
1030 #endif
1031 #ifdef SINGLE_PRECISION_COMPLEX
1032 #define ROW_LENGTH 8
1033 #define STEP_SIZE 8
1034 #define UPPER_BOUND 6
1035 #endif
1036 #endif /* VEC_SET == SSE_128 */
1037 
1038 #if VEC_SET == AVX_256
1039 #undef ROW_LENGTH
1040 #ifdef DOUBLE_PRECISION_COMPLEX
1041 #define ROW_LENGTH 8
1042 #define STEP_SIZE 8
1043 #define UPPER_BOUND 6
1044 #endif
1045 #ifdef SINGLE_PRECISION_COMPLEX
1046 #define ROW_LENGTH 16
1047 #define STEP_SIZE 16
1048 #define UPPER_BOUND 12
1049 #endif
1050 #endif /* VEC_SET == AVX_256 */
1051 
1052 #if VEC_SET == AVX_512
1053 #undef ROW_LENGTH
1054 #ifdef DOUBLE_PRECISION_COMPLEX
1055 #define ROW_LENGTH 16
1056 #define STEP_SIZE 16
1057 #define UPPER_BOUND 12
1058 #endif
1059 #ifdef SINGLE_PRECISION_COMPLEX
1060 #define ROW_LENGTH 32
1061 #define STEP_SIZE 32
1062 #define UPPER_BOUND 24
1063 #endif
1064 #endif /* VEC_SET == AVX_512 */
1065 
1066     for (i = 0; i < nq - UPPER_BOUND; i+=STEP_SIZE)
1067     {
1068          CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1069 	 worked_on +=ROW_LENGTH;
1070     }
1071 
1072     if (nq == i)
1073     {
1074       return;
1075     }
1076 
1077 #if VEC_SET == SSE_128
1078 #undef ROW_LENGTH
1079 #ifdef DOUBLE_PRECISION_COMPLEX
1080 #define ROW_LENGTH 3
1081 #endif
1082 #ifdef SINGLE_PRECISION_COMPLEX
1083 #define ROW_LENGTH 6
1084 #endif
1085 #endif /* VEC_SET == SSE_128 */
1086 
1087 #if VEC_SET == AVX_256
1088 #undef ROW_LENGTH
1089 #ifdef DOUBLE_PRECISION_COMPLEX
1090 #define ROW_LENGTH 6
1091 #endif
1092 #ifdef SINGLE_PRECISION_COMPLEX
1093 #define ROW_LENGTH 12
1094 #endif
1095 #endif /* VEC_SET == AVX_256 */
1096 
1097 #if VEC_SET == AVX_512
1098 #undef ROW_LENGTH
1099 #ifdef DOUBLE_PRECISION_COMPLEX
1100 #define ROW_LENGTH 12
1101 #endif
1102 #ifdef SINGLE_PRECISION_COMPLEX
1103 #define ROW_LENGTH 24
1104 #endif
1105 #endif /* VEC_SET == AVX_512 */
1106 
1107     if (nq-i == ROW_LENGTH)
1108     {
1109         CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1110         worked_on += ROW_LENGTH;
1111     }
1112 
1113 #if VEC_SET == SSE_128
1114 #undef ROW_LENGTH
1115 #ifdef DOUBLE_PRECISION_COMPLEX
1116 #define ROW_LENGTH 2
1117 #endif
1118 #ifdef SINGLE_PRECISION_COMPLEX
1119 #define ROW_LENGTH 4
1120 #endif
1121 #endif /* VEC_SET == SSE_128 */
1122 
1123 #if VEC_SET == AVX_256
1124 #undef ROW_LENGTH
1125 #ifdef DOUBLE_PRECISION_COMPLEX
1126 #define ROW_LENGTH 4
1127 #endif
1128 #ifdef SINGLE_PRECISION_COMPLEX
1129 #define ROW_LENGTH 8
1130 #endif
1131 #endif /* VEC_SET == AVX_256 */
1132 
1133 #if VEC_SET == AVX_512
1134 #undef ROW_LENGTH
1135 #ifdef DOUBLE_PRECISION_COMPLEX
1136 #define ROW_LENGTH 8
1137 #endif
1138 #ifdef SINGLE_PRECISION_COMPLEX
1139 #define ROW_LENGTH 16
1140 #endif
1141 #endif /* VEC_SET == AVX_512 */
1142 
1143     if (nq-i == ROW_LENGTH)
1144     {
1145         CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1146         worked_on += ROW_LENGTH;
1147     }
1148 
1149 #if VEC_SET == SSE_128
1150 #undef ROW_LENGTH
1151 #ifdef DOUBLE_PRECISION_COMPLEX
1152 #define ROW_LENGTH 1
1153 #endif
1154 #ifdef SINGLE_PRECISION_COMPLEX
1155 #define ROW_LENGTH 2
1156 #endif
1157 #endif /* VEC_SET == SSE_128 */
1158 
1159 #if VEC_SET == AVX_256
1160 #undef ROW_LENGTH
1161 #ifdef DOUBLE_PRECISION_COMPLEX
1162 #define ROW_LENGTH 2
1163 #endif
1164 #ifdef SINGLE_PRECISION_COMPLEX
1165 #define ROW_LENGTH 4
1166 #endif
1167 #endif /* VEC_SET == AVX_256 */
1168 
1169 #if VEC_SET == AVX_512
1170 #undef ROW_LENGTH
1171 #ifdef DOUBLE_PRECISION_COMPLEX
1172 #define ROW_LENGTH 4
1173 #endif
1174 #ifdef SINGLE_PRECISION_COMPLEX
1175 #define ROW_LENGTH 8
1176 #endif
1177 #endif /* VEC_SET == AVX_512 */
1178 
1179     if (nq-i == ROW_LENGTH)
1180     {
1181         CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (&q[i], hh, nb, ldq, ldh, s);
1182         worked_on += ROW_LENGTH;
1183     }
1184 
1185 #endif /* BLOCK2 */
1186 
1187 #ifdef WITH_DEBUG
1188     if (worked_on != nq)
1189     {
1190       printf("Error in complex SIMD_SET BLOCK BLOCK kernel %d %d\n", worked_on, nq);
1191       abort();
1192     }
1193 #endif
1194 
1195 }
1196 
1197 #if VEC_SET == SSE_128
1198 #ifdef DOUBLE_PRECISION_COMPLEX
1199 #define ROW_LENGTH 6
1200 #endif
1201 #ifdef SINGLE_PRECISION_COMPLEX
1202 #define ROW_LENGTH 12
1203 #endif
1204 #endif /* VEC_SET == SSE_128 */
1205 
1206 #if VEC_SET == AVX_256
1207 #ifdef DOUBLE_PRECISION_COMPLEX
1208 #define ROW_LENGTH 12
1209 #endif
1210 #ifdef SINGLE_PRECISION_COMPLEX
1211 #define ROW_LENGTH 24
1212 #endif
1213 #endif /* VEC_SET == AVX_256 */
1214 
1215 #if VEC_SET == AVX_512
1216 #ifdef DOUBLE_PRECISION_COMPLEX
1217 #define ROW_LENGTH 24
1218 #endif
1219 #ifdef SINGLE_PRECISION_COMPLEX
1220 #define ROW_LENGTH 48
1221 #endif
1222 #endif /* VEC_SET == AVX_512 */
1223 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
1224 #ifdef BLOCK1
1225 		)
1226 #endif
1227 #ifdef BLOCK2
1228                 ,int ldh, DATA_TYPE s)
1229 #endif
1230 {
1231 
1232     DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
1233     DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
1234 #ifdef BLOCK2
1235     DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
1236 #endif
1237 
1238     __SIMD_DATATYPE x1, x2, x3, x4, x5, x6;
1239     __SIMD_DATATYPE q1, q2, q3, q4, q5, q6;
1240 #ifdef BLOCK2
1241     __SIMD_DATATYPE y1, y2, y3, y4, y5, y6;
1242     __SIMD_DATATYPE h2_real, h2_imag;
1243 #endif
1244     __SIMD_DATATYPE h1_real, h1_imag;
1245     __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1246     int i=0;
1247 
1248 #if VEC_SET == SSE_128
1249 #ifdef DOUBLE_PRECISION_COMPLEX
1250     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
1251 #endif
1252 #ifdef SINGLE_PRECISION_COMPLEX
1253     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
1254 #endif
1255 #endif /* VEC_SET == SSE_128 */
1256 
1257 #if VEC_SET == AVX_256
1258 #ifdef DOUBLE_PRECISION_COMPLEX
1259     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
1260 #endif
1261 #ifdef SINGLE_PRECISION_COMPLEX
1262     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
1263 #endif
1264 #endif /* VEC_SET == AVX_256 */
1265 
1266 #if VEC_SET == AVX_512
1267 #ifdef DOUBLE_PRECISION_COMPLEX
1268         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
1269 #endif
1270 #ifdef SINGLE_PRECISION_COMPLEX
1271         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
1272 #endif
1273 #endif /* VEC_SET == AVX_512 */
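    // 'sign' has only the IEEE sign bit set in every lane; XOR-ing a vector
    // with it negates each real/imaginary slot, which is how tau is negated and
    // how the imaginary parts are conjugated on the non-FMA code paths below.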
1274 
1275 #ifdef BLOCK2
1276      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
1277      x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
1278      x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
1279      x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
1280      x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
1281      x6 = _SIMD_LOAD(&q_dbl[(2*ldq)+5*offset]);
1282 
1283 #if VEC_SET == SSE_128
1284 #ifdef DOUBLE_PRECISION_COMPLEX
1285      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
1286      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
1287 #endif
1288 #ifdef SINGLE_PRECISION_COMPLEX
1289      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
1290      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
1291 #endif
1292 #endif /* VEC_SET == SSE_128 */
1293 
1294 #if VEC_SET == AVX_256
1295      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
1296      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1297 #endif /* VEC_SET == AVX_256 */
1298 
1299 #if VEC_SET == AVX_512
1300      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
1301      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
1302 #endif /*  VEC_SET == AVX_512 */
1303 
1304 #ifndef __ELPA_USE_FMA__
1305      // conjugate
1306      h2_imag = _SIMD_XOR(h2_imag, sign);
1307 #endif
1308 
1309      y1 = _SIMD_LOAD(&q_dbl[0]);
1310      y2 = _SIMD_LOAD(&q_dbl[offset]);
1311      y3 = _SIMD_LOAD(&q_dbl[2*offset]);
1312      y4 = _SIMD_LOAD(&q_dbl[3*offset]);
1313      y5 = _SIMD_LOAD(&q_dbl[4*offset]);
1314      y6 = _SIMD_LOAD(&q_dbl[5*offset]);
1315 
1316      tmp1 = _SIMD_MUL(h2_imag, x1);
1317 #ifdef __ELPA_USE_FMA__
1318      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1319 #else
1320      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1321 #endif
1322      tmp2 = _SIMD_MUL(h2_imag, x2);
1323 #ifdef __ELPA_USE_FMA__
1324      y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1325 #else
1326      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1327 #endif
1328 
1329      tmp3 = _SIMD_MUL(h2_imag, x3);
1330 #ifdef __ELPA_USE_FMA__
1331      y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1332 #else
1333      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1334 #endif
1335      tmp4 = _SIMD_MUL(h2_imag, x4);
1336 #ifdef __ELPA_USE_FMA__
1337      y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1338 #else
1339      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1340 #endif
1341 
1342      tmp5 = _SIMD_MUL(h2_imag, x5);
1343 #ifdef __ELPA_USE_FMA__
1344      y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1345 #else
1346      y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1347 #endif
1348      tmp6 = _SIMD_MUL(h2_imag, x6);
1349 #ifdef __ELPA_USE_FMA__
1350      y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1351 #else
1352      y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1353 #endif
1354 
1355 #endif /* BLOCK2 */
1356 
1357 #ifdef BLOCK1
1358     x1 = _SIMD_LOAD(&q_dbl[0]);
1359     x2 = _SIMD_LOAD(&q_dbl[offset]);
1360     x3 = _SIMD_LOAD(&q_dbl[2*offset]);
1361     x4 = _SIMD_LOAD(&q_dbl[3*offset]);
1362     x5 = _SIMD_LOAD(&q_dbl[4*offset]);
1363     x6 = _SIMD_LOAD(&q_dbl[5*offset]);
1364 #endif
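    // Accumulation loop: for each remaining row, x picks up q(i,:) times the
    // conjugated coefficient of the first Householder vector (whose leading
    // entry is implicitly 1), i.e. roughly x = v1^H * Q for these columns; in
    // the BLOCK2 case y accumulates the analogous product with the second
    // Householder vector stored at column offset ldh of hh.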
1365 
1366     for (i = BLOCK; i < nb; i++)
1367     {
1368 
1369 #if VEC_SET == SSE_128
1370 #ifdef DOUBLE_PRECISION_COMPLEX
1371         h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
1372         h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
1373 #endif
1374 #ifdef SINGLE_PRECISION_COMPLEX
1375         h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
1376         h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
1377 #endif
1378 #endif /* VEC_SET == SSE_128 */
1379 
1380 #if VEC_SET == AVX_256
1381        h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
1382        h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
1383 #endif /* VEC_SET == AVX_256 */
1384 
1385 #if VEC_SET == AVX_512
1386        h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
1387        h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
1388 #endif /* VEC_SET == AVX_512 */
1389 
1390 #ifndef __ELPA_USE_FMA__
1391         // conjugate
1392         h1_imag = _SIMD_XOR(h1_imag, sign);
1393 #endif
1394 
1395         q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
1396         q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
1397         q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
1398         q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
1399         q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
1400         q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
1401 
1402         tmp1 = _SIMD_MUL(h1_imag, q1);
1403 #ifdef __ELPA_USE_FMA__
1404         x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1405 #else
1406         x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1407 #endif
1408         tmp2 = _SIMD_MUL(h1_imag, q2);
1409 #ifdef __ELPA_USE_FMA__
1410         x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1411 #else
1412         x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1413 #endif
1414         tmp3 = _SIMD_MUL(h1_imag, q3);
1415 #ifdef __ELPA_USE_FMA__
1416         x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1417 #else
1418         x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1419 #endif
1420 
1421         tmp4 = _SIMD_MUL(h1_imag, q4);
1422 #ifdef __ELPA_USE_FMA__
1423         x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1424 #else
1425         x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1426 #endif
1427         tmp5 = _SIMD_MUL(h1_imag, q5);
1428 #ifdef __ELPA_USE_FMA__
1429         x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1430 #else
1431         x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1432 #endif
1433         tmp6 = _SIMD_MUL(h1_imag, q6);
1434 #ifdef __ELPA_USE_FMA__
1435         x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1436 #else
1437         x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1438 #endif
1439 
1440 #ifdef BLOCK2
1441 
1442 #if VEC_SET == SSE_128
1443 #ifdef DOUBLE_PRECISION_COMPLEX
1444           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
1445           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
1446 #endif
1447 #ifdef SINGLE_PRECISION_COMPLEX
1448           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
1449           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
1450 #endif
1451 #endif /* VEC_SET == SSE_128 */
1452 
1453 #if VEC_SET == AVX_256
1454           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
1455           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
1456 #endif /* VEC_SET == AVX_256 */
1457 
1458 #if VEC_SET == AVX_512
1459           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
1460           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
1461 #endif /* VEC_SET == AVX_512 */
1462 
1463 
1464 #ifndef __ELPA_USE_FMA__
1465           // conjugate
1466           h2_imag = _SIMD_XOR(h2_imag, sign);
1467 #endif
1468 
1469           tmp1 = _SIMD_MUL(h2_imag, q1);
1470 #ifdef __ELPA_USE_FMA__
1471           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1472 #else
1473           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1474 #endif
1475           tmp2 = _SIMD_MUL(h2_imag, q2);
1476 #ifdef __ELPA_USE_FMA__
1477           y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1478 #else
1479           y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1480 #endif
1481 
1482           tmp3 = _SIMD_MUL(h2_imag, q3);
1483 #ifdef __ELPA_USE_FMA__
1484           y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1485 #else
1486           y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1487 #endif
1488           tmp4 = _SIMD_MUL(h2_imag, q4);
1489 #ifdef __ELPA_USE_FMA__
1490           y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1491 #else
1492           y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1493 #endif
1494 
1495           tmp5 = _SIMD_MUL(h2_imag, q5);
1496 #ifdef __ELPA_USE_FMA__
1497           y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1498 #else
1499           y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1500 #endif
1501           tmp6 = _SIMD_MUL(h2_imag, q6);
1502 #ifdef __ELPA_USE_FMA__
1503           y6 = _SIMD_ADD(y6, _SIMD_FMSUBADD(h2_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1504 #else
1505           y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1506 #endif
1507 
1508 #endif /* BLOCK2 */
1509 
1510     }
1511 
1512 #ifdef BLOCK2
1513 
1514 #if VEC_SET == SSE_128
1515 #ifdef DOUBLE_PRECISION_COMPLEX
1516      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
1517      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
1518 #endif
1519 #ifdef SINGLE_PRECISION_COMPLEX
1520      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
1521      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
1522 #endif
1523 #endif /* VEC_SET == SSE_128 */
1524 
1525 #if VEC_SET == AVX_256
1526      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
1527      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
1528 #endif /* VEC_SET == AVX_256 */
1529 
1530 #if VEC_SET == AVX_512
1531      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
1532      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
1533 #endif /* VEC_SET == AVX_512 */
1534 
1535 #ifndef __ELPA_USE_FMA__
1536      // conjugate
1537      h1_imag = _SIMD_XOR(h1_imag, sign);
1538 #endif
1539 
1540      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
1541      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
1542      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
1543      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
1544      q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
1545      q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]);
1546 
1547      tmp1 = _SIMD_MUL(h1_imag, q1);
1548 #ifdef __ELPA_USE_FMA__
1549      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1550 #else
1551      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1552 #endif
1553      tmp2 = _SIMD_MUL(h1_imag, q2);
1554 #ifdef __ELPA_USE_FMA__
1555      x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1556 #else
1557      x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1558 #endif
1559 
1560      tmp3 = _SIMD_MUL(h1_imag, q3);
1561 #ifdef __ELPA_USE_FMA__
1562      x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1563 #else
1564      x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1565 #endif
1566      tmp4 = _SIMD_MUL(h1_imag, q4);
1567 #ifdef __ELPA_USE_FMA__
1568      x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1569 #else
1570      x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1571 #endif
1572 
1573      tmp5 = _SIMD_MUL(h1_imag, q5);
1574 #ifdef __ELPA_USE_FMA__
1575      x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1576 #else
1577      x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1578 #endif
1579      tmp6 = _SIMD_MUL(h1_imag, q6);
1580 #ifdef __ELPA_USE_FMA__
1581      x6 = _SIMD_ADD(x6, _SIMD_FMSUBADD(h1_real, q6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1582 #else
1583      x6 = _SIMD_ADD(x6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1584 #endif
1585 
1586 #endif /* BLOCK2 */
1587 
1588 #if VEC_SET == SSE_128
1589 #ifdef DOUBLE_PRECISION_COMPLEX
1590     h1_real = _mm_loaddup_pd(&hh_dbl[0]);
1591     h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
1592 #endif
1593 #ifdef SINGLE_PRECISION_COMPLEX
1594     h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
1595     h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
1596 #endif
1597 #endif /*  VEC_SET == SSE_128 */
1598 
1599 #if VEC_SET == AVX_256
1600     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
1601     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
1602 #endif /* VEC_SET == AVX_256 */
1603 
1604 #if VEC_SET == AVX_512
1605     h1_real = _SIMD_SET1(hh_dbl[0]);
1606     h1_imag = _SIMD_SET1(hh_dbl[1]);
1607 
1608 #ifdef HAVE_AVX512_XEON_PHI
1609 #ifdef DOUBLE_PRECISION_COMPLEX
1610         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1611         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1612 #endif
1613 #ifdef SINGLE_PRECISION_COMPLEX
1614         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
1615         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
1616 #endif
1617 #endif
1618 #ifdef HAVE_AVX512_XEON
1619 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
1620         h1_real = _SIMD_XOR(h1_real, sign);
1621         h1_imag = _SIMD_XOR(h1_imag, sign);
1622 #endif
1623 #endif
1624 
1625 #endif /* VEC_SET == AVX_512 */
1626 
1627 #if VEC_SET != AVX_512
1628     h1_real = _SIMD_XOR(h1_real, sign);
1629     h1_imag = _SIMD_XOR(h1_imag, sign);
1630 #endif /* VEC_SET != AVX_512 */
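    // hh[0] (presumably the Householder scalar tau of the first vector) was
    // loaded into h1_real/h1_imag and sign-flipped above, so the products below
    // scale x by -tau; x then carries the coefficient of the rank-1 update
    // q <- q - tau * v * (v^H q) used when q is updated further down.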
1631 
1632     tmp1 = _SIMD_MUL(h1_imag, x1);
1633 #ifdef __ELPA_USE_FMA__
1634     x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1635 #else
1636     x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1637 #endif
1638     tmp2 = _SIMD_MUL(h1_imag, x2);
1639 #ifdef __ELPA_USE_FMA__
1640     x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1641 #else
1642     x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1643 #endif
1644     tmp3 = _SIMD_MUL(h1_imag, x3);
1645 #ifdef __ELPA_USE_FMA__
1646     x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1647 #else
1648     x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1649 #endif
1650 
1651     tmp4 = _SIMD_MUL(h1_imag, x4);
1652 #ifdef __ELPA_USE_FMA__
1653     x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1654 #else
1655     x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1656 #endif
1657     tmp5 = _SIMD_MUL(h1_imag, x5);
1658 #ifdef __ELPA_USE_FMA__
1659     x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1660 #else
1661     x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1662 #endif
1663     tmp6 = _SIMD_MUL(h1_imag, x6);
1664 #ifdef __ELPA_USE_FMA__
1665     x6 = _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1666 #else
1667     x6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1668 #endif
1669 
1670 #ifdef BLOCK2
1671 
1672 #if VEC_SET == SSE_128
1673 #ifdef DOUBLE_PRECISION_COMPLEX
1674      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
1675      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
1676 #endif
1677 #ifdef SINGLE_PRECISION_COMPLEX
1678      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
1679      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
1680 #endif
1681 
1682 #ifdef DOUBLE_PRECISION_COMPLEX
1683      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
1684      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
1685 #endif
1686 #ifdef SINGLE_PRECISION_COMPLEX
1687      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
1688      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
1689 #endif
1690 #endif /* VEC_SET == 128 */
1691 
1692 #if VEC_SET == AVX_256
1693      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
1694      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
1695      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
1696      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
1697 #endif /* VEC_SET == AVX_256 */
1698 
1699 #if VEC_SET == AVX_512
1700      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
1701      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
1702      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
1703      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
1704 
1705 #ifdef HAVE_AVX512_XEON_PHI
1706 
1707 #ifdef DOUBLE_PRECISION_COMPLEX
1708      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
1709      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
1710 #endif
1711 #ifdef SINGLE_PRECISION_COMPLEX
1712      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
1713      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
1714 #endif
1715 
1716 #ifdef DOUBLE_PRECISION_COMPLEX
1717      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
1718      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
1719 #endif
1720 #ifdef SINGLE_PRECISION_COMPLEX
1721      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
1722      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
1723 #endif
1724 #endif /* HAVE_AVX512_XEON_PHI */
1725 
1726 #ifdef HAVE_AVX512_XEON
1727 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
1728         h1_real = _SIMD_XOR(h1_real, sign);
1729         h1_imag = _SIMD_XOR(h1_imag, sign);
1730         h2_real = _SIMD_XOR(h2_real, sign);
1731         h2_imag = _SIMD_XOR(h2_imag, sign);
1732 #endif
1733 #endif
1734 #endif /* VEC_SET == AVX_512 */
1735 
1736 #if VEC_SET != AVX_512
1737      h1_real = _SIMD_XOR(h1_real, sign);
1738      h1_imag = _SIMD_XOR(h1_imag, sign);
1739      h2_real = _SIMD_XOR(h2_real, sign);
1740      h2_imag = _SIMD_XOR(h2_imag, sign);
1741 #endif /* VEC_SET != AVX_512 */
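     /* XOR-ing with the sign mask flips the sign of both the real and the imaginary
        part, i.e. h1 and h2 now hold the negated scalars; the AVX-512 Xeon/Xeon-Phi
        branches above achieve the same with integer xor on the raw bit patterns. */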
1742 
1743 #if VEC_SET == SSE_128
1744 #ifdef SINGLE_PRECISION_COMPLEX
1745      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
1746 #else
1747      tmp2 = _SIMD_LOADU(s_dbl);
1748 #endif
1749 #endif /* VEC_SET == SSE_128 */
1750 
1751 #if VEC_SET == AVX_256
1752 #ifdef DOUBLE_PRECISION_COMPLEX
1753      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1754 #endif
1755 #ifdef SINGLE_PRECISION_COMPLEX
1756      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
1757                              s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
1758 #endif
1759 #endif /* VEC_SET == AVX_256 */
1760 
1761 #if VEC_SET == AVX_512
1762 #ifdef DOUBLE_PRECISION_COMPLEX
1763      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
1764                         s_dbl[1], s_dbl[0],
1765                         s_dbl[1], s_dbl[0],
1766                         s_dbl[1], s_dbl[0]);
1767 #endif
1768 #ifdef SINGLE_PRECISION_COMPLEX
1769      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
1770 #endif
1771 #endif /* VEC_SET == AVX_512 */
1772 
1773      tmp1 = _SIMD_MUL(h2_imag, tmp2);
1774 #ifdef __ELPA_USE_FMA__
1775      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1776 #else
1777      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1778 #endif
1779 
1780 #if VEC_SET == AVX_512
1781      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
1782 
1783      h2_real = _SIMD_SET1(s_dbl[0]);
1784      h2_imag = _SIMD_SET1(s_dbl[1]);
1785 #endif /* VEC_SET == AVX_512 */
1786 
1787 #if VEC_SET == SSE_128
1788 #ifdef DOUBLE_PRECISION_COMPLEX
1789      h2_real = _mm_movedup_pd(tmp2);
1790      h2_imag = _mm_set1_pd(tmp2[1]);
1791 #endif
1792 #ifdef SINGLE_PRECISION_COMPLEX
1793      h2_real = _mm_moveldup_ps(tmp2);
1794      h2_imag = _mm_movehdup_ps(tmp2);
1795 #endif
1796 #endif /* VEC_SET == SSE_128 */
1797 
1798 #if VEC_SET == AVX_256
1799      h2_real = _SIMD_SET1(tmp2[0]);
1800      h2_imag = _SIMD_SET1(tmp2[1]);
1801 #endif /* VEC_SET == AVX_256 */
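     /* h1 and h2 were both loaded with hh[ldh] (apparently the second Householder
        scalar tau2) and sign-flipped; tmp2 is the packed product (-tau2)*s, and its
        (re, im) pair is broadcast back into h2_real/h2_imag, so that below y is
        effectively formed as -tau2 * (y + s * x). */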
1802 
1803      tmp1 = _SIMD_MUL(h1_imag, y1);
1804 #ifdef __ELPA_USE_FMA__
1805      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1806 #else
1807      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
1808 #endif
1809      tmp2 = _SIMD_MUL(h1_imag, y2);
1810 #ifdef __ELPA_USE_FMA__
1811      y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1812 #else
1813      y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
1814 #endif
1815 
1816      tmp3 = _SIMD_MUL(h1_imag, y3);
1817 #ifdef __ELPA_USE_FMA__
1818      y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1819 #else
1820      y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
1821 #endif
1822      tmp4 = _SIMD_MUL(h1_imag, y4);
1823 #ifdef __ELPA_USE_FMA__
1824      y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1825 #else
1826      y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
1827 #endif
1828 
1829      tmp5 = _SIMD_MUL(h1_imag, y5);
1830 #ifdef __ELPA_USE_FMA__
1831      y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1832 #else
1833      y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
1834 #endif
1835      tmp6 = _SIMD_MUL(h1_imag, y6);
1836 #ifdef __ELPA_USE_FMA__
1837      y6 = _SIMD_FMADDSUB(h1_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1838 #else
1839      y6 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE));
1840 #endif
1841 
1842      tmp1 = _SIMD_MUL(h2_imag, x1);
1843 #ifdef __ELPA_USE_FMA__
1844      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1845 #else
1846      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1847 #endif
1848      tmp2 = _SIMD_MUL(h2_imag, x2);
1849 #ifdef __ELPA_USE_FMA__
1850      y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1851 #else
1852      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1853 #endif
1854 
1855      tmp3 = _SIMD_MUL(h2_imag, x3);
1856 #ifdef __ELPA_USE_FMA__
1857      y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1858 #else
1859      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1860 #endif
1861      tmp4 = _SIMD_MUL(h2_imag, x4);
1862 #ifdef __ELPA_USE_FMA__
1863      y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1864 #else
1865      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1866 #endif
1867 
1868      tmp5 = _SIMD_MUL(h2_imag, x5);
1869 #ifdef __ELPA_USE_FMA__
1870      y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1871 #else
1872      y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1873 #endif
1874      tmp6 = _SIMD_MUL(h2_imag, x6);
1875 #ifdef __ELPA_USE_FMA__
1876      y6 = _SIMD_ADD(y6, _SIMD_FMADDSUB(h2_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1877 #else
1878      y6 = _SIMD_ADD(y6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1879 #endif
1880 
1881 #endif /* BLOCK2 */
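    /* x1..x6 now hold the contribution of the first Householder vector and, in the
       two-vector (BLOCK2) case, y1..y6 the contribution of the second; the remainder
       of the kernel streams through q and adds these back row by row. */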
1882 
1883     q1 = _SIMD_LOAD(&q_dbl[0]);
1884     q2 = _SIMD_LOAD(&q_dbl[offset]);
1885     q3 = _SIMD_LOAD(&q_dbl[2*offset]);
1886     q4 = _SIMD_LOAD(&q_dbl[3*offset]);
1887     q5 = _SIMD_LOAD(&q_dbl[4*offset]);
1888     q6 = _SIMD_LOAD(&q_dbl[5*offset]);
1889 
1890 #ifdef BLOCK1
1891     q1 = _SIMD_ADD(q1, x1);
1892     q2 = _SIMD_ADD(q2, x2);
1893     q3 = _SIMD_ADD(q3, x3);
1894     q4 = _SIMD_ADD(q4, x4);
1895     q5 = _SIMD_ADD(q5, x5);
1896     q6 = _SIMD_ADD(q6, x6);
1897 #endif
1898 
1899 
1900 #ifdef BLOCK2
1901     q1 = _SIMD_ADD(q1, y1);
1902     q2 = _SIMD_ADD(q2, y2);
1903     q3 = _SIMD_ADD(q3, y3);
1904     q4 = _SIMD_ADD(q4, y4);
1905     q5 = _SIMD_ADD(q5, y5);
1906     q6 = _SIMD_ADD(q6, y6);
1907 #endif
1908 
1909     _SIMD_STORE(&q_dbl[0], q1);
1910     _SIMD_STORE(&q_dbl[offset], q2);
1911     _SIMD_STORE(&q_dbl[2*offset], q3);
1912     _SIMD_STORE(&q_dbl[3*offset], q4);
1913     _SIMD_STORE(&q_dbl[4*offset], q5);
1914     _SIMD_STORE(&q_dbl[5*offset], q6);
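    /* The first row of q receives only x (BLOCK1) or y (BLOCK2); in the BLOCK2 branch
       below the second row (offset ldq) additionally receives x plus hh[ldh+1] * y. */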
1915 
1916 
1917 #ifdef BLOCK2
1918 
1919 #if VEC_SET == SSE_128
1920 #ifdef DOUBLE_PRECISION_COMPLEX
1921      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
1922      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
1923 #endif
1924 #ifdef SINGLE_PRECISION_COMPLEX
1925      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
1926      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
1927 #endif
1928 #endif /* VEC_SET == SSE_128 */
1929 
1930 #if VEC_SET == AVX_256
1931      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
1932      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
1933 #endif /* VEC_SET == AVX_256 */
1934 
1935 #if VEC_SET == AVX_512
1936      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
1937      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
1938 #endif /* VEC_SET == AVX_512 */
1939 
1940      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
1941      q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
1942      q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
1943      q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
1944      q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]);
1945      q6 = _SIMD_LOAD(&q_dbl[(ldq*2)+5*offset]);
1946 
1947      q1 = _SIMD_ADD(q1, x1);
1948      q2 = _SIMD_ADD(q2, x2);
1949      q3 = _SIMD_ADD(q3, x3);
1950      q4 = _SIMD_ADD(q4, x4);
1951      q5 = _SIMD_ADD(q5, x5);
1952      q6 = _SIMD_ADD(q6, x6);
1953 
1954      tmp1 = _SIMD_MUL(h2_imag, y1);
1955 #ifdef __ELPA_USE_FMA__
1956      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1957 #else
1958      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
1959 #endif
1960      tmp2 = _SIMD_MUL(h2_imag, y2);
1961 #ifdef __ELPA_USE_FMA__
1962      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1963 #else
1964      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
1965 #endif
1966 
1967      tmp3 = _SIMD_MUL(h2_imag, y3);
1968 #ifdef __ELPA_USE_FMA__
1969      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1970 #else
1971      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
1972 #endif
1973      tmp4 = _SIMD_MUL(h2_imag, y4);
1974 #ifdef __ELPA_USE_FMA__
1975      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1976 #else
1977      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
1978 #endif
1979 
1980      tmp5 = _SIMD_MUL(h2_imag, y5);
1981 #ifdef __ELPA_USE_FMA__
1982      q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1983 #else
1984      q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
1985 #endif
1986      tmp6 = _SIMD_MUL(h2_imag, y6);
1987 #ifdef __ELPA_USE_FMA__
1988      q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1989 #else
1990      q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
1991 #endif
1992 
1993      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
1994      _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
1995      _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
1996      _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
1997      _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5);
1998      _SIMD_STORE(&q_dbl[(ldq*2)+5*offset], q6);
1999 
2000 #endif /* BLOCK2 */
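    /* Main loop over the remaining rows: row i of q is updated with hh[i-BLOCK+1] * x
       and, for BLOCK2, additionally with hh[ldh+i] * y. */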
2001 
2002 
2003     for (i = BLOCK; i < nb; i++)
2004     {
2005 
2006 #if VEC_SET == SSE_128
2007 #ifdef DOUBLE_PRECISION_COMPLEX
2008         h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2009         h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2010 #endif
2011 #ifdef SINGLE_PRECISION_COMPLEX
2012         h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2013         h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2014 #endif
2015 #endif /* VEC_SET == SSE_128 */
2016 
2017 #if VEC_SET == AVX_256
2018         h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2019         h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2020 #endif /* VEC_SET == AVX_256 */
2021 
2022 #if VEC_SET == AVX_512
2023         h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2024         h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2025 #endif /* VEC_SET == AVX_512 */
2026 
2027         q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2028         q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2029         q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2030         q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2031         q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2032         q6 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
2033 
2034         tmp1 = _SIMD_MUL(h1_imag, x1);
2035 #ifdef __ELPA_USE_FMA__
2036         q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2037 #else
2038         q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2039 #endif
2040         tmp2 = _SIMD_MUL(h1_imag, x2);
2041 #ifdef __ELPA_USE_FMA__
2042         q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2043 #else
2044         q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2045 #endif
2046         tmp3 = _SIMD_MUL(h1_imag, x3);
2047 #ifdef __ELPA_USE_FMA__
2048         q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2049 #else
2050         q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2051 #endif
2052 
2053          tmp4 = _SIMD_MUL(h1_imag, x4);
2054 #ifdef __ELPA_USE_FMA__
2055          q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2056 #else
2057          q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2058 #endif
2059          tmp5 = _SIMD_MUL(h1_imag, x5);
2060 #ifdef __ELPA_USE_FMA__
2061          q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2062 #else
2063          q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2064 #endif
2065          tmp6 = _SIMD_MUL(h1_imag, x6);
2066 #ifdef __ELPA_USE_FMA__
2067          q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2068 #else
2069          q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2070 #endif
2071 
2072 #ifdef BLOCK2
2073 
2074 #if VEC_SET == SSE_128
2075 #ifdef DOUBLE_PRECISION_COMPLEX
2076           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
2077           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
2078 #endif
2079 #ifdef SINGLE_PRECISION_COMPLEX
2080           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
2081           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
2082 #endif
2083 #endif /* VEC_SET == SSE_128 */
2084 
2085 #if VEC_SET == AVX_256
2086           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
2087           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
2088 #endif /* VEC_SET == AVX_256 */
2089 
2090 #if VEC_SET == AVX_512
2091           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
2092           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
2093 #endif /* VEC_SET == AVX_512 */
2094 
2095           tmp1 = _SIMD_MUL(h2_imag, y1);
2096 #ifdef __ELPA_USE_FMA__
2097           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2098 #else
2099           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2100 #endif
2101           tmp2 = _SIMD_MUL(h2_imag, y2);
2102 #ifdef __ELPA_USE_FMA__
2103           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2104 #else
2105           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2106 #endif
2107 
2108           tmp3 = _SIMD_MUL(h2_imag, y3);
2109 #ifdef __ELPA_USE_FMA__
2110           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2111 #else
2112           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2113 #endif
2114           tmp4 = _SIMD_MUL(h2_imag, y4);
2115 #ifdef __ELPA_USE_FMA__
2116           q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2117 #else
2118           q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2119 #endif
2120 
2121           tmp5 = _SIMD_MUL(h2_imag, y5);
2122 #ifdef __ELPA_USE_FMA__
2123           q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2124 #else
2125           q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2126 #endif
2127           tmp6 = _SIMD_MUL(h2_imag, y6);
2128 #ifdef __ELPA_USE_FMA__
2129           q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h2_real, y6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2130 #else
2131           q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2132 #endif
2133 
2134 #endif /* BLOCK2 */
2135 
2136 
2137          _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
2138          _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
2139          _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
2140          _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
2141          _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
2142          _SIMD_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
2143     }
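    /* BLOCK2 tail: only the first Householder vector reaches the last row
       (2*nb*ldq), so that row is updated with hh[nb-1] * x alone. */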
2144 #ifdef BLOCK2
2145 
2146 #if VEC_SET == SSE_128
2147 #ifdef DOUBLE_PRECISION_COMPLEX
2148      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
2149      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
2150 #endif
2151 #ifdef SINGLE_PRECISION_COMPLEX
2152      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
2153      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
2154 #endif
2155 #endif /* VEC_SET == SSE_128 */
2156 
2157 #if VEC_SET == AVX_256
2158      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
2159      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
2160 #endif /* VEC_SET == AVX_256 */
2161 
2162 #if VEC_SET == AVX_512
2163      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
2164      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
2165 #endif /* VEC_SET == AVX_512 */
2166 
2167      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
2168      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
2169      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
2170      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
2171      q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
2172      q6 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+5*offset]);
2173 
2174      tmp1 = _SIMD_MUL(h1_imag, x1);
2175 #ifdef __ELPA_USE_FMA__
2176      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2177 #else
2178      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2179 #endif
2180      tmp2 = _SIMD_MUL(h1_imag, x2);
2181 #ifdef __ELPA_USE_FMA__
2182      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2183 #else
2184      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2185 #endif
2186 
2187      tmp3 = _SIMD_MUL(h1_imag, x3);
2188 #ifdef __ELPA_USE_FMA__
2189      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2190 #else
2191      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2192 #endif
2193      tmp4 = _SIMD_MUL(h1_imag, x4);
2194 #ifdef __ELPA_USE_FMA__
2195      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2196 #else
2197      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2198 #endif
2199 
2200      tmp5 = _SIMD_MUL(h1_imag, x5);
2201 #ifdef __ELPA_USE_FMA__
2202      q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2203 #else
2204      q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2205 #endif
2206      tmp6 = _SIMD_MUL(h1_imag, x6);
2207 #ifdef __ELPA_USE_FMA__
2208      q6 = _SIMD_ADD(q6, _SIMD_FMADDSUB(h1_real, x6, _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2209 #else
2210      q6 = _SIMD_ADD(q6, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x6), _SIMD_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
2211 #endif
2212 
2213      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
2214      _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
2215      _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
2216      _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
2217      _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5);
2218      _SIMD_STORE(&q_dbl[(2*nb*ldq)+5*offset], q6);
2219 
2220 #endif /* BLOCK2 */
2221 
2222 }
2223 
2224 
2225 #if VEC_SET == SSE_128
2226 #ifdef DOUBLE_PRECISION_COMPLEX
2227 #define ROW_LENGTH 5
2228 #endif
2229 #ifdef SINGLE_PRECISION_COMPLEX
2230 #define ROW_LENGTH 10
2231 #endif
2232 #endif /* VEC_SET == SSE_128 */
2233 
2234 #if VEC_SET == AVX_256
2235 #ifdef DOUBLE_PRECISION_COMPLEX
2236 #define ROW_LENGTH 10
2237 #endif
2238 #ifdef SINGLE_PRECISION_COMPLEX
2239 #define ROW_LENGTH 20
2240 #endif
2241 #endif /* VEC_SET == AVX_256 */
2242 
2243 #if VEC_SET == AVX_512
2244 #ifdef DOUBLE_PRECISION_COMPLEX
2245 #define ROW_LENGTH 20
2246 #endif
2247 #ifdef SINGLE_PRECISION_COMPLEX
2248 #define ROW_LENGTH 40
2249 #endif
2250 #endif /* VEC_SET == AVX_512 */
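/* ROW_LENGTH is the number of complex q entries handled per call, i.e. the number of
   SIMD registers per row (five in this variant, six in the previous one) times the
   complex lanes per register for the given instruction set and precision. */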
2251 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
2252 #ifdef BLOCK1
2253 		)
2254 #endif
2255 #ifdef BLOCK2
2256                 ,int ldh, DATA_TYPE s)
2257 #endif
2258 {
2259 
2260     DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
2261     DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
2262 #ifdef BLOCK2
2263     DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
2264 #endif
2265 
2266     __SIMD_DATATYPE x1, x2, x3, x4, x5;
2267     __SIMD_DATATYPE q1, q2, q3, q4, q5;
2268 #ifdef BLOCK2
2269     __SIMD_DATATYPE y1, y2, y3, y4, y5;
2270     __SIMD_DATATYPE h2_real, h2_imag;
2271 #endif
2272     __SIMD_DATATYPE h1_real, h1_imag;
2273     __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4, tmp5;
2274     int i=0;
2275 
2276 #if VEC_SET == SSE_128
2277 #ifdef DOUBLE_PRECISION_COMPLEX
2278     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
2279 #endif
2280 #ifdef SINGLE_PRECISION_COMPLEX
2281     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
2282 #endif
2283 #endif /* VEC_SET == SSE_128 */
2284 
2285 #if VEC_SET == AVX_256
2286 #ifdef DOUBLE_PRECISION_COMPLEX
2287         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
2288 #endif
2289 #ifdef SINGLE_PRECISION_COMPLEX
2290         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
2291 #endif
2292 #endif /* VEC_SET == AVX_256 */
2293 
2294 #if VEC_SET == AVX_512
2295 #ifdef DOUBLE_PRECISION_COMPLEX
2296         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
2297 #endif
2298 #ifdef SINGLE_PRECISION_COMPLEX
2299         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
2300 #endif
2301 #endif /* VEC_SET == AVX_512 */
2302 
2303 #ifdef BLOCK2
2304      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
2305      x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
2306      x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
2307      x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
2308      x5 = _SIMD_LOAD(&q_dbl[(2*ldq)+4*offset]);
2309 
2310 #if VEC_SET == SSE_128
2311 #ifdef DOUBLE_PRECISION_COMPLEX
2312      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
2313      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
2314 #endif
2315 #ifdef SINGLE_PRECISION_COMPLEX
2316      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
2317      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
2318 #endif
2319 #endif /* VEC_SET == SSE_128 */
2320 
2321 #if VEC_SET == AVX_256
2322      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
2323      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
2324 #endif /* VEC_SET == AVX_256 */
2325 
2326 #if VEC_SET == AVX_512
2327      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
2328      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
2329 #endif /*  VEC_SET == AVX_512 */
2330 
2331 #ifndef __ELPA_USE_FMA__
2332      // conjugate
2333      h2_imag = _SIMD_XOR(h2_imag, sign);
2334 #endif
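     /* Without FMA the imaginary part of h2 is negated up front, so the plain
        ADDSUB form below yields conj(h2) * x; with FMA the FMSUBADD variant
        computes the conjugated product directly. */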
2335 
2336      y1 = _SIMD_LOAD(&q_dbl[0]);
2337      y2 = _SIMD_LOAD(&q_dbl[offset]);
2338      y3 = _SIMD_LOAD(&q_dbl[2*offset]);
2339      y4 = _SIMD_LOAD(&q_dbl[3*offset]);
2340      y5 = _SIMD_LOAD(&q_dbl[4*offset]);
2341 
2342      tmp1 = _SIMD_MUL(h2_imag, x1);
2343 #ifdef __ELPA_USE_FMA__
2344      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2345 #else
2346      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2347 #endif
2348      tmp2 = _SIMD_MUL(h2_imag, x2);
2349 #ifdef __ELPA_USE_FMA__
2350      y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2351 #else
2352      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2353 #endif
2354 
2355      tmp3 = _SIMD_MUL(h2_imag, x3);
2356 #ifdef __ELPA_USE_FMA__
2357      y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2358 #else
2359      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2360 #endif
2361      tmp4 = _SIMD_MUL(h2_imag, x4);
2362 #ifdef __ELPA_USE_FMA__
2363      y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2364 #else
2365      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2366 #endif
2367 
2368      tmp5 = _SIMD_MUL(h2_imag, x5);
2369 #ifdef __ELPA_USE_FMA__
2370      y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2371 #else
2372      y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2373 #endif
2374 
2375 #endif /* BLOCK2 */
2376 
2377 #ifdef BLOCK1
2378     x1 = _SIMD_LOAD(&q_dbl[0]);
2379     x2 = _SIMD_LOAD(&q_dbl[offset]);
2380     x3 = _SIMD_LOAD(&q_dbl[2*offset]);
2381     x4 = _SIMD_LOAD(&q_dbl[3*offset]);
2382     x5 = _SIMD_LOAD(&q_dbl[4*offset]);
2383 #endif
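    /* The loop below accumulates the projections of q onto the Householder vector(s):
       x += conj(hh[i-BLOCK+1]) * q(i,:) and, for BLOCK2, y += conj(hh[ldh+i]) * q(i,:). */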
2384 
2385     for (i = BLOCK; i < nb; i++)
2386     {
2387 
2388 #if VEC_SET == SSE_128
2389 #ifdef DOUBLE_PRECISION_COMPLEX
2390         h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2391         h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2392 #endif
2393 #ifdef SINGLE_PRECISION_COMPLEX
2394         h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2395         h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2396 #endif
2397 #endif /* VEC_SET == SSE_128 */
2398 
2399 #if VEC_SET == AVX_256
2400        h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2401        h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2402 #endif /* VEC_SET == AVX_256 */
2403 
2404 #if VEC_SET == AVX_512
2405        h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2406        h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2407 #endif /* VEC_SET == AVX_512 */
2408 
2409 #ifndef __ELPA_USE_FMA__
2410         // conjugate
2411         h1_imag = _SIMD_XOR(h1_imag, sign);
2412 #endif
2413 
2414         q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2415         q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2416         q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2417         q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2418         q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2419 
2420         tmp1 = _SIMD_MUL(h1_imag, q1);
2421 
2422 #ifdef __ELPA_USE_FMA__
2423         x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2424 #else
2425         x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2426 #endif
2427         tmp2 = _SIMD_MUL(h1_imag, q2);
2428 #ifdef __ELPA_USE_FMA__
2429         x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2430 #else
2431         x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2432 #endif
2433         tmp3 = _SIMD_MUL(h1_imag, q3);
2434 #ifdef __ELPA_USE_FMA__
2435         x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2436 #else
2437         x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2438 #endif
2439 
2440         tmp4 = _SIMD_MUL(h1_imag, q4);
2441 #ifdef __ELPA_USE_FMA__
2442         x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2443 #else
2444         x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2445 #endif
2446         tmp5 = _SIMD_MUL(h1_imag, q5);
2447 #ifdef __ELPA_USE_FMA__
2448         x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2449 #else
2450         x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2451 #endif
2452 
2453 #ifdef BLOCK2
2454 
2455 #if VEC_SET == SSE_128
2456 #ifdef DOUBLE_PRECISION_COMPLEX
2457           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
2458           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
2459 #endif
2460 #ifdef SINGLE_PRECISION_COMPLEX
2461           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
2462           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
2463 #endif
2464 #endif /* VEC_SET == SSE_128 */
2465 
2466 #if VEC_SET == AVX_256
2467           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
2468           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
2469 #endif /* VEC_SET == AVX_256 */
2470 
2471 #if VEC_SET == AVX_512
2472           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
2473           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
2474 #endif /* VEC_SET == AVX_512 */
2475 
2476 #ifndef __ELPA_USE_FMA__
2477           // conjugate
2478           h2_imag = _SIMD_XOR(h2_imag, sign);
2479 #endif
2480 
2481           tmp1 = _SIMD_MUL(h2_imag, q1);
2482 #ifdef __ELPA_USE_FMA__
2483           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2484 #else
2485           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2486 #endif
2487           tmp2 = _SIMD_MUL(h2_imag, q2);
2488 #ifdef __ELPA_USE_FMA__
2489           y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2490 #else
2491           y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2492 #endif
2493 
2494           tmp3 = _SIMD_MUL(h2_imag, q3);
2495 #ifdef __ELPA_USE_FMA__
2496           y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2497 #else
2498           y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2499 #endif
2500           tmp4 = _SIMD_MUL(h2_imag, q4);
2501 #ifdef __ELPA_USE_FMA__
2502           y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2503 #else
2504           y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2505 #endif
2506 
2507           tmp5 = _SIMD_MUL(h2_imag, q5);
2508 #ifdef __ELPA_USE_FMA__
2509           y5 = _SIMD_ADD(y5, _SIMD_FMSUBADD(h2_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2510 #else
2511           y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2512 #endif
2513 
2514 #endif /* BLOCK2 */
2515 
2516     }
2517 
2518 #ifdef BLOCK2
2519 
2520 #if VEC_SET == SSE_128
2521 #ifdef DOUBLE_PRECISION_COMPLEX
2522      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
2523      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
2524 #endif
2525 #ifdef SINGLE_PRECISION_COMPLEX
2526      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
2527      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
2528 #endif
2529 #endif /* VEC_SET == SSE_128 */
2530 
2531 #if VEC_SET == AVX_256
2532      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
2533      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
2534 #endif /* VEC_SET == AVX_256 */
2535 
2536 #if VEC_SET == AVX_512
2537      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
2538      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
2539 #endif /* VEC_SET == AVX_512 */
2540 
2541 #ifndef __ELPA_USE_FMA__
2542      // conjugate
2543      h1_imag = _SIMD_XOR(h1_imag, sign);
2544 #endif
2545 
2546      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
2547      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
2548      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
2549      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
2550      q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
2551 
2552      tmp1 = _SIMD_MUL(h1_imag, q1);
2553 #ifdef __ELPA_USE_FMA__
2554      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2555 #else
2556      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2557 #endif
2558      tmp2 = _SIMD_MUL(h1_imag, q2);
2559 #ifdef __ELPA_USE_FMA__
2560      x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2561 #else
2562      x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2563 #endif
2564 
2565      tmp3 = _SIMD_MUL(h1_imag, q3);
2566 #ifdef __ELPA_USE_FMA__
2567      x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2568 #else
2569      x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2570 #endif
2571      tmp4 = _SIMD_MUL(h1_imag, q4);
2572 #ifdef __ELPA_USE_FMA__
2573      x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2574 #else
2575      x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2576 #endif
2577 
2578      tmp5 = _SIMD_MUL(h1_imag, q5);
2579 #ifdef __ELPA_USE_FMA__
2580      x5 = _SIMD_ADD(x5, _SIMD_FMSUBADD(h1_real, q5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2581 #else
2582      x5 = _SIMD_ADD(x5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2583 #endif
2584 
2585 #endif /* BLOCK2 */
2586 
2587 #if VEC_SET == SSE_128
2588 #ifdef DOUBLE_PRECISION_COMPLEX
2589     h1_real = _mm_loaddup_pd(&hh_dbl[0]);
2590     h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
2591 #endif
2592 #ifdef SINGLE_PRECISION_COMPLEX
2593     h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
2594     h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
2595 #endif
2596 #endif /*  VEC_SET == SSE_128 */
2597 
2598 #if VEC_SET == AVX_256
2599     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
2600     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
2601 #endif /* VEC_SET == AVX_256 */
2602 
2603 #if VEC_SET == AVX_512
2604     h1_real = _SIMD_SET1(hh_dbl[0]);
2605     h1_imag = _SIMD_SET1(hh_dbl[1]);
2606 
2607 #ifdef HAVE_AVX512_XEON_PHI
2608 #ifdef DOUBLE_PRECISION_COMPLEX
2609         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
2610         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
2611 #endif
2612 #ifdef SINGLE_PRECISION_COMPLEX
2613         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
2614         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
2615 #endif
2616 #endif
2617 #ifdef HAVE_AVX512_XEON
2618 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
2619         h1_real = _SIMD_XOR(h1_real, sign);
2620         h1_imag = _SIMD_XOR(h1_imag, sign);
2621 #endif
2622 #endif
2623 
2624 #endif /* VEC_SET == AVX_512 */
2625 
2626 #if VEC_SET != AVX_512
2627     h1_real = _SIMD_XOR(h1_real, sign);
2628     h1_imag = _SIMD_XOR(h1_imag, sign);
2629 #endif /* VEC_SET != AVX_512 */
2630 
2631     tmp1 = _SIMD_MUL(h1_imag, x1);
2632 #ifdef __ELPA_USE_FMA__
2633     x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2634 #else
2635     x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2636 #endif
2637     tmp2 = _SIMD_MUL(h1_imag, x2);
2638 #ifdef __ELPA_USE_FMA__
2639     x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2640 #else
2641     x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2642 #endif
2643     tmp3 = _SIMD_MUL(h1_imag, x3);
2644 #ifdef __ELPA_USE_FMA__
2645     x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2646 #else
2647     x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2648 #endif
2649 
2650     tmp4 = _SIMD_MUL(h1_imag, x4);
2651 #ifdef __ELPA_USE_FMA__
2652     x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2653 #else
2654     x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2655 #endif
2656     tmp5 = _SIMD_MUL(h1_imag, x5);
2657 #ifdef __ELPA_USE_FMA__
2658     x5 = _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2659 #else
2660     x5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2661 #endif
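    /* x1..x5 now hold the accumulated projection scaled by the negated hh[0]
       (presumably -tau1); the BLOCK2 branch below applies the analogous scaling to y,
       folding in the overlap s, before both are written back into q. */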
2662 
2663 #ifdef BLOCK2
2664 
2665 #if VEC_SET == SSE_128
2666 #ifdef DOUBLE_PRECISION_COMPLEX
2667      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
2668      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
2669 #endif
2670 #ifdef SINGLE_PRECISION_COMPLEX
2671      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
2672      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
2673 #endif
2674 
2675 #ifdef DOUBLE_PRECISION_COMPLEX
2676      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
2677      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
2678 #endif
2679 #ifdef SINGLE_PRECISION_COMPLEX
2680      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
2681      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
2682 #endif
2683 #endif /* VEC_SET == SSE_128 */
2684 
2685 #if VEC_SET == AVX_256
2686      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
2687      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
2688      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
2689      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
2690 #endif /* VEC_SET == AVX_256 */
2691 
2692 #if VEC_SET == AVX_512
2693      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
2694      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
2695      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
2696      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
2697 
2698 #ifdef HAVE_AVX512_XEON_PHI
2699 
2700 #ifdef DOUBLE_PRECISION_COMPLEX
2701      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
2702      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
2703 #endif
2704 #ifdef SINGLE_PRECISION_COMPLEX
2705      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
2706      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
2707 #endif
2708 
2709 #ifdef DOUBLE_PRECISION_COMPLEX
2710      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
2711      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
2712 #endif
2713 #ifdef SINGLE_PRECISION_COMPLEX
2714      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
2715      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
2716 #endif
2717 #endif /* HAVE_AVX512_XEON_PHI */
2718 
2719 #ifdef HAVE_AVX512_XEON
2720 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
2721         h1_real = _SIMD_XOR(h1_real, sign);
2722         h1_imag = _SIMD_XOR(h1_imag, sign);
2723         h2_real = _SIMD_XOR(h2_real, sign);
2724         h2_imag = _SIMD_XOR(h2_imag, sign);
2725 #endif
2726 #endif
2727 #endif /* VEC_SET == AVX_512 */
2728 
2729 #if VEC_SET != AVX_512
2730      h1_real = _SIMD_XOR(h1_real, sign);
2731      h1_imag = _SIMD_XOR(h1_imag, sign);
2732      h2_real = _SIMD_XOR(h2_real, sign);
2733      h2_imag = _SIMD_XOR(h2_imag, sign);
2734 #endif /* VEC_SET != AVX_512 */
2735 
2736 #if VEC_SET == SSE_128
2737 #ifdef SINGLE_PRECISION_COMPLEX
2738      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
2739 #else
2740      tmp2 = _SIMD_LOADU(s_dbl);
2741 #endif
2742 #endif /* VEC_SET == SSE_128 */
2743 
2744 #if VEC_SET == AVX_256
2745 #ifdef DOUBLE_PRECISION_COMPLEX
2746      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
2747 #endif
2748 #ifdef SINGLE_PRECISION_COMPLEX
2749      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
2750                              s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
2751 #endif
2752 #endif /* VEC_SET == AVX_256 */
2753 
2754 #if VEC_SET == AVX_512
2755 #ifdef DOUBLE_PRECISION_COMPLEX
2756      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
2757                         s_dbl[1], s_dbl[0],
2758                         s_dbl[1], s_dbl[0],
2759                         s_dbl[1], s_dbl[0]);
2760 #endif
2761 #ifdef SINGLE_PRECISION_COMPLEX
2762      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
2763 #endif
2764 #endif /* VEC_SET == AVX_512 */
2765 
2766      tmp1 = _SIMD_MUL(h2_imag, tmp2);
2767 #ifdef __ELPA_USE_FMA__
2768      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2769 #else
2770      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2771 #endif
2772 
2773 #if VEC_SET == AVX_512
2774      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
2775 
2776      h2_real = _SIMD_SET1(s_dbl[0]);
2777      h2_imag = _SIMD_SET1(s_dbl[1]);
2778 #endif /* VEC_SET == AVX_512 */
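     /* On AVX-512 the masked store (mask 0x01 + 0x02) writes only the low (re, im)
        pair of tmp2 back into s_dbl, from which the scaled s is re-broadcast; the
        SSE/AVX2 branches below pick that pair straight out of tmp2. */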
2779 
2780 #if VEC_SET == SSE_128
2781 #ifdef DOUBLE_PRECISION_COMPLEX
2782      h2_real = _mm_movedup_pd(tmp2);
2783      h2_imag = _mm_set1_pd(tmp2[1]);
2784 #endif
2785 #ifdef SINGLE_PRECISION_COMPLEX
2786      h2_real = _mm_moveldup_ps(tmp2);
2787      h2_imag = _mm_movehdup_ps(tmp2);
2788 #endif
2789 #endif /* VEC_SET == SSE_128 */
2790 
2791 #if VEC_SET == AVX_256
2792      h2_real = _SIMD_SET1(tmp2[0]);
2793      h2_imag = _SIMD_SET1(tmp2[1]);
2794 #endif /* VEC_SET == AVX_256 */
2795      tmp1 = _SIMD_MUL(h1_imag, y1);
2796 #ifdef __ELPA_USE_FMA__
2797      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2798 #else
2799      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
2800 #endif
2801      tmp2 = _SIMD_MUL(h1_imag, y2);
2802 #ifdef __ELPA_USE_FMA__
2803      y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2804 #else
2805      y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
2806 #endif
2807 
2808      tmp3 = _SIMD_MUL(h1_imag, y3);
2809 #ifdef __ELPA_USE_FMA__
2810      y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2811 #else
2812      y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
2813 #endif
2814      tmp4 = _SIMD_MUL(h1_imag, y4);
2815 #ifdef __ELPA_USE_FMA__
2816      y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2817 #else
2818      y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
2819 #endif
2820 
2821      tmp5 = _SIMD_MUL(h1_imag, y5);
2822 #ifdef __ELPA_USE_FMA__
2823      y5 = _SIMD_FMADDSUB(h1_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2824 #else
2825      y5 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE));
2826 #endif
2827 
2828      tmp1 = _SIMD_MUL(h2_imag, x1);
2829 #ifdef __ELPA_USE_FMA__
2830      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2831 #else
2832      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2833 #endif
2834      tmp2 = _SIMD_MUL(h2_imag, x2);
2835 #ifdef __ELPA_USE_FMA__
2836      y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2837 #else
2838      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2839 #endif
2840 
2841      tmp3 = _SIMD_MUL(h2_imag, x3);
2842 #ifdef __ELPA_USE_FMA__
2843      y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2844 #else
2845      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2846 #endif
2847      tmp4 = _SIMD_MUL(h2_imag, x4);
2848 #ifdef __ELPA_USE_FMA__
2849      y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2850 #else
2851      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2852 #endif
2853 
2854      tmp5 = _SIMD_MUL(h2_imag, x5);
2855 #ifdef __ELPA_USE_FMA__
2856      y5 = _SIMD_ADD(y5, _SIMD_FMADDSUB(h2_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2857 #else
2858      y5 = _SIMD_ADD(y5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2859 #endif
2860 
2861 #endif /* BLOCK2 */
2862 
2863     q1 = _SIMD_LOAD(&q_dbl[0]);
2864     q2 = _SIMD_LOAD(&q_dbl[offset]);
2865     q3 = _SIMD_LOAD(&q_dbl[2*offset]);
2866     q4 = _SIMD_LOAD(&q_dbl[3*offset]);
2867     q5 = _SIMD_LOAD(&q_dbl[4*offset]);
2868 
2869 #ifdef BLOCK1
2870     q1 = _SIMD_ADD(q1, x1);
2871     q2 = _SIMD_ADD(q2, x2);
2872     q3 = _SIMD_ADD(q3, x3);
2873     q4 = _SIMD_ADD(q4, x4);
2874     q5 = _SIMD_ADD(q5, x5);
2875 #endif
2876 
2877 
2878 #ifdef BLOCK2
2879     q1 = _SIMD_ADD(q1, y1);
2880     q2 = _SIMD_ADD(q2, y2);
2881     q3 = _SIMD_ADD(q3, y3);
2882     q4 = _SIMD_ADD(q4, y4);
2883     q5 = _SIMD_ADD(q5, y5);
2884 #endif
2885     _SIMD_STORE(&q_dbl[0], q1);
2886     _SIMD_STORE(&q_dbl[offset], q2);
2887     _SIMD_STORE(&q_dbl[2*offset], q3);
2888     _SIMD_STORE(&q_dbl[3*offset], q4);
2889     _SIMD_STORE(&q_dbl[4*offset], q5);
2890 
2891 
2892 #ifdef BLOCK2
2893 
2894 #if VEC_SET == SSE_128
2895 #ifdef DOUBLE_PRECISION_COMPLEX
2896      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
2897      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
2898 #endif
2899 #ifdef SINGLE_PRECISION_COMPLEX
2900      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
2901      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
2902 #endif
2903 #endif /* VEC_SET == SSE_128 */
2904 
2905 #if VEC_SET == AVX_256
2906      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
2907      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
2908 #endif /* VEC_SET == AVX_256 */
2909 
2910 #if VEC_SET == AVX_512
2911      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
2912      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
2913 #endif /* VEC_SET == AVX_512 */
2914 
2915      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
2916      q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
2917      q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
2918      q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
2919      q5 = _SIMD_LOAD(&q_dbl[(ldq*2)+4*offset]);
2920 
2921      q1 = _SIMD_ADD(q1, x1);
2922      q2 = _SIMD_ADD(q2, x2);
2923      q3 = _SIMD_ADD(q3, x3);
2924      q4 = _SIMD_ADD(q4, x4);
2925      q5 = _SIMD_ADD(q5, x5);
2926 
2927      tmp1 = _SIMD_MUL(h2_imag, y1);
2928 
2929 #ifdef __ELPA_USE_FMA__
2930      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2931 #else
2932      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
2933 #endif
2934      tmp2 = _SIMD_MUL(h2_imag, y2);
2935 #ifdef __ELPA_USE_FMA__
2936      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2937 #else
2938      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
2939 #endif
2940 
2941      tmp3 = _SIMD_MUL(h2_imag, y3);
2942 #ifdef __ELPA_USE_FMA__
2943      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2944 #else
2945      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
2946 #endif
2947      tmp4 = _SIMD_MUL(h2_imag, y4);
2948 #ifdef __ELPA_USE_FMA__
2949      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2950 #else
2951      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
2952 #endif
2953 
2954      tmp5 = _SIMD_MUL(h2_imag, y5);
2955 #ifdef __ELPA_USE_FMA__
2956      q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2957 #else
2958      q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
2959 #endif
2960 
2961      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
2962      _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
2963      _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
2964      _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
2965      _SIMD_STORE(&q_dbl[(ldq*2)+4*offset], q5);
2966 
2967 #endif /* BLOCK2 */
2968 
2969 
2970     for (i = BLOCK; i < nb; i++)
2971     {
2972 
2973 #if VEC_SET == SSE_128
2974 #ifdef DOUBLE_PRECISION_COMPLEX
2975         h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
2976         h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
2977 #endif
2978 #ifdef SINGLE_PRECISION_COMPLEX
2979         h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
2980         h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
2981 #endif
2982 #endif /* VEC_SET == SSE_128 */
2983 
2984 #if VEC_SET == AVX_256
2985         h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
2986         h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
2987 #endif /* VEC_SET == AVX_256 */
2988 
2989 #if VEC_SET == AVX_512
2990         h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
2991         h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
2992 #endif /* VEC_SET == AVX_512 */
2993 
2994         q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
2995         q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
2996         q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
2997         q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
2998         q5 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
2999 
3000         tmp1 = _SIMD_MUL(h1_imag, x1);
3001 #ifdef __ELPA_USE_FMA__
3002         q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3003 #else
3004         q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3005 #endif
3006         tmp2 = _SIMD_MUL(h1_imag, x2);
3007 #ifdef __ELPA_USE_FMA__
3008         q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3009 #else
3010         q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3011 #endif
3012         tmp3 = _SIMD_MUL(h1_imag, x3);
3013 #ifdef __ELPA_USE_FMA__
3014         q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3015 #else
3016         q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3017 #endif
3018 
3019          tmp4 = _SIMD_MUL(h1_imag, x4);
3020 #ifdef __ELPA_USE_FMA__
3021          q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3022 #else
3023          q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3024 #endif
3025          tmp5 = _SIMD_MUL(h1_imag, x5);
3026 #ifdef __ELPA_USE_FMA__
3027          q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3028 #else
3029          q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3030 #endif
3031 
3032 #ifdef BLOCK2
3033 
3034 #if VEC_SET == SSE_128
3035 #ifdef DOUBLE_PRECISION_COMPLEX
3036           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3037           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3038 #endif
3039 #ifdef SINGLE_PRECISION_COMPLEX
3040           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3041           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3042 #endif
3043 #endif /* VEC_SET == SSE_128 */
3044 
3045 #if VEC_SET == AVX_256
3046           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3047           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3048 #endif /* VEC_SET == AVX_256 */
3049 
3050 #if VEC_SET == AVX_512
3051           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3052           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3053 #endif /* VEC_SET == AVX_512 */
3054 
3055           tmp1 = _SIMD_MUL(h2_imag, y1);
3056 #ifdef __ELPA_USE_FMA__
3057           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3058 #else
3059           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3060 #endif
3061           tmp2 = _SIMD_MUL(h2_imag, y2);
3062 #ifdef __ELPA_USE_FMA__
3063           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3064 #else
3065           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3066 #endif
3067 
3068           tmp3 = _SIMD_MUL(h2_imag, y3);
3069 #ifdef __ELPA_USE_FMA__
3070           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3071 #else
3072           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3073 #endif
3074           tmp4 = _SIMD_MUL(h2_imag, y4);
3075 #ifdef __ELPA_USE_FMA__
3076           q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3077 #else
3078           q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3079 #endif
3080 
3081           tmp5 = _SIMD_MUL(h2_imag, y5);
3082 #ifdef __ELPA_USE_FMA__
3083           q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h2_real, y5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3084 #else
3085           q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3086 #endif
3087 
3088 #endif /* BLOCK2 */
3089 
3090          _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
3091          _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
3092          _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
3093          _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
3094          _SIMD_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
3095     }
3096 #ifdef BLOCK2
3097 
3098 #if VEC_SET == SSE_128
3099 #ifdef DOUBLE_PRECISION_COMPLEX
3100      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3101      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3102 #endif
3103 #ifdef SINGLE_PRECISION_COMPLEX
3104      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3105      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3106 #endif
3107 #endif /* VEC_SET == SSE_128 */
3108 
3109 #if VEC_SET == AVX_256
3110      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3111      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3112 #endif /* VEC_SET == AVX_256 */
3113 
3114 #if VEC_SET == AVX_512
3115      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3116      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3117 #endif /* VEC_SET == AVX_512 */
3118 
3119      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3120      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3121      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3122      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3123      q5 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+4*offset]);
3124 
3125      tmp1 = _SIMD_MUL(h1_imag, x1);
3126 #ifdef __ELPA_USE_FMA__
3127      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3128 #else
3129      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3130 #endif
3131      tmp2 = _SIMD_MUL(h1_imag, x2);
3132 #ifdef __ELPA_USE_FMA__
3133      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3134 #else
3135      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3136 #endif
3137 
3138      tmp3 = _SIMD_MUL(h1_imag, x3);
3139 #ifdef __ELPA_USE_FMA__
3140      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3141 #else
3142      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3143 #endif
3144      tmp4 = _SIMD_MUL(h1_imag, x4);
3145 #ifdef __ELPA_USE_FMA__
3146      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3147 #else
3148      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3149 #endif
3150 
3151      tmp5 = _SIMD_MUL(h1_imag, x5);
3152 #ifdef __ELPA_USE_FMA__
3153      q5 = _SIMD_ADD(q5, _SIMD_FMADDSUB(h1_real, x5, _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3154 #else
3155      q5 = _SIMD_ADD(q5, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x5), _SIMD_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
3156 #endif
3157 
3158      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
3159      _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
3160      _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
3161      _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
3162      _SIMD_STORE(&q_dbl[(2*nb*ldq)+4*offset], q5);
3163 
3164 #endif /* BLOCK2 */
3165 
3166 }
3167 
3168 #if VEC_SET == SSE_128
3169 #ifdef DOUBLE_PRECISION_COMPLEX
3170 #define ROW_LENGTH 4
3171 #endif
3172 #ifdef SINGLE_PRECISION_COMPLEX
3173 #define ROW_LENGTH 8
3174 #endif
3175 #endif /* VEC_SET == SSE_128 */
3176 
3177 #if VEC_SET == AVX_256
3178 #ifdef DOUBLE_PRECISION_COMPLEX
3179 #define ROW_LENGTH 8
3180 #endif
3181 #ifdef SINGLE_PRECISION_COMPLEX
3182 #define ROW_LENGTH 16
3183 #endif
3184 #endif /* VEC_SET == AVX_256 */
3185 
3186 #if VEC_SET == AVX_512
3187 #ifdef DOUBLE_PRECISION_COMPLEX
3188 #define ROW_LENGTH 16
3189 #endif
3190 #ifdef SINGLE_PRECISION_COMPLEX
3191 #define ROW_LENGTH 32
3192 #endif
3193 #endif /* VEC_SET == AVX_512 */
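/*
 * Kernel variant that updates ROW_LENGTH complex entries of q per row
 * (four SIMD registers per row block).  With BLOCK1 a single Householder
 * reflector hh of length nb is applied to q; with BLOCK2 two reflectors are
 * applied (the second one starts at hh + ldh) together with the scalar s
 * that couples them in the rank-2 update.
 *
 * The complex arithmetic below relies on the interleaved (re,im) layout of
 * q_dbl/hh_dbl: a sequence of the form
 *     tmp = _SIMD_MUL(h_imag, y);
 *     q   = _SIMD_ADD(q, _SIMD_FMADDSUB(h_real, y, _SIMD_SHUFFLE(tmp, tmp, _SHUFFLE)));
 * accumulates q += h*y for every complex pair, while the FMSUBADD form (or
 * the XOR-with-sign fallback used without FMA) accumulates q += conj(h)*y.
 * A minimal scalar sketch of that idiom is kept below, disabled and purely
 * illustrative (the helper name is not part of ELPA):
 */
#if 0
static void complex_axpy_reference(double _Complex *q, const double _Complex *y,
                                   double _Complex h, int n)
{
  /* Scalar equivalent of one FMADDSUB update: q[j] += h*y[j], i.e. for each
     interleaved pair  re += hr*yr - hi*yi  and  im += hr*yi + hi*yr.        */
  int j;
  for (j = 0; j < n; j++)
    q[j] += h * y[j];
}
#endif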
3194 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
3195 #ifdef BLOCK1
3196 		)
3197 #endif
3198 #ifdef BLOCK2
3199                 ,int ldh, DATA_TYPE s)
3200 #endif
3201 {
3202     DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
3203     DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
3204 #ifdef BLOCK2
3205     DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
3206 #endif
3207 
3208     __SIMD_DATATYPE x1, x2, x3, x4;
3209     __SIMD_DATATYPE q1, q2, q3, q4;
3210 #ifdef BLOCK2
3211     __SIMD_DATATYPE y1, y2, y3, y4;
3212     __SIMD_DATATYPE h2_real, h2_imag;
3213 #endif
3214     __SIMD_DATATYPE h1_real, h1_imag;
3215     __SIMD_DATATYPE tmp1, tmp2, tmp3, tmp4;
3216     int i=0;
3217 
3218 #if VEC_SET == SSE_128
3219 #ifdef DOUBLE_PRECISION_COMPLEX
3220     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
3221 #endif
3222 #ifdef SINGLE_PRECISION_COMPLEX
3223      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
3224 #endif
3225 #endif /* VEC_SET == SSE_128 */
3226 
3227 #if VEC_SET == AVX_256
3228 #ifdef DOUBLE_PRECISION_COMPLEX
3229         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
3230 #endif
3231 #ifdef SINGLE_PRECISION_COMPLEX
3232         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
3233 #endif
3234 #endif /* VEC_SET == AVX_256 */
3235 
3236 #if VEC_SET == AVX_512
3237 #ifdef DOUBLE_PRECISION_COMPLEX
3238         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
3239 #endif
3240 #ifdef SINGLE_PRECISION_COMPLEX
3241         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
3242 #endif
3243 #endif /* VEC_SET == AVX_512 */
3244 
3245 #ifdef BLOCK2
3246      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
3247      x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
3248      x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
3249      x4 = _SIMD_LOAD(&q_dbl[(2*ldq)+3*offset]);
3250 
3251 #if VEC_SET == SSE_128
3252 #ifdef DOUBLE_PRECISION_COMPLEX
3253      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
3254      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
3255 #endif
3256 #ifdef SINGLE_PRECISION_COMPLEX
3257      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
3258      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
3259 #endif
3260 #endif /* VEC_SET == SSE_128 */
3261 
3262 #if VEC_SET == AVX_256
3263      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
3264      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
3265 #endif /* VEC_SET == AVX_256 */
3266 
3267 #if VEC_SET == AVX_512
3268      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
3269      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
3270 #endif /*  VEC_SET == AVX_512 */
3271 
3272 #ifndef __ELPA_USE_FMA__
3273      // conjugate
3274      h2_imag = _SIMD_XOR(h2_imag, sign);
3275 #endif
3276 
3277      y1 = _SIMD_LOAD(&q_dbl[0]);
3278      y2 = _SIMD_LOAD(&q_dbl[offset]);
3279      y3 = _SIMD_LOAD(&q_dbl[2*offset]);
3280      y4 = _SIMD_LOAD(&q_dbl[3*offset]);
3281 
3282      tmp1 = _SIMD_MUL(h2_imag, x1);
3283 #ifdef __ELPA_USE_FMA__
3284      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3285 #else
3286      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3287 #endif
3288 
3289      tmp2 = _SIMD_MUL(h2_imag, x2);
3290 #ifdef __ELPA_USE_FMA__
3291      y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3292 #else
3293      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3294 #endif
3295 
3296      tmp3 = _SIMD_MUL(h2_imag, x3);
3297 #ifdef __ELPA_USE_FMA__
3298      y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3299 #else
3300      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3301 #endif
3302 
3303      tmp4 = _SIMD_MUL(h2_imag, x4);
3304 #ifdef __ELPA_USE_FMA__
3305      y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3306 #else
3307      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3308 #endif
3309 
3310 #endif /* BLOCK2 */
3311 
3312 #ifdef BLOCK1
3313      x1 = _SIMD_LOAD(&q_dbl[0]);
3314      x2 = _SIMD_LOAD(&q_dbl[offset]);
3315      x3 = _SIMD_LOAD(&q_dbl[2*offset]);
3316      x4 = _SIMD_LOAD(&q_dbl[3*offset]);
3317 #endif
3318 
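     /* Reduction over the remaining rows of q: x1..x4 accumulate
        sum_i conj(hh[i]) * q(i,:) for the first reflector; with BLOCK2,
        y1..y4 accumulate the analogous sums for the second reflector. */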
3319      for (i = BLOCK; i < nb; i++)
3320      {
3321 #if VEC_SET == SSE_128
3322 #ifdef DOUBLE_PRECISION_COMPLEX
3323           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
3324           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
3325 #endif
3326 #ifdef SINGLE_PRECISION_COMPLEX
3327           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
3328           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
3329 #endif
3330 #endif /* VEC_SET == SSE_128 */
3331 
3332 #if VEC_SET == AVX_256
3333           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
3334           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
3335 #endif /* VEC_SET == AVX_256 */
3336 
3337 #if VEC_SET == AVX_512
3338           h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
3339           h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
3340 #endif /* VEC_SET == AVX_512 */
3341 
3342 #ifndef __ELPA_USE_FMA__
3343           // conjugate
3344           h1_imag = _SIMD_XOR(h1_imag, sign);
3345 #endif
3346 
3347           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
3348           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
3349           q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
3350           q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
3351 
3352           tmp1 = _SIMD_MUL(h1_imag, q1);
3353 
3354 #ifdef __ELPA_USE_FMA__
3355           x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3356 #else
3357           x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3358 #endif
3359 
3360           tmp2 = _SIMD_MUL(h1_imag, q2);
3361 #ifdef __ELPA_USE_FMA__
3362           x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3363 #else
3364           x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3365 #endif
3366 
3367           tmp3 = _SIMD_MUL(h1_imag, q3);
3368 #ifdef __ELPA_USE_FMA__
3369           x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3370 #else
3371           x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3372 #endif
3373           tmp4 = _SIMD_MUL(h1_imag, q4);
3374 #ifdef __ELPA_USE_FMA__
3375           x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3376 #else
3377           x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3378 #endif
3379 
3380 #ifdef BLOCK2
3381 
3382 #if VEC_SET == SSE_128
3383 #ifdef DOUBLE_PRECISION_COMPLEX
3384           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3385           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3386 #endif
3387 #ifdef SINGLE_PRECISION_COMPLEX
3388           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3389           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3390 #endif
3391 #endif /* VEC_SET == SSE_128 */
3392 
3393 #if VEC_SET == AVX_256
3394           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3395           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3396 #endif /* VEC_SET == AVX_256 */
3397 
3398 #if VEC_SET == AVX_512
3399           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3400           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3401 #endif /* VEC_SET == AVX_512 */
3402 
3403 #ifndef __ELPA_USE_FMA__
3404           // conjugate
3405           h2_imag = _SIMD_XOR(h2_imag, sign);
3406 #endif
3407 
3408           tmp1 = _SIMD_MUL(h2_imag, q1);
3409 #ifdef __ELPA_USE_FMA__
3410           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3411 #else
3412           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3413 #endif
3414           tmp2 = _SIMD_MUL(h2_imag, q2);
3415 #ifdef __ELPA_USE_FMA__
3416           y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3417 #else
3418           y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3419 #endif
3420 
3421           tmp3 = _SIMD_MUL(h2_imag, q3);
3422 #ifdef __ELPA_USE_FMA__
3423           y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3424 #else
3425           y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3426 #endif
3427           tmp4 = _SIMD_MUL(h2_imag, q4);
3428 #ifdef __ELPA_USE_FMA__
3429           y4 = _SIMD_ADD(y4, _SIMD_FMSUBADD(h2_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3430 #else
3431           y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3432 #endif
3433 #endif /* BLOCK2 */
3434      }
3435 
3436 #ifdef BLOCK2
3437 
3438 #if VEC_SET == SSE_128
3439 #ifdef DOUBLE_PRECISION_COMPLEX
3440      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3441      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3442 #endif
3443 #ifdef SINGLE_PRECISION_COMPLEX
3444      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3445      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3446 #endif
3447 #endif /* VEC_SET == SSE_128 */
3448 
3449 #if VEC_SET == AVX_256
3450      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3451      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3452 #endif /* VEC_SET == AVX_256 */
3453 
3454 #if VEC_SET == AVX_512
3455      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3456      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3457 #endif /* VEC_SET == AVX_512 */
3458 
3459 #ifndef __ELPA_USE_FMA__
3460      // conjugate
3461      h1_imag = _SIMD_XOR(h1_imag, sign);
3462 #endif
3463 
3464      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3465      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3466      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3467      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3468 
3469      tmp1 = _SIMD_MUL(h1_imag, q1);
3470 #ifdef __ELPA_USE_FMA__
3471      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3472 #else
3473      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3474 #endif
3475 
3476      tmp2 = _SIMD_MUL(h1_imag, q2);
3477 #ifdef __ELPA_USE_FMA__
3478      x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3479 #else
3480      x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3481 #endif
3482 
3483      tmp3 = _SIMD_MUL(h1_imag, q3);
3484 #ifdef __ELPA_USE_FMA__
3485      x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3486 #else
3487      x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3488 #endif
3489 
3490      tmp4 = _SIMD_MUL(h1_imag, q4);
3491 #ifdef __ELPA_USE_FMA__
3492      x4 = _SIMD_ADD(x4, _SIMD_FMSUBADD(h1_real, q4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3493 #else
3494      x4 = _SIMD_ADD(x4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3495 #endif
3496 
3497 #endif /* BLOCK2 */
3498 
3499 #if VEC_SET == SSE_128
3500 #ifdef DOUBLE_PRECISION_COMPLEX
3501      h1_real = _mm_loaddup_pd(&hh_dbl[0]);
3502      h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
3503 #endif
3504 #ifdef SINGLE_PRECISION_COMPLEX
3505      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
3506      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
3507 #endif
3508 #endif /* VEC_SET == SSE_128 */
3509 
3510 #if VEC_SET == AVX_256
3511     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
3512     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
3513 #endif /* VEC_SET == AVX_256 */
3514 
3515 #if VEC_SET == AVX_512
3516     h1_real = _SIMD_SET1(hh_dbl[0]);
3517     h1_imag = _SIMD_SET1(hh_dbl[1]);
3518 
3519 #ifdef HAVE_AVX512_XEON_PHI
3520 #ifdef DOUBLE_PRECISION_COMPLEX
3521         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
3522         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
3523 #endif
3524 #ifdef SINGLE_PRECISION_COMPLEX
3525         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
3526         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
3527 #endif
3528 #endif /* HAVE_AVX512_XEON_PHI */
3529 #ifdef HAVE_AVX512_XEON
3530 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
3531         h1_real = _SIMD_XOR(h1_real, sign);
3532         h1_imag = _SIMD_XOR(h1_imag, sign);
3533 #endif
3534 #endif /* HAVE_AVX512_XEON */
3535 
3536 #endif /* VEC_SET == AVX_512 */
3537 
3538 #if VEC_SET != AVX_512
3539      h1_real = _SIMD_XOR(h1_real, sign);
3540      h1_imag = _SIMD_XOR(h1_imag, sign);
3541 #endif /* VEC_SET != AVX_512 */
3542 
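     /* Scale the accumulated projections by minus the leading element of hh
        (the reflector's tau factor, negated above via the sign mask):
        x := -hh[0] * x. */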
3543      tmp1 = _SIMD_MUL(h1_imag, x1);
3544 #ifdef __ELPA_USE_FMA__
3545      x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3546 #else
3547      x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3548 #endif
3549 
3550      tmp2 = _SIMD_MUL(h1_imag, x2);
3551 #ifdef __ELPA_USE_FMA__
3552      x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3553 #else
3554      x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3555 #endif
3556 
3557      tmp3 = _SIMD_MUL(h1_imag, x3);
3558 #ifdef __ELPA_USE_FMA__
3559      x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3560 #else
3561      x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3562 #endif
3563 
3564      tmp4 = _SIMD_MUL(h1_imag, x4);
3565 #ifdef __ELPA_USE_FMA__
3566      x4 = _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3567 #else
3568      x4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3569 #endif
3570 
3571 #ifdef BLOCK2
3572 
3573 #if VEC_SET == SSE_128
3574 #ifdef DOUBLE_PRECISION_COMPLEX
3575      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
3576      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
3577 #endif
3578 #ifdef SINGLE_PRECISION_COMPLEX
3579      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
3580      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
3581 #endif
3582 
3583 #ifdef DOUBLE_PRECISION_COMPLEX
3584      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
3585      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
3586 #endif
3587 #ifdef SINGLE_PRECISION_COMPLEX
3588      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
3589      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
3590 #endif
3591 #endif /* VEC_SET == SSE_128 */
3592 
3593 #if VEC_SET == AVX_256
3594      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
3595      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
3596      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
3597      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
3598 #endif /* VEC_SET == AVX_256 */
3599 
3600 #if VEC_SET == AVX_512
3601      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
3602      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
3603      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
3604      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
3605 
3606 #ifdef HAVE_AVX512_XEON_PHI
3607 
3608 #ifdef DOUBLE_PRECISION_COMPLEX
3609      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
3610      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
3611 #endif
3612 #ifdef SINGLE_PRECISION_COMPLEX
3613      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
3614      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
3615 #endif
3616 
3617 #ifdef DOUBLE_PRECISION_COMPLEX
3618      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
3619      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
3620 #endif
3621 #ifdef SINGLE_PRECISION_COMPLEX
3622      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
3623      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
3624 #endif
3625 #endif /* HAVE_AVX512_XEON_PHI */
3626 
3627 #ifdef HAVE_AVX512_XEON
3628 #if defined(DOUBLE_PRECISION_COMPLEX) || defined(SINGLE_PRECISION_COMPLEX)
3629         h1_real = _SIMD_XOR(h1_real, sign);
3630         h1_imag = _SIMD_XOR(h1_imag, sign);
3631         h2_real = _SIMD_XOR(h2_real, sign);
3632         h2_imag = _SIMD_XOR(h2_imag, sign);
3633 #endif
3634 #endif /* HAVE_AVX512_XEON */
3635 #endif /* VEC_SET == AVX_512 */
3636 
3637 #if VEC_SET != AVX_512
3638      h1_real = _SIMD_XOR(h1_real, sign);
3639      h1_imag = _SIMD_XOR(h1_imag, sign);
3640      h2_real = _SIMD_XOR(h2_real, sign);
3641      h2_imag = _SIMD_XOR(h2_imag, sign);
3642 #endif /* VEC_SET != AVX_512 */
3643 
3644 #if VEC_SET == SSE_128
3645 #ifdef SINGLE_PRECISION_COMPLEX
3646      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
3647 #else
3648      tmp2 = _SIMD_LOADU(s_dbl);
3649 #endif
3650 #endif /* VEC_SET == SSE_128 */
3651 
3652 #if VEC_SET == AVX_256
3653 #ifdef DOUBLE_PRECISION_COMPLEX
3654      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
3655 #endif
3656 #ifdef SINGLE_PRECISION_COMPLEX
3657      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
3658                              s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
3659 #endif
3660 #endif /* VEC_SET == AVX_256 */
3661 
3662 #if VEC_SET == AVX_512
3663 #ifdef DOUBLE_PRECISION_COMPLEX
3664      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
3665                         s_dbl[1], s_dbl[0],
3666                         s_dbl[1], s_dbl[0],
3667                         s_dbl[1], s_dbl[0]);
3668 #endif
3669 #ifdef SINGLE_PRECISION_COMPLEX
3670      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
3671 #endif
3672 #endif /* VEC_SET == AVX_512 */
3673 
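     /* BLOCK2 coupling: multiply s by -hh[ldh] (loaded and negated above) and
        re-broadcast the product into h2_real/h2_imag, so that the update below
        computes y := -hh[ldh]*y + (-hh[ldh]*s)*x. */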
3674      tmp1 = _SIMD_MUL(h2_imag, tmp2);
3675 #ifdef __ELPA_USE_FMA__
3676      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3677 #else
3678      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3679 #endif
3680 
3681 #if VEC_SET == AVX_512
3682      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
3683 
3684      h2_real = _SIMD_SET1(s_dbl[0]);
3685      h2_imag = _SIMD_SET1(s_dbl[1]);
3686 #endif /* VEC_SET == AVX_512 */
3687 
3688 #if VEC_SET == SSE_128
3689 #ifdef DOUBLE_PRECISION_COMPLEX
3690      h2_real = _mm_movedup_pd(tmp2);
3691      h2_imag = _mm_set1_pd(tmp2[1]);
3692 #endif
3693 #ifdef SINGLE_PRECISION_COMPLEX
3694      h2_real = _mm_moveldup_ps(tmp2);
3695      h2_imag = _mm_movehdup_ps(tmp2);
3696 #endif
3697 #endif /* VEC_SET == SSE_128 */
3698 
3699 #if VEC_SET == AVX_256
3700      h2_real = _SIMD_SET1(tmp2[0]);
3701      h2_imag = _SIMD_SET1(tmp2[1]);
3702 #endif /* VEC_SET == AVX_256 */
3703      tmp1 = _SIMD_MUL(h1_imag, y1);
3704 #ifdef __ELPA_USE_FMA__
3705      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3706 #else
3707      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
3708 #endif
3709 
3710      tmp2 = _SIMD_MUL(h1_imag, y2);
3711 #ifdef __ELPA_USE_FMA__
3712      y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3713 #else
3714      y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
3715 #endif
3716 
3717      tmp3 = _SIMD_MUL(h1_imag, y3);
3718 #ifdef __ELPA_USE_FMA__
3719      y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3720 #else
3721      y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
3722 #endif
3723 
3724      tmp4 = _SIMD_MUL(h1_imag, y4);
3725 #ifdef __ELPA_USE_FMA__
3726      y4 = _SIMD_FMADDSUB(h1_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3727 #else
3728      y4 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE));
3729 #endif
3730 
3731      tmp1 = _SIMD_MUL(h2_imag, x1);
3732 #ifdef __ELPA_USE_FMA__
3733      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3734 #else
3735      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3736 #endif
3737 
3738      tmp2 = _SIMD_MUL(h2_imag, x2);
3739 #ifdef __ELPA_USE_FMA__
3740      y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3741 #else
3742      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3743 #endif
3744 
3745      tmp3 = _SIMD_MUL(h2_imag, x3);
3746 #ifdef __ELPA_USE_FMA__
3747      y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3748 #else
3749      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3750 #endif
3751 
3752      tmp4 = _SIMD_MUL(h2_imag, x4);
3753 #ifdef __ELPA_USE_FMA__
3754      y4 = _SIMD_ADD(y4, _SIMD_FMADDSUB(h2_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3755 #else
3756      y4 = _SIMD_ADD(y4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3757 #endif
3758 
3759 #endif /* BLOCK2 */
3760 
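     /* First row of q: q(0,:) += x in the single-reflector case,
        q(0,:) += y in the two-reflector case. */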
3761      q1 = _SIMD_LOAD(&q_dbl[0]);
3762      q2 = _SIMD_LOAD(&q_dbl[offset]);
3763      q3 = _SIMD_LOAD(&q_dbl[2*offset]);
3764      q4 = _SIMD_LOAD(&q_dbl[3*offset]);
3765 
3766 #ifdef BLOCK1
3767      q1 = _SIMD_ADD(q1, x1);
3768      q2 = _SIMD_ADD(q2, x2);
3769      q3 = _SIMD_ADD(q3, x3);
3770      q4 = _SIMD_ADD(q4, x4);
3771 #endif
3772 
3773 #ifdef BLOCK2
3774      q1 = _SIMD_ADD(q1, y1);
3775      q2 = _SIMD_ADD(q2, y2);
3776      q3 = _SIMD_ADD(q3, y3);
3777      q4 = _SIMD_ADD(q4, y4);
3778 #endif
3779 
3780      _SIMD_STORE(&q_dbl[0], q1);
3781      _SIMD_STORE(&q_dbl[offset], q2);
3782      _SIMD_STORE(&q_dbl[2*offset], q3);
3783      _SIMD_STORE(&q_dbl[3*offset], q4);
3784 
3785 #ifdef BLOCK2
3786 
3787 #if VEC_SET == SSE_128
3788 #ifdef DOUBLE_PRECISION_COMPLEX
3789      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
3790      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
3791 #endif
3792 #ifdef SINGLE_PRECISION_COMPLEX
3793      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
3794      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
3795 #endif
3796 #endif /* VEC_SET == SSE_128 */
3797 
3798 #if VEC_SET == AVX_256
3799      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
3800      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
3801 #endif /* VEC_SET == AVX_256 */
3802 
3803 #if VEC_SET == AVX_512
3804      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
3805      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
3806 #endif /* VEC_SET == AVX_512 */
3807 
3808      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
3809      q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
3810      q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
3811      q4 = _SIMD_LOAD(&q_dbl[(ldq*2)+3*offset]);
3812 
3813      q1 = _SIMD_ADD(q1, x1);
3814      q2 = _SIMD_ADD(q2, x2);
3815      q3 = _SIMD_ADD(q3, x3);
3816      q4 = _SIMD_ADD(q4, x4);
3817 
3818      tmp1 = _SIMD_MUL(h2_imag, y1);
3819 #ifdef __ELPA_USE_FMA__
3820      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3821 #else
3822      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3823 #endif
3824 
3825      tmp2 = _SIMD_MUL(h2_imag, y2);
3826 #ifdef __ELPA_USE_FMA__
3827      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3828 #else
3829      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3830 #endif
3831 
3832      tmp3 = _SIMD_MUL(h2_imag, y3);
3833 #ifdef __ELPA_USE_FMA__
3834      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3835 #else
3836      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3837 #endif
3838 
3839      tmp4 = _SIMD_MUL(h2_imag, y4);
3840 #ifdef __ELPA_USE_FMA__
3841      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3842 #else
3843      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3844 #endif
3845 
3846      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
3847      _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
3848      _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
3849      _SIMD_STORE(&q_dbl[(ldq*2)+3*offset], q4);
3850 
3851 #endif /* BLOCK2 */
3852 
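     /* Rank-1 / rank-2 update of the remaining rows: q(i,:) += h1*x
        (plus h2*y with BLOCK2), with h1 and h2 re-read from hh for each row. */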
3853      for (i = BLOCK; i < nb; i++)
3854      {
3855 
3856 #if VEC_SET == SSE_128
3857 #ifdef DOUBLE_PRECISION_COMPLEX
3858           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
3859           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
3860 #endif
3861 #ifdef SINGLE_PRECISION_COMPLEX
3862           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
3863           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
3864 #endif
3865 #endif /* VEC_SET == SSE_128 */
3866 
3867 #if VEC_SET == AVX_256
3868           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
3869           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
3870 #endif /* VEC_SET == AVX_256 */
3871 
3872 #if VEC_SET == AVX_512
3873           h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
3874           h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
3875 #endif /* VEC_SET == AVX_512 */
3876 
3877           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
3878           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
3879           q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
3880           q4 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
3881 
3882           tmp1 = _SIMD_MUL(h1_imag, x1);
3883 
3884 #ifdef __ELPA_USE_FMA__
3885           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3886 #else
3887           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3888 #endif
3889           tmp2 = _SIMD_MUL(h1_imag, x2);
3890 #ifdef __ELPA_USE_FMA__
3891           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3892 #else
3893           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3894 #endif
3895 
3896           tmp3 = _SIMD_MUL(h1_imag, x3);
3897 #ifdef __ELPA_USE_FMA__
3898           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3899 #else
3900           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3901 #endif
3902           tmp4 = _SIMD_MUL(h1_imag, x4);
3903 #ifdef __ELPA_USE_FMA__
3904           q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3905 #else
3906           q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3907 #endif
3908 
3909 #ifdef BLOCK2
3910 
3911 #if VEC_SET == SSE_128
3912 #ifdef DOUBLE_PRECISION_COMPLEX
3913           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
3914           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
3915 #endif
3916 #ifdef SINGLE_PRECISION_COMPLEX
3917           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
3918           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
3919 #endif
3920 #endif /* VEC_SET == SSE_128 */
3921 
3922 #if VEC_SET == AVX_256
3923           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
3924           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
3925 #endif /* VEC_SET == AVX_256 */
3926 
3927 #if VEC_SET == AVX_512
3928           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
3929           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
3930 #endif /* VEC_SET == AVX_512 */
3931 
3932           tmp1 = _SIMD_MUL(h2_imag, y1);
3933 #ifdef __ELPA_USE_FMA__
3934           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3935 #else
3936           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3937 #endif
3938           tmp2 = _SIMD_MUL(h2_imag, y2);
3939 #ifdef __ELPA_USE_FMA__
3940           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3941 #else
3942           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
3943 #endif
3944 
3945           tmp3 = _SIMD_MUL(h2_imag, y3);
3946 #ifdef __ELPA_USE_FMA__
3947           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3948 #else
3949           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
3950 #endif
3951           tmp4 = _SIMD_MUL(h2_imag, y4);
3952 #ifdef __ELPA_USE_FMA__
3953           q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h2_real, y4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3954 #else
3955           q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
3956 #endif
3957 
3958 #endif /* BLOCK2 */
3959 
3960           _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
3961           _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
3962           _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
3963           _SIMD_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
3964 
3965      }
3966 #ifdef BLOCK2
3967 
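     /* BLOCK2 tail: the extra row at index nb only receives the contribution
        of the last element of the first reflector. */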
3968 #if VEC_SET == SSE_128
3969 #ifdef DOUBLE_PRECISION_COMPLEX
3970      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
3971      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
3972 #endif
3973 #ifdef SINGLE_PRECISION_COMPLEX
3974      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
3975      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
3976 #endif
3977 #endif /* VEC_SET == SSE_128 */
3978 
3979 #if VEC_SET == AVX_256
3980      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
3981      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
3982 #endif /* VEC_SET == AVX_256 */
3983 
3984 #if VEC_SET == AVX_512
3985      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
3986      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
3987 #endif /* VEC_SET == AVX_512 */
3988 
3989      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
3990      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
3991      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
3992      q4 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+3*offset]);
3993 
3994      tmp1 = _SIMD_MUL(h1_imag, x1);
3995 #ifdef __ELPA_USE_FMA__
3996      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3997 #else
3998      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
3999 #endif
4000      tmp2 = _SIMD_MUL(h1_imag, x2);
4001 #ifdef __ELPA_USE_FMA__
4002      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4003 #else
4004      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4005 #endif
4006 
4007      tmp3 = _SIMD_MUL(h1_imag, x3);
4008 #ifdef __ELPA_USE_FMA__
4009      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4010 #else
4011      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4012 #endif
4013      tmp4 = _SIMD_MUL(h1_imag, x4);
4014 #ifdef __ELPA_USE_FMA__
4015      q4 = _SIMD_ADD(q4, _SIMD_FMADDSUB(h1_real, x4, _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
4016 #else
4017      q4 = _SIMD_ADD(q4, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x4), _SIMD_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
4018 #endif
4019 
4020      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
4021      _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
4022      _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
4023      _SIMD_STORE(&q_dbl[(2*nb*ldq)+3*offset], q4);
4024 
4025 #endif /* BLOCK2 */
4026 }
4027 
4028 
4029 #if VEC_SET == SSE_128
4030 #ifdef DOUBLE_PRECISION_COMPLEX
4031 #define ROW_LENGTH 3
4032 #endif
4033 #ifdef SINGLE_PRECISION_COMPLEX
4034 #define ROW_LENGTH 6
4035 #endif
4036 #endif /* VEC_SET == SSE_128 */
4037 
4038 #if VEC_SET == AVX_256
4039 #ifdef DOUBLE_PRECISION_COMPLEX
4040 #define ROW_LENGTH 6
4041 #endif
4042 #ifdef SINGLE_PRECISION_COMPLEX
4043 #define ROW_LENGTH 12
4044 #endif
4045 #endif /* VEC_SET == AVX_256 */
4046 
4047 #if VEC_SET == AVX_512
4048 #ifdef DOUBLE_PRECISION_COMPLEX
4049 #define ROW_LENGTH 12
4050 #endif
4051 #ifdef SINGLE_PRECISION_COMPLEX
4052 #define ROW_LENGTH 24
4053 #endif
4054 #endif /* VEC_SET == AVX_512 */
4055 
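/*
 * Same update scheme as the kernel above, instantiated for a narrower row
 * block (three SIMD registers, i.e. the ROW_LENGTH values defined directly
 * above).
 */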
4056 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
4057 #ifdef BLOCK1
4058 		)
4059 #endif
4060 #ifdef BLOCK2
4061                 ,int ldh, DATA_TYPE s)
4062 #endif
4063 {
4064     DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
4065     DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
4066 #ifdef BLOCK2
4067     DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
4068 #endif
4069 
4070     __SIMD_DATATYPE x1, x2, x3;
4071     __SIMD_DATATYPE q1, q2, q3;
4072 #ifdef BLOCK2
4073     __SIMD_DATATYPE y1, y2, y3;
4074     __SIMD_DATATYPE h2_real, h2_imag;
4075 #endif
4076     __SIMD_DATATYPE h1_real, h1_imag;
4077     __SIMD_DATATYPE tmp1, tmp2, tmp3;
4078     int i=0;
4079 
4080 #if VEC_SET == SSE_128
4081 #ifdef DOUBLE_PRECISION_COMPLEX
4082     __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
4083 #endif
4084 #ifdef SINGLE_PRECISION_COMPLEX
4085      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
4086 #endif
4087 #endif /* VEC_SET == SSE_128 */
4088 
4089 #if VEC_SET == AVX_256
4090 #ifdef DOUBLE_PRECISION_COMPLEX
4091         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
4092 #endif
4093 #ifdef SINGLE_PRECISION_COMPLEX
4094         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
4095 #endif
4096 #endif /* VEC_SET == AVX_256 */
4097 
4098 #if VEC_SET == AVX_512
4099 #ifdef DOUBLE_PRECISION_COMPLEX
4100         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
4101 #endif
4102 #ifdef SINGLE_PRECISION_COMPLEX
4103         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
4104 #endif
4105 #endif /* VEC_SET == AVX_512 */
4106 
4107 #ifdef BLOCK2
4108      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
4109      x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
4110      x3 = _SIMD_LOAD(&q_dbl[(2*ldq)+2*offset]);
4111 
4112 #if VEC_SET == SSE_128
4113 #ifdef DOUBLE_PRECISION_COMPLEX
4114      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4115      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4116 #endif
4117 #ifdef SINGLE_PRECISION_COMPLEX
4118      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4119      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4120 #endif
4121 #endif /* VEC_SET == SSE_128 */
4122 
4123 #if VEC_SET == AVX_256
4124      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4125      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4126 #endif /* VEC_SET == AVX_256 */
4127 
4128 #if VEC_SET == AVX_512
4129      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4130      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4131 #endif /*  VEC_SET == AVX_512 */
4132 
4133 #ifndef __ELPA_USE_FMA__
4134      // conjugate
4135      h2_imag = _SIMD_XOR(h2_imag, sign);
4136 #endif
4137 
4138      y1 = _SIMD_LOAD(&q_dbl[0]);
4139      y2 = _SIMD_LOAD(&q_dbl[offset]);
4140      y3 = _SIMD_LOAD(&q_dbl[2*offset]);
4141 
4142      tmp1 = _SIMD_MUL(h2_imag, x1);
4143 #ifdef __ELPA_USE_FMA__
4144      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4145 #else
4146      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4147 #endif
4148 
4149      tmp2 = _SIMD_MUL(h2_imag, x2);
4150 #ifdef __ELPA_USE_FMA__
4151      y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4152 #else
4153      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4154 #endif
4155 
4156      tmp3 = _SIMD_MUL(h2_imag, x3);
4157 #ifdef __ELPA_USE_FMA__
4158      y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4159 #else
4160      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4161 #endif
4162 
4163 #endif /* BLOCK2 */
4164 
4165 #ifdef BLOCK1
4166      x1 = _SIMD_LOAD(&q_dbl[0]);
4167      x2 = _SIMD_LOAD(&q_dbl[offset]);
4168      x3 = _SIMD_LOAD(&q_dbl[2*offset]);
4169 #endif
4170 
4171      for (i = BLOCK; i < nb; i++)
4172      {
4173 #if VEC_SET == SSE_128
4174 #ifdef DOUBLE_PRECISION_COMPLEX
4175           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4176           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4177 #endif
4178 #ifdef SINGLE_PRECISION_COMPLEX
4179           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4180           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4181 #endif
4182 #endif /* VEC_SET == SSE_128 */
4183 
4184 #if VEC_SET == AVX_256
4185           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4186           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4187 #endif /* VEC_SET == AVX_256 */
4188 
4189 #if VEC_SET == AVX_512
4190           h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4191           h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4192 #endif /* VEC_SET == AVX_512 */
4193 
4194 #ifndef __ELPA_USE_FMA__
4195           // conjugate
4196           h1_imag = _SIMD_XOR(h1_imag, sign);
4197 #endif
4198 
4199           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4200           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4201           q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
4202 
4203           tmp1 = _SIMD_MUL(h1_imag, q1);
4204 
4205 #ifdef __ELPA_USE_FMA__
4206           x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4207 #else
4208           x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4209 #endif
4210 
4211           tmp2 = _SIMD_MUL(h1_imag, q2);
4212 #ifdef __ELPA_USE_FMA__
4213           x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4214 #else
4215           x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4216 #endif
4217 
4218           tmp3 = _SIMD_MUL(h1_imag, q3);
4219 #ifdef __ELPA_USE_FMA__
4220           x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4221 #else
4222           x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4223 #endif
4224 
4225 #ifdef BLOCK2
4226 
4227 #if VEC_SET == SSE_128
4228 #ifdef DOUBLE_PRECISION_COMPLEX
4229           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4230           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4231 #endif
4232 #ifdef SINGLE_PRECISION_COMPLEX
4233           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4234           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4235 #endif
4236 #endif /* VEC_SET == SSE_128 */
4237 
4238 #if VEC_SET == AVX_256
4239           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4240           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4241 #endif /* VEC_SET == AVX_256 */
4242 
4243 #if VEC_SET == AVX_512
4244           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4245           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
4246 #endif /* VEC_SET == AVX_512 */
4247 
4248 #ifndef __ELPA_USE_FMA__
4249           // conjugate
4250           h2_imag = _SIMD_XOR(h2_imag, sign);
4251 #endif
4252 
4253           tmp1 = _SIMD_MUL(h2_imag, q1);
4254 #ifdef __ELPA_USE_FMA__
4255           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4256 #else
4257           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4258 #endif
4259           tmp2 = _SIMD_MUL(h2_imag, q2);
4260 #ifdef __ELPA_USE_FMA__
4261           y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4262 #else
4263           y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4264 #endif
4265 
4266           tmp3 = _SIMD_MUL(h2_imag, q3);
4267 #ifdef __ELPA_USE_FMA__
4268           y3 = _SIMD_ADD(y3, _SIMD_FMSUBADD(h2_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4269 #else
4270           y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4271 #endif
4272 #endif /* BLOCK2 */
4273      }
4274 
4275 #ifdef BLOCK2
4276 
4277 #if VEC_SET == SSE_128
4278 #ifdef DOUBLE_PRECISION_COMPLEX
4279      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
4280      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
4281 #endif
4282 #ifdef SINGLE_PRECISION_COMPLEX
4283      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
4284      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
4285 #endif
4286 #endif /* VEC_SET == SSE_128 */
4287 
4288 #if VEC_SET == AVX_256
4289      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
4290      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
4291 #endif /* VEC_SET == AVX_256 */
4292 
4293 #if VEC_SET == AVX_512
4294      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
4295      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
4296 #endif /* VEC_SET == AVX_512 */
4297 
4298 #ifndef __ELPA_USE_FMA__
4299      // conjugate
4300      h1_imag = _SIMD_XOR(h1_imag, sign);
4301 #endif
4302 
4303      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
4304      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
4305      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
4306 
4307      tmp1 = _SIMD_MUL(h1_imag, q1);
4308 #ifdef __ELPA_USE_FMA__
4309      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4310 #else
4311      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4312 #endif
4313 
4314      tmp2 = _SIMD_MUL(h1_imag, q2);
4315 #ifdef __ELPA_USE_FMA__
4316      x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4317 #else
4318      x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4319 #endif
4320 
4321      tmp3 = _SIMD_MUL(h1_imag, q3);
4322 #ifdef __ELPA_USE_FMA__
4323      x3 = _SIMD_ADD(x3, _SIMD_FMSUBADD(h1_real, q3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4324 #else
4325      x3 = _SIMD_ADD(x3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4326 #endif
4327 
4328 #endif /* BLOCK2 */
4329 
4330 #if VEC_SET == SSE_128
4331 #ifdef DOUBLE_PRECISION_COMPLEX
4332      h1_real = _mm_loaddup_pd(&hh_dbl[0]);
4333      h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
4334 #endif
4335 #ifdef SINGLE_PRECISION_COMPLEX
4336      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
4337      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
4338 #endif
4339 #endif /* VEC_SET == SSE_128 */
4340 
4341 #if VEC_SET == AVX_256
4342     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
4343     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
4344 #endif /* VEC_SET == AVX_256 */
4345 
4346 #if VEC_SET == AVX_512
4347     h1_real = _SIMD_SET1(hh_dbl[0]);
4348     h1_imag = _SIMD_SET1(hh_dbl[1]);
4349 
4350 #ifdef HAVE_AVX512_XEON_PHI
4351 #ifdef DOUBLE_PRECISION_COMPLEX
4352         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
4353         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
4354 #endif
4355 #ifdef SINGLE_PRECISION_COMPLEX
4356         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
4357         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
4358 #endif
4359 #endif /* HAVE_AVX512_XEON_PHI */
4360 
4361 #ifdef HAVE_AVX512_XEON
4362         h1_real = _SIMD_XOR(h1_real, sign);
4363         h1_imag = _SIMD_XOR(h1_imag, sign);
4364 #endif /* HAVE_AVX512_XEON */
4365 
4366 #endif /* VEC_SET == AVX_512 */
4367 
4368 #if VEC_SET != AVX_512
4369      h1_real = _SIMD_XOR(h1_real, sign);
4370      h1_imag = _SIMD_XOR(h1_imag, sign);
4371 #endif /* VEC_SET != AVX_512 */
4372 
4373      tmp1 = _SIMD_MUL(h1_imag, x1);
4374 #ifdef __ELPA_USE_FMA__
4375      x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4376 #else
4377      x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4378 #endif
4379 
4380      tmp2 = _SIMD_MUL(h1_imag, x2);
4381 #ifdef __ELPA_USE_FMA__
4382      x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4383 #else
4384      x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4385 #endif
4386 
4387      tmp3 = _SIMD_MUL(h1_imag, x3);
4388 #ifdef __ELPA_USE_FMA__
4389      x3 = _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4390 #else
4391      x3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4392 #endif
4393 
4394 #ifdef BLOCK2
4395 
4396 #if VEC_SET == SSE_128
4397 #ifdef DOUBLE_PRECISION_COMPLEX
4398      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
4399      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
4400 #endif
4401 #ifdef SINGLE_PRECISION_COMPLEX
4402      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
4403      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
4404 #endif
4405 
4406 #ifdef DOUBLE_PRECISION_COMPLEX
4407      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
4408      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
4409 #endif
4410 #ifdef SINGLE_PRECISION_COMPLEX
4411      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
4412      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
4413 #endif
4414 #endif /* VEC_SET == SSE_128 */
4415 
4416 #if VEC_SET == AVX_256
4417      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
4418      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
4419      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
4420      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
4421 #endif /* VEC_SET == AVX_256 */
4422 
4423 #if VEC_SET == AVX_512
4424      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
4425      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
4426      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
4427      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
4428 
4429 #ifdef HAVE_AVX512_XEON_PHI
4430 
4431 #ifdef DOUBLE_PRECISION_COMPLEX
4432      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
4433      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
4434 #endif
4435 #ifdef SINGLE_PRECISION_COMPLEX
4436      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
4437      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
4438 #endif
4439 
4440 #ifdef DOUBLE_PRECISION_COMPLEX
4441      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
4442      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
4443 #endif
4444 #ifdef SINGLE_PRECISION_COMPLEX
4445      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
4446      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
4447 #endif
4448 #endif /* HAVE_AVX512_XEON_PHI */
4449 
4450 #ifdef HAVE_AVX512_XEON
4451         h1_real = _SIMD_XOR(h1_real, sign);
4452         h1_imag = _SIMD_XOR(h1_imag, sign);
4453         h2_real = _SIMD_XOR(h2_real, sign);
4454         h2_imag = _SIMD_XOR(h2_imag, sign);
4455 #endif /* HAVE_AVX512_XEON */
4456 #endif /* VEC_SET == AVX_512 */
4457 
4458 #if VEC_SET != AVX_512
4459      h1_real = _SIMD_XOR(h1_real, sign);
4460      h1_imag = _SIMD_XOR(h1_imag, sign);
4461      h2_real = _SIMD_XOR(h2_real, sign);
4462      h2_imag = _SIMD_XOR(h2_imag, sign);
4463 #endif /* VEC_SET != AVX_512 */
4464 
4465 #if VEC_SET == SSE_128
4466 #ifdef SINGLE_PRECISION_COMPLEX
4467      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
4468 #else
4469      tmp2 = _SIMD_LOADU(s_dbl);
4470 #endif
4471 #endif /* VEC_SET == SSE_128 */
4472 
4473 #if VEC_SET == AVX_256
4474 #ifdef DOUBLE_PRECISION_COMPLEX
4475      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
4476 #endif
4477 #ifdef SINGLE_PRECISION_COMPLEX
4478      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
4479                              s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
4480 #endif
4481 #endif /* VEC_SET == AVX_256 */
4482 
4483 #if VEC_SET == AVX_512
4484 #ifdef DOUBLE_PRECISION_COMPLEX
4485      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
4486                         s_dbl[1], s_dbl[0],
4487                         s_dbl[1], s_dbl[0],
4488                         s_dbl[1], s_dbl[0]);
4489 #endif
4490 #ifdef SINGLE_PRECISION_COMPLEX
4491      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
4492 #endif
4493 #endif /* VEC_SET == AVX_512 */
4494 
4495 
4496      tmp1 = _SIMD_MUL(h2_imag, tmp2);
4497 #ifdef __ELPA_USE_FMA__
4498      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4499 #else
4500      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4501 #endif
4502 
4503 #if VEC_SET == AVX_512
4504      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
4505 
4506      h2_real = _SIMD_SET1(s_dbl[0]);
4507      h2_imag = _SIMD_SET1(s_dbl[1]);
4508 #endif /* VEC_SET == AVX_512 */
4509 
4510 #if VEC_SET == SSE_128
4511 #ifdef DOUBLE_PRECISION_COMPLEX
4512      h2_real = _mm_movedup_pd(tmp2);
4513      h2_imag = _mm_set1_pd(tmp2[1]);
4514 #endif
4515 #ifdef SINGLE_PRECISION_COMPLEX
4516      h2_real = _mm_moveldup_ps(tmp2);
4517      h2_imag = _mm_movehdup_ps(tmp2);
4518 #endif
4519 #endif /* VEC_SET == SSE_128 */
4520 
4521 #if VEC_SET == AVX_256
4522      h2_real = _SIMD_SET1(tmp2[0]);
4523      h2_imag = _SIMD_SET1(tmp2[1]);
4524 #endif /* VEC_SET == AVX_256 */
4525 
4526      tmp1 = _SIMD_MUL(h1_imag, y1);
4527 #ifdef __ELPA_USE_FMA__
4528      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4529 #else
4530      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
4531 #endif
4532 
4533      tmp2 = _SIMD_MUL(h1_imag, y2);
4534 #ifdef __ELPA_USE_FMA__
4535      y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4536 #else
4537      y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
4538 #endif
4539 
4540      tmp3 = _SIMD_MUL(h1_imag, y3);
4541 #ifdef __ELPA_USE_FMA__
4542      y3 = _SIMD_FMADDSUB(h1_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4543 #else
4544      y3 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE));
4545 #endif
4546 
4547      tmp1 = _SIMD_MUL(h2_imag, x1);
4548 #ifdef __ELPA_USE_FMA__
4549      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4550 #else
4551      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4552 #endif
4553 
4554      tmp2 = _SIMD_MUL(h2_imag, x2);
4555 #ifdef __ELPA_USE_FMA__
4556      y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4557 #else
4558      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4559 #endif
4560 
4561      tmp3 = _SIMD_MUL(h2_imag, x3);
4562 #ifdef __ELPA_USE_FMA__
4563      y3 = _SIMD_ADD(y3, _SIMD_FMADDSUB(h2_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4564 #else
4565      y3 = _SIMD_ADD(y3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4566 #endif
4567 
4568 #endif /* BLOCK2 */
4569 
4570      q1 = _SIMD_LOAD(&q_dbl[0]);
4571      q2 = _SIMD_LOAD(&q_dbl[offset]);
4572      q3 = _SIMD_LOAD(&q_dbl[2*offset]);
4573 
4574 #ifdef BLOCK1
4575      q1 = _SIMD_ADD(q1, x1);
4576      q2 = _SIMD_ADD(q2, x2);
4577      q3 = _SIMD_ADD(q3, x3);
4578 #endif
4579 
4580 #ifdef BLOCK2
4581      q1 = _SIMD_ADD(q1, y1);
4582      q2 = _SIMD_ADD(q2, y2);
4583      q3 = _SIMD_ADD(q3, y3);
4584 #endif
4585 
4586      _SIMD_STORE(&q_dbl[0], q1);
4587      _SIMD_STORE(&q_dbl[offset], q2);
4588      _SIMD_STORE(&q_dbl[2*offset], q3);
4589 
4590 #ifdef BLOCK2
4591 
4592 #if VEC_SET == SSE_128
4593 #ifdef DOUBLE_PRECISION_COMPLEX
4594      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4595      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4596 #endif
4597 #ifdef SINGLE_PRECISION_COMPLEX
4598      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4599      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4600 #endif
4601 #endif /* VEC_SET == SSE_128 */
4602 
4603 #if VEC_SET == AVX_256
4604      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4605      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4606 #endif /* VEC_SET == AVX_256 */
4607 
4608 #if VEC_SET == AVX_512
4609      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4610      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4611 #endif /* VEC_SET == AVX_512 */
4612 
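     // BLOCK2: row 1 of q receives x plus hh[ldh+1]*y.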
4613      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
4614      q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
4615      q3 = _SIMD_LOAD(&q_dbl[(ldq*2)+2*offset]);
4616 
4617      q1 = _SIMD_ADD(q1, x1);
4618      q2 = _SIMD_ADD(q2, x2);
4619      q3 = _SIMD_ADD(q3, x3);
4620 
4621      tmp1 = _SIMD_MUL(h2_imag, y1);
4622 
4623 #ifdef __ELPA_USE_FMA__
4624      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4625 #else
4626      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4627 #endif
4628 
4629      tmp2 = _SIMD_MUL(h2_imag, y2);
4630 #ifdef __ELPA_USE_FMA__
4631      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4632 #else
4633      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4634 #endif
4635 
4636      tmp3 = _SIMD_MUL(h2_imag, y3);
4637 #ifdef __ELPA_USE_FMA__
4638      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4639 #else
4640      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4641 #endif
4642 
4643      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
4644      _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
4645      _SIMD_STORE(&q_dbl[(ldq*2)+2*offset], q3);
4646 
4647 #endif /* BLOCK2 */
4648 
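     // Rows BLOCK..nb-1: q(i) += hh[i-BLOCK+1]*x  (plus hh[ldh+i]*y for BLOCK2).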
4649      for (i = BLOCK; i < nb; i++)
4650      {
4651 
4652 #if VEC_SET == SSE_128
4653 #ifdef DOUBLE_PRECISION_COMPLEX
4654           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4655           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4656 #endif
4657 #ifdef SINGLE_PRECISION_COMPLEX
4658           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4659           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4660 #endif
4661 #endif /* VEC_SET == SSE_128 */
4662 
4663 #if VEC_SET == AVX_256
4664         h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4665         h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4666 #endif /* VEC_SET == AVX_256 */
4667 
4668 #if VEC_SET == AVX_512
4669         h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4670         h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4671 #endif /* VEC_SET == AVX_512 */
4672 
4673           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4674           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4675           q3 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
4676 
4677           tmp1 = _SIMD_MUL(h1_imag, x1);
4678 #ifdef __ELPA_USE_FMA__
4679           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4680 #else
4681           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4682 #endif
4683           tmp2 = _SIMD_MUL(h1_imag, x2);
4684 #ifdef __ELPA_USE_FMA__
4685           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4686 #else
4687           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4688 #endif
4689 
4690           tmp3 = _SIMD_MUL(h1_imag, x3);
4691 #ifdef __ELPA_USE_FMA__
4692           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4693 #else
4694           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4695 #endif
4696 
4697 #ifdef BLOCK2
4698 
4699 #if VEC_SET == SSE_128
4700 #ifdef DOUBLE_PRECISION_COMPLEX
4701           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4702           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4703 #endif
4704 #ifdef SINGLE_PRECISION_COMPLEX
4705           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4706           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4707 #endif
4708 #endif /* VEC_SET == SSE_128 */
4709 
4710 #if VEC_SET == AVX_256
4711           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4712           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4713 #endif /* VEC_SET == AVX_256 */
4714 
4715 #if VEC_SET == AVX_512
4716         h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4717         h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
4718 #endif /* VEC_SET == AVX_512 */
4719 
4720           tmp1 = _SIMD_MUL(h2_imag, y1);
4721 #ifdef __ELPA_USE_FMA__
4722           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4723 #else
4724           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4725 #endif
4726           tmp2 = _SIMD_MUL(h2_imag, y2);
4727 #ifdef __ELPA_USE_FMA__
4728           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4729 #else
4730           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4731 #endif
4732 
4733           tmp3 = _SIMD_MUL(h2_imag, y3);
4734 #ifdef __ELPA_USE_FMA__
4735           q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h2_real, y3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4736 #else
4737           q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4738 #endif
4739 
4740 #endif /* BLOCK2 */
4741 
4742           _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
4743           _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
4744           _SIMD_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
4745 
4746      }
4747 #ifdef BLOCK2
4748 
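     // BLOCK2: the last row nb is touched only by the first reflector:
     // q(nb) += hh[nb-1]*x.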
4749 #if VEC_SET == SSE_128
4750 #ifdef DOUBLE_PRECISION_COMPLEX
4751      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
4752      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
4753 #endif
4754 #ifdef SINGLE_PRECISION_COMPLEX
4755      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
4756      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
4757 #endif
4758 #endif /* VEC_SET == SSE_128 */
4759 
4760 #if VEC_SET == AVX_256
4761      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
4762      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
4763 #endif /* VEC_SET == AVX_256 */
4764 
4765 #if VEC_SET == AVX_512
4766      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
4767      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
4768 #endif /* VEC_SET == AVX_512 */
4769 
4770      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
4771      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
4772      q3 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+2*offset]);
4773 
4774      tmp1 = _SIMD_MUL(h1_imag, x1);
4775 #ifdef __ELPA_USE_FMA__
4776      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4777 #else
4778      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4779 #endif
4780      tmp2 = _SIMD_MUL(h1_imag, x2);
4781 #ifdef __ELPA_USE_FMA__
4782      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4783 #else
4784      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4785 #endif
4786 
4787      tmp3 = _SIMD_MUL(h1_imag, x3);
4788 #ifdef __ELPA_USE_FMA__
4789      q3 = _SIMD_ADD(q3, _SIMD_FMADDSUB(h1_real, x3, _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4790 #else
4791      q3 = _SIMD_ADD(q3, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x3), _SIMD_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
4792 #endif
4793 
4794      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
4795      _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
4796      _SIMD_STORE(&q_dbl[(2*nb*ldq)+2*offset], q3);
4797 
4798 #endif /* BLOCK2 */
4799 }
4800 
4801 
4802 #if VEC_SET == SSE_128
4803 #ifdef DOUBLE_PRECISION_COMPLEX
4804 #define ROW_LENGTH 2
4805 #endif
4806 #ifdef SINGLE_PRECISION_COMPLEX
4807 #define ROW_LENGTH 4
4808 #endif
4809 #endif /* VEC_SET == SSE_128 */
4810 
4811 #if VEC_SET == AVX_256
4812 #ifdef DOUBLE_PRECISION_COMPLEX
4813 #define ROW_LENGTH 4
4814 #endif
4815 #ifdef SINGLE_PRECISION_COMPLEX
4816 #define ROW_LENGTH 8
4817 #endif
4818 #endif /* VEC_SET == AVX_256 */
4819 
4820 #if VEC_SET == AVX_512
4821 #ifdef DOUBLE_PRECISION_COMPLEX
4822 #define ROW_LENGTH 8
4823 #endif
4824 #ifdef SINGLE_PRECISION_COMPLEX
4825 #define ROW_LENGTH 16
4826 #endif
4827 #endif /* VEC_SET == AVX_512 */
4828 
4829 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
4830 #ifdef BLOCK1
4831 		)
4832 #endif
4833 #ifdef BLOCK2
4834                 ,int ldh, DATA_TYPE s)
4835 #endif
4836 {
4837 
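     // Same Householder transformation as the kernel above, restricted to a
     // row width that fits into two SIMD registers (x1/x2, q1/q2).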
4838      DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
4839      DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
4840 #ifdef BLOCK2
4841      DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
4842 #endif
4843 
4844      __SIMD_DATATYPE x1, x2;
4845      __SIMD_DATATYPE q1, q2;
4846 #ifdef BLOCK2
4847      __SIMD_DATATYPE y1, y2;
4848      __SIMD_DATATYPE h2_real, h2_imag;
4849 #endif
4850      __SIMD_DATATYPE h1_real, h1_imag;
4851      __SIMD_DATATYPE tmp1, tmp2;
4852      int i=0;
4853 
4854 #if VEC_SET == SSE_128
4855 #ifdef DOUBLE_PRECISION_COMPLEX
4856      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
4857 #endif
4858 #ifdef SINGLE_PRECISION_COMPLEX
4859      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
4860 #endif
4861 #endif /* VEC_SET == SSE_128 */
4862 
4863 #if VEC_SET == AVX_256
4864 #ifdef DOUBLE_PRECISION_COMPLEX
4865         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
4866 #endif
4867 #ifdef SINGLE_PRECISION_COMPLEX
4868         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
4869 #endif
4870 #endif /* VEC_SET == AVX_256 */
4871 
4872 #if VEC_SET == AVX_512
4873 #ifdef DOUBLE_PRECISION_COMPLEX
4874         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
4875 #endif
4876 #ifdef SINGLE_PRECISION_COMPLEX
4877         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
4878 #endif
4879 #endif /* VEC_SET == AVX_512 */
4880 
4881 #ifdef BLOCK2
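     // BLOCK2 prologue: x := q(row 1) and y := q(row 0) + conj(hh[ldh+1]) * x.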
4882      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
4883      x2 = _SIMD_LOAD(&q_dbl[(2*ldq)+offset]);
4884 
4885 #if VEC_SET == SSE_128
4886 #ifdef DOUBLE_PRECISION_COMPLEX
4887      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
4888      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
4889 #endif
4890 #ifdef SINGLE_PRECISION_COMPLEX
4891      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
4892      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
4893 #endif
4894 #endif /* VEC_SET == SSE_128 */
4895 
4896 #if VEC_SET == AVX_256
4897      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
4898      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
4899 #endif /* VEC_SET == AVX_256 */
4900 
4901 #if VEC_SET == AVX_512
4902      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
4903      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
4904 #endif
4905 
4906 #ifndef __ELPA_USE_FMA__
4907      // conjugate
4908      h2_imag = _SIMD_XOR(h2_imag, sign);
4909 #endif
4910 
4911      y1 = _SIMD_LOAD(&q_dbl[0]);
4912      y2 = _SIMD_LOAD(&q_dbl[offset]);
4913 
4914      tmp1 = _SIMD_MUL(h2_imag, x1);
4915 #ifdef __ELPA_USE_FMA__
4916      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4917 #else
4918      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4919 #endif
4920      tmp2 = _SIMD_MUL(h2_imag, x2);
4921 #ifdef __ELPA_USE_FMA__
4922      y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4923 #else
4924      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4925 #endif
4926 
4927 #endif /* BLOCK2 */
4928 
4929 #ifdef BLOCK1
4930      x1 = _SIMD_LOAD(&q_dbl[0]);
4931      x2 = _SIMD_LOAD(&q_dbl[offset]);
4932 #endif
4933 
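     // Accumulate x += conj(hh[i-BLOCK+1]) * q(i), and for BLOCK2 also
     // y += conj(hh[ldh+i]) * q(i).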
4934      for (i = BLOCK; i < nb; i++)
4935      {
4936 #if VEC_SET == SSE_128
4937 #ifdef DOUBLE_PRECISION_COMPLEX
4938           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
4939           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
4940 #endif
4941 #ifdef SINGLE_PRECISION_COMPLEX
4942           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
4943           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
4944 #endif
4945 #endif /* VEC_SET == SSE_128 */
4946 
4947 #if VEC_SET == AVX_256
4948           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
4949           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
4950 #endif /* VEC_SET == AVX_256 */
4951 
4952 #if VEC_SET == AVX_512
4953           h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
4954           h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
4955 #endif /* VEC_SET == AVX_512 */
4956 
4957 #ifndef __ELPA_USE_FMA__
4958           // conjugate
4959           h1_imag = _SIMD_XOR(h1_imag, sign);
4960 #endif
4961 
4962           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
4963           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
4964           tmp1 = _SIMD_MUL(h1_imag, q1);
4965 
4966 #ifdef __ELPA_USE_FMA__
4967           x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4968 #else
4969           x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
4970 #endif
4971 
4972           tmp2 = _SIMD_MUL(h1_imag, q2);
4973 #ifdef __ELPA_USE_FMA__
4974           x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4975 #else
4976           x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
4977 #endif
4978 
4979 #ifdef BLOCK2
4980 
4981 #if VEC_SET == SSE_128
4982 #ifdef DOUBLE_PRECISION_COMPLEX
4983           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
4984           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
4985 #endif
4986 #ifdef SINGLE_PRECISION_COMPLEX
4987           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
4988           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
4989 #endif
4990 #endif /* VEC_SET == SSE_128 */
4991 
4992 #if VEC_SET == AVX_256
4993           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
4994           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
4995 #endif /* VEC_SET == AVX_256 */
4996 
4997 #if VEC_SET == AVX_512
4998           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
4999           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5000 #endif /* VEC_SET == AVX_512 */
5001 
5002 #ifndef __ELPA_USE_FMA__
5003           // conjugate
5004           h2_imag = _SIMD_XOR(h2_imag, sign);
5005 #endif
5006 
5007           tmp1 = _SIMD_MUL(h2_imag, q1);
5008 #ifdef __ELPA_USE_FMA__
5009           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5010 #else
5011           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5012 #endif
5013           tmp2 = _SIMD_MUL(h2_imag, q2);
5014 #ifdef __ELPA_USE_FMA__
5015           y2 = _SIMD_ADD(y2, _SIMD_FMSUBADD(h2_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5016 #else
5017           y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5018 #endif
5019 
5020 #endif /* BLOCK2 */
5021 
5022      }
5023 
5024 #ifdef BLOCK2
5025 
5026 #if VEC_SET == SSE_128
5027 #ifdef DOUBLE_PRECISION_COMPLEX
5028      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5029      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5030 #endif
5031 #ifdef SINGLE_PRECISION_COMPLEX
5032      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5033      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5034 #endif
5035 #endif /* VEC_SET == SSE_128 */
5036 
5037 #if VEC_SET == AVX_256
5038      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5039      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5040 #endif /* VEC_SET == AVX_256 */
5041 
5042 #if VEC_SET == AVX_512
5043      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5044      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5045 #endif /* VEC_SET == AVX_512 */
5046 
5047 #ifndef __ELPA_USE_FMA__
5048      // conjugate
5049      h1_imag = _SIMD_XOR(h1_imag, sign);
5050 #endif
5051 
5052      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5053      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
5054 
5055      tmp1 = _SIMD_MUL(h1_imag, q1);
5056 #ifdef __ELPA_USE_FMA__
5057      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5058 #else
5059      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5060 #endif
5061      tmp2 = _SIMD_MUL(h1_imag, q2);
5062 #ifdef __ELPA_USE_FMA__
5063      x2 = _SIMD_ADD(x2, _SIMD_FMSUBADD(h1_real, q2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5064 #else
5065      x2 = _SIMD_ADD(x2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5066 #endif
5067 
5068 #endif /* BLOCK2 */
5069 
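     // Scale x by -hh[0]; the negation is done by XORing the sign bits.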
5070 #if VEC_SET == SSE_128
5071 #ifdef DOUBLE_PRECISION_COMPLEX
5072      h1_real = _mm_loaddup_pd(&hh_dbl[0]);
5073      h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
5074 #endif
5075 #ifdef SINGLE_PRECISION_COMPLEX
5076      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
5077      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
5078 #endif
5079 #endif /* VEC_SET == SSE_128 */
5080 
5081 #if VEC_SET == AVX_256
5082     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
5083     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
5084 #endif /* VEC_SET == AVX_256 */
5085 
5086 #if VEC_SET == AVX_512
5087     h1_real = _SIMD_SET1(hh_dbl[0]);
5088     h1_imag = _SIMD_SET1(hh_dbl[1]);
5089 
5090 #ifdef HAVE_AVX512_XEON_PHI
5091 #ifdef DOUBLE_PRECISION_COMPLEX
5092         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5093         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5094 #endif
5095 #ifdef SINGLE_PRECISION_COMPLEX
5096         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5097         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5098 #endif
5099 #endif
5100 #ifdef HAVE_AVX512_XEON
5101         h1_real = _SIMD_XOR(h1_real, sign);
5102         h1_imag = _SIMD_XOR(h1_imag, sign);
5103 #endif
5104 
5105 #endif /* VEC_SET == AVX_512 */
5106 
5107 #if VEC_SET != AVX_512
5108      h1_real = _SIMD_XOR(h1_real, sign);
5109      h1_imag = _SIMD_XOR(h1_imag, sign);
5110 #endif /* VEC_SET != AVX_512 */
5111 
5112      tmp1 = _SIMD_MUL(h1_imag, x1);
5113 #ifdef __ELPA_USE_FMA__
5114      x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5115 #else
5116      x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5117 #endif
5118 
5119      tmp2 = _SIMD_MUL(h1_imag, x2);
5120 #ifdef __ELPA_USE_FMA__
5121      x2 = _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5122 #else
5123      x2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5124 #endif
5125 
5126 #ifdef BLOCK2
5127 
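     // h1 := -hh[ldh], h2 := h1*s, then y := h1*y + h2*x.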
5128 #if VEC_SET == SSE_128
5129 #ifdef DOUBLE_PRECISION_COMPLEX
5130      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5131      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5132 #endif
5133 #ifdef SINGLE_PRECISION_COMPLEX
5134      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5135      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5136 #endif
5137 
5138 #ifdef DOUBLE_PRECISION_COMPLEX
5139      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5140      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5141 #endif
5142 #ifdef SINGLE_PRECISION_COMPLEX
5143      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5144      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5145 #endif
5146 #endif /* VEC_SET == SSE_128 */
5147 
5148 #if VEC_SET == AVX_256
5149      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5150      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5151      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5152      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5153 #endif /* VEC_SET == AVX_256 */
5154 
5155 #if VEC_SET == AVX_512
5156      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
5157      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5158      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
5159      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5160 
5161 #ifdef HAVE_AVX512_XEON_PHI
5162 #ifdef DOUBLE_PRECISION_COMPLEX
5163      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
5164      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
5165 #endif
5166 #ifdef SINGLE_PRECISION_COMPLEX
5167      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
5168      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
5169 #endif
5170 
5171 #ifdef DOUBLE_PRECISION_COMPLEX
5172      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
5173      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
5174 #endif
5175 #ifdef SINGLE_PRECISION_COMPLEX
5176      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
5177      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
5178 #endif
5179 #endif
5180 #ifdef HAVE_AVX512_XEON
5181      h1_real = _SIMD_XOR(h1_real, sign);
5182      h1_imag = _SIMD_XOR(h1_imag, sign);
5183      h2_real = _SIMD_XOR(h2_real, sign);
5184      h2_imag = _SIMD_XOR(h2_imag, sign);
5185 #endif
5186 
5187 #endif /* VEC_SET == AVX_512 */
5188 
5189 #if VEC_SET != AVX_512
5190      h1_real = _SIMD_XOR(h1_real, sign);
5191      h1_imag = _SIMD_XOR(h1_imag, sign);
5192      h2_real = _SIMD_XOR(h2_real, sign);
5193      h2_imag = _SIMD_XOR(h2_imag, sign);
5194 #endif /* VEC_SET != AVX_512 */
5195 
5196 #if VEC_SET == SSE_128
5197 #ifdef SINGLE_PRECISION_COMPLEX
5198      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
5199 #else
5200      tmp2 = _SIMD_LOADU(s_dbl);
5201 #endif
5202 #endif /* VEC_SET == SSE_128 */
5203 
5204 #if VEC_SET == AVX_256
5205 #ifdef DOUBLE_PRECISION_COMPLEX
5206      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5207 #endif
5208 #ifdef SINGLE_PRECISION_COMPLEX
5209      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
5210                              s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5211 #endif
5212 #endif /* VEC_SET == AVX_256 */
5213 
5214 #if VEC_SET == AVX_512
5215 #ifdef DOUBLE_PRECISION_COMPLEX
5216      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
5217                           s_dbl[1], s_dbl[0],
5218                           s_dbl[1], s_dbl[0],
5219                           s_dbl[1], s_dbl[0]);
5220 #endif
5221 #ifdef SINGLE_PRECISION_COMPLEX
5222      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
5223 #endif
5224 
5225 #endif /* VEC_SET == AVX_512 */
5226 
5227      tmp1 = _SIMD_MUL(h2_imag, tmp2);
5228 #ifdef __ELPA_USE_FMA__
5229      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5230 #else
5231      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5232 #endif
5233 
5234 #if VEC_SET == AVX_512
5235      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
5236 
5237      h2_real = _SIMD_SET1(s_dbl[0]);
5238      h2_imag = _SIMD_SET1(s_dbl[1]);
5239 #endif
5240 
5241 #if VEC_SET == SSE_128
5242 #ifdef DOUBLE_PRECISION_COMPLEX
5243      h2_real = _mm_movedup_pd(tmp2);
5244      h2_imag = _mm_set1_pd(tmp2[1]);
5245 #endif
5246 #ifdef SINGLE_PRECISION_COMPLEX
5247      h2_real = _mm_moveldup_ps(tmp2);
5248      h2_imag = _mm_movehdup_ps(tmp2);
5249 #endif
5250 #endif /* VEC_SET == SSE_128 */
5251 
5252 #if VEC_SET == AVX_256
5253      h2_real = _SIMD_SET1(tmp2[0]);
5254      h2_imag = _SIMD_SET1(tmp2[1]);
5255 #endif /* VEC_SET == AVX_256 */
5256 
5257      tmp1 = _SIMD_MUL(h1_imag, y1);
5258 #ifdef __ELPA_USE_FMA__
5259      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5260 #else
5261      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5262 #endif
5263      tmp2 = _SIMD_MUL(h1_imag, y2);
5264 #ifdef __ELPA_USE_FMA__
5265      y2 = _SIMD_FMADDSUB(h1_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5266 #else
5267      y2 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE));
5268 #endif
5269 
5270      tmp1 = _SIMD_MUL(h2_imag, x1);
5271 #ifdef __ELPA_USE_FMA__
5272      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5273 #else
5274      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5275 #endif
5276      tmp2 = _SIMD_MUL(h2_imag, x2);
5277 #ifdef __ELPA_USE_FMA__
5278      y2 = _SIMD_ADD(y2, _SIMD_FMADDSUB(h2_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5279 #else
5280      y2 = _SIMD_ADD(y2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5281 #endif
5282 
5283 #endif /* BLOCK2 */
5284 
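     // Row 0 of q: add x (BLOCK1) or y (BLOCK2) and store.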
5285      q1 = _SIMD_LOAD(&q_dbl[0]);
5286      q2 = _SIMD_LOAD(&q_dbl[offset]);
5287 
5288 #ifdef BLOCK1
5289      q1 = _SIMD_ADD(q1, x1);
5290      q2 = _SIMD_ADD(q2, x2);
5291 #endif
5292 
5293 #ifdef BLOCK2
5294      q1 = _SIMD_ADD(q1, y1);
5295      q2 = _SIMD_ADD(q2, y2);
5296 #endif
5297      _SIMD_STORE(&q_dbl[0], q1);
5298      _SIMD_STORE(&q_dbl[offset], q2);
5299 
5300 #ifdef BLOCK2
5301 
5302 #if VEC_SET == SSE_128
5303 #ifdef DOUBLE_PRECISION_COMPLEX
5304      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5305      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5306 #endif
5307 #ifdef SINGLE_PRECISION_COMPLEX
5308      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5309      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5310 #endif
5311 #endif /* VEC_SET == SSE_128 */
5312 
5313 #if VEC_SET == AVX_256
5314      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5315      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5316 #endif /* VEC_SET == AVX_256 */
5317 
5318 #if VEC_SET == AVX_512
5319      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5320      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5321 #endif /* VEC_SET == AVX_512 */
5322 
5323      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
5324      q2 = _SIMD_LOAD(&q_dbl[(ldq*2)+offset]);
5325 
5326      q1 = _SIMD_ADD(q1, x1);
5327      q2 = _SIMD_ADD(q2, x2);
5328 
5329      tmp1 = _SIMD_MUL(h2_imag, y1);
5330 
5331 #ifdef __ELPA_USE_FMA__
5332      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5333 #else
5334      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5335 #endif
5336      tmp2 = _SIMD_MUL(h2_imag, y2);
5337 #ifdef __ELPA_USE_FMA__
5338      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5339 #else
5340      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5341 #endif
5342 
5343      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
5344      _SIMD_STORE(&q_dbl[(ldq*2)+offset], q2);
5345 
5346 #endif /* BLOCK2 */
5347 
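     // Rows BLOCK..nb-1: q(i) += hh[i-BLOCK+1]*x  (plus hh[ldh+i]*y for BLOCK2).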
5348      for (i = BLOCK; i < nb; i++)
5349      {
5350 #if VEC_SET == SSE_128
5351 #ifdef DOUBLE_PRECISION_COMPLEX
5352           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5353           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5354 #endif
5355 #ifdef SINGLE_PRECISION_COMPLEX
5356           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5357           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5358 #endif
5359 #endif /* VEC_SET == SSE_128 */
5360 
5361 #if VEC_SET == AVX_256
5362           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5363           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5364 #endif /* VEC_SET == AVX_256 */
5365 
5366 #if VEC_SET == AVX_512
5367           h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5368           h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5369 #endif /* VEC_SET == AVX_512 */
5370 
5371           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5372           q2 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+offset]);
5373           tmp1 = _SIMD_MUL(h1_imag, x1);
5374 
5375 #ifdef __ELPA_USE_FMA__
5376           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5377 #else
5378           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5379 #endif
5380 
5381           tmp2 = _SIMD_MUL(h1_imag, x2);
5382 #ifdef __ELPA_USE_FMA__
5383           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5384 #else
5385           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5386 #endif
5387 
5388 
5389 #ifdef BLOCK2
5390 
5391 #if VEC_SET == SSE_128
5392 #ifdef DOUBLE_PRECISION_COMPLEX
5393           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5394           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5395 #endif
5396 #ifdef SINGLE_PRECISION_COMPLEX
5397           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
5398           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
5399 #endif
5400 #endif /* VEC_SET == SSE_128 */
5401 
5402 #if VEC_SET == AVX_256
5403           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
5404           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
5405 #endif /* VEC_SET == AVX_256 */
5406 
5407 #if VEC_SET == AVX_512
5408          h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
5409          h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5410 #endif /* VEC_SET == AVX_512 */
5411 
5412           tmp1 = _SIMD_MUL(h2_imag, y1);
5413 #ifdef __ELPA_USE_FMA__
5414           q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5415 #else
5416           q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5417 #endif
5418           tmp2 = _SIMD_MUL(h2_imag, y2);
5419 #ifdef __ELPA_USE_FMA__
5420           q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h2_real, y2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5421 #else
5422           q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5423 #endif
5424 
5425 #endif /* BLOCK2 */
5426 
5427           _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
5428           _SIMD_STORE(&q_dbl[(2*i*ldq)+offset], q2);
5429     }
5430 #ifdef BLOCK2
5431 
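     // BLOCK2: last row: q(nb) += hh[nb-1]*x.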
5432 #if VEC_SET == SSE_128
5433 #ifdef DOUBLE_PRECISION_COMPLEX
5434      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5435      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5436 #endif
5437 #ifdef SINGLE_PRECISION_COMPLEX
5438      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5439      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5440 #endif
5441 #endif /* VEC_SET == SSE_128 */
5442 
5443 #if VEC_SET == AVX_256
5444      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5445      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5446 #endif /* VEC_SET == AVX_256 */
5447 
5448 #if VEC_SET == AVX_512
5449      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5450      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5451 #endif /* VEC_SET == AVX_512 */
5452 
5453      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5454      q2 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+offset]);
5455 
5456      tmp1 = _SIMD_MUL(h1_imag, x1);
5457 #ifdef __ELPA_USE_FMA__
5458      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5459 #else
5460      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5461 #endif
5462      tmp2 = _SIMD_MUL(h1_imag, x2);
5463 #ifdef __ELPA_USE_FMA__
5464      q2 = _SIMD_ADD(q2, _SIMD_FMADDSUB(h1_real, x2, _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5465 #else
5466      q2 = _SIMD_ADD(q2, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x2), _SIMD_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
5467 #endif
5468 
5469      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
5470      _SIMD_STORE(&q_dbl[(2*nb*ldq)+offset], q2);
5471 
5472 #endif /* BLOCK2 */
5473 
5474 }
5475 
5476 #if VEC_SET == SSE_128
5477 #ifdef DOUBLE_PRECISION_COMPLEX
5478 #define ROW_LENGTH 1
5479 #endif
5480 #ifdef SINGLE_PRECISION_COMPLEX
5481 #define ROW_LENGTH 2
5482 #endif
5483 #endif /* VEC_SET == SSE_128 */
5484 
5485 #if VEC_SET == AVX_256
5486 #ifdef DOUBLE_PRECISION_COMPLEX
5487 #define ROW_LENGTH 2
5488 #endif
5489 #ifdef SINGLE_PRECISION_COMPLEX
5490 #define ROW_LENGTH 4
5491 #endif
5492 #endif /* VEC_SET == AVX_256 */
5493 
5494 #if VEC_SET == AVX_512
5495 #ifdef DOUBLE_PRECISION_COMPLEX
5496 #define ROW_LENGTH 4
5497 #endif
5498 #ifdef SINGLE_PRECISION_COMPLEX
5499 #define ROW_LENGTH 8
5500 #endif
5501 #endif /* VEC_SET == AVX_512 */
5502 
5503 
5504 static __forceinline void CONCAT_8ARGS(hh_trafo_complex_kernel_,ROW_LENGTH,_,SIMD_SET,_,BLOCK,hv_,WORD_LENGTH) (DATA_TYPE_PTR q, DATA_TYPE_PTR hh, int nb, int ldq
5505 #ifdef BLOCK1
5506 		)
5507 #endif
5508 #ifdef BLOCK2
5509                 ,int ldh, DATA_TYPE s)
5510 #endif
5511 {
5512 
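     // Narrowest variant: the row fits into a single SIMD register (x1, q1);
     // the algorithm is identical to the wider kernels above.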
5513      DATA_TYPE_REAL_PTR q_dbl = (DATA_TYPE_REAL_PTR)q;
5514      DATA_TYPE_REAL_PTR hh_dbl = (DATA_TYPE_REAL_PTR)hh;
5515 #ifdef BLOCK2
5516      DATA_TYPE_REAL_PTR s_dbl = (DATA_TYPE_REAL_PTR)(&s);
5517 #endif
5518 
5519      __SIMD_DATATYPE x1;
5520      __SIMD_DATATYPE q1;
5521 #ifdef BLOCK2
5522      __SIMD_DATATYPE y1;
5523      __SIMD_DATATYPE h2_real, h2_imag;
5524 #endif
5525      __SIMD_DATATYPE h1_real, h1_imag;
5526      __SIMD_DATATYPE tmp1, tmp2;
5527      int i=0;
5528 
5529 #if VEC_SET == SSE_128
5530 #ifdef DOUBLE_PRECISION_COMPLEX
5531      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
5532 #endif
5533 #ifdef SINGLE_PRECISION_COMPLEX
5534      __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
5535 #endif
5536 #endif /* VEC_SET == SSE_128 */
5537 
5538 #if VEC_SET == AVX_256
5539 #ifdef DOUBLE_PRECISION_COMPLEX
5540         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
5541 #endif
5542 #ifdef SINGLE_PRECISION_COMPLEX
5543         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
5544 #endif
5545 #endif /* VEC_SET == AVX_256 */
5546 
5547 #if VEC_SET == AVX_512
5548 #ifdef DOUBLE_PRECISION_COMPLEX
5549         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi64(0x8000000000000000);
5550 #endif
5551 #ifdef SINGLE_PRECISION_COMPLEX
5552         __SIMD_DATATYPE sign = (__SIMD_DATATYPE)_mm512_set1_epi32(0x80000000);
5553 #endif
5554 #endif /* VEC_SET == AVX_512 */
5555 
5556 #ifdef BLOCK2
5557      x1 = _SIMD_LOAD(&q_dbl[(2*ldq)+0]);
5558 
5559 #if VEC_SET == SSE_128
5560 #ifdef DOUBLE_PRECISION_COMPLEX
5561      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5562      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5563 #endif
5564 #ifdef SINGLE_PRECISION_COMPLEX
5565      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5566      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5567 #endif
5568 #endif /* VEC_SET == SSE_128 */
5569 
5570 #if VEC_SET == AVX_256
5571      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5572      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5573 #endif /* VEC_SET == AVX_256 */
5574 
5575 #if VEC_SET == AVX_512
5576      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5577      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5578 #endif
5579 
5580 #ifndef __ELPA_USE_FMA__
5581      // conjugate
5582      h2_imag = _SIMD_XOR(h2_imag, sign);
5583 #endif
5584 
5585      y1 = _SIMD_LOAD(&q_dbl[0]);
5586 
5587      tmp1 = _SIMD_MUL(h2_imag, x1);
5588 #ifdef __ELPA_USE_FMA__
5589      y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5590 #else
5591      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5592 #endif
5593 
5594 #endif /* BLOCK2 */
5595 
5596 #ifdef BLOCK1
5597      x1 = _SIMD_LOAD(&q_dbl[0]);
5598 #endif
5599 
5600      for (i = BLOCK; i < nb; i++)
5601      {
5602 #if VEC_SET == SSE_128
5603 #ifdef DOUBLE_PRECISION_COMPLEX
5604           h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5605           h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5606 #endif
5607 #ifdef SINGLE_PRECISION_COMPLEX
5608           h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5609           h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5610 #endif
5611 #endif /* VEC_SET == SSE_128 */
5612 
5613 #if VEC_SET == AVX_256
5614           h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5615           h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5616 #endif /* VEC_SET == AVX_256 */
5617 
5618 #if VEC_SET == AVX_512
5619          h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5620          h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5621 #endif /* VEC_SET == AVX_512 */
5622 
5623 #ifndef __ELPA_USE_FMA__
5624           // conjugate
5625           h1_imag = _SIMD_XOR(h1_imag, sign);
5626 #endif
5627 
5628           q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5629 
5630           tmp1 = _SIMD_MUL(h1_imag, q1);
5631 #ifdef __ELPA_USE_FMA__
5632           x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5633 #else
5634           x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5635 #endif
5636 
5637 #ifdef BLOCK2
5638 
5639 #if VEC_SET == SSE_128
5640 #ifdef DOUBLE_PRECISION_COMPLEX
5641           h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5642           h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5643 #endif
5644 #ifdef SINGLE_PRECISION_COMPLEX
5645           h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
5646           h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
5647 #endif
5648 #endif /* VEC_SET == SSE_128 */
5649 
5650 #if VEC_SET == AVX_256
5651           h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
5652           h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
5653 #endif /* VEC_SET == AVX_256 */
5654 
5655 #if VEC_SET == AVX_512
5656           h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
5657           h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
5658 #endif /* VEC_SET == AVX_512 */
5659 
5660 #ifndef __ELPA_USE_FMA__
5661           // conjugate
5662           h2_imag = _SIMD_XOR(h2_imag, sign);
5663 #endif
5664 
5665           tmp1 = _SIMD_MUL(h2_imag, q1);
5666 #ifdef __ELPA_USE_FMA__
5667           y1 = _SIMD_ADD(y1, _SIMD_FMSUBADD(h2_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5668 #else
5669           y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5670 #endif
5671 
5672 #endif /* BLOCK2 */
5673 
5674      }
5675 
5676 #ifdef BLOCK2
5677 
5678 #if VEC_SET == SSE_128
5679 #ifdef DOUBLE_PRECISION_COMPLEX
5680      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
5681      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
5682 #endif
5683 #ifdef SINGLE_PRECISION_COMPLEX
5684      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
5685      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
5686 #endif
5687 #endif /* VEC_SET == SSE_128 */
5688 
5689 #if VEC_SET == AVX_256
5690      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
5691      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
5692 #endif /* VEC_SET == AVX_256 */
5693 
5694 #if VEC_SET == AVX_512
5695      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
5696      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
5697 #endif /* VEC_SET == AVX_512 */
5698 
5699 #ifndef __ELPA_USE_FMA__
5700      // conjugate
5701      h1_imag = _SIMD_XOR(h1_imag, sign);
5702 #endif
5703 
5704      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
5705 
5706      tmp1 = _SIMD_MUL(h1_imag, q1);
5707 #ifdef __ELPA_USE_FMA__
5708      x1 = _SIMD_ADD(x1, _SIMD_FMSUBADD(h1_real, q1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5709 #else
5710      x1 = _SIMD_ADD(x1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, q1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5711 #endif
5712 
5713 #endif /* BLOCK2 */
5714 
5715 #if VEC_SET == SSE_128
5716 #ifdef DOUBLE_PRECISION_COMPLEX
5717      h1_real = _mm_loaddup_pd(&hh_dbl[0]);
5718      h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
5719 #endif
5720 #ifdef SINGLE_PRECISION_COMPLEX
5721      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
5722      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
5723 #endif
5724 #endif /* VEC_SET == SSE_128 */
5725 
5726 #if VEC_SET == AVX_256
5727     h1_real = _SIMD_BROADCAST(&hh_dbl[0]);
5728     h1_imag = _SIMD_BROADCAST(&hh_dbl[1]);
5729 #endif /*  VEC_SET == AVX_256 */
5730 
5731 #if VEC_SET == AVX_512
5732     h1_real = _SIMD_SET1(hh_dbl[0]);
5733     h1_imag = _SIMD_SET1(hh_dbl[1]);
5734 
5735 #ifdef HAVE_AVX512_XEON_PHI
5736 #ifdef DOUBLE_PRECISION_COMPLEX
5737         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5738         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5739 #endif
5740 #ifdef SINGLE_PRECISION_COMPLEX
5741         h1_real = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_real, (__m512i) sign);
5742         h1_imag = (__SIMD_DATATYPE) _SIMD_XOR_EPI((__m512i) h1_imag, (__m512i) sign);
5743 #endif
5744 #endif
5745 #ifdef HAVE_AVX512_XEON
5746         h1_real = _SIMD_XOR(h1_real, sign);
5747         h1_imag = _SIMD_XOR(h1_imag, sign);
5748 #endif
5749 
5750 #endif /* VEC_SET == AVX_512 */
5751 
5752 #if VEC_SET != AVX_512
5753      h1_real = _SIMD_XOR(h1_real, sign);
5754      h1_imag = _SIMD_XOR(h1_imag, sign);
5755 #endif /* VEC_SET != AVX_512 */
5756 
5757      tmp1 = _SIMD_MUL(h1_imag, x1);
5758 #ifdef __ELPA_USE_FMA__
5759      x1 = _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5760 #else
5761      x1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5762 #endif
5763 
5764 #ifdef BLOCK2
5765 
5766 #if VEC_SET == SSE_128
5767 #ifdef DOUBLE_PRECISION_COMPLEX
5768      h1_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5769      h1_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5770 #endif
5771 #ifdef SINGLE_PRECISION_COMPLEX
5772      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5773      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5774 #endif
5775 
5776 #ifdef DOUBLE_PRECISION_COMPLEX
5777      h2_real = _mm_loaddup_pd(&hh_dbl[ldh*2]);
5778      h2_imag = _mm_loaddup_pd(&hh_dbl[(ldh*2)+1]);
5779 #endif
5780 #ifdef SINGLE_PRECISION_COMPLEX
5781      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
5782      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
5783 #endif
5784 #endif /* VEC_SET == SSE_128 */
5785 
5786 #if VEC_SET == AVX_256
5787      h1_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5788      h1_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5789      h2_real = _SIMD_BROADCAST(&hh_dbl[ldh*2]);
5790      h2_imag = _SIMD_BROADCAST(&hh_dbl[(ldh*2)+1]);
5791 #endif /* VEC_SET == AVX_256 */
5792 
5793 #if VEC_SET == AVX_512
5794      h1_real = _SIMD_SET1(hh_dbl[ldh*2]);
5795      h1_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5796      h2_real = _SIMD_SET1(hh_dbl[ldh*2]);
5797      h2_imag = _SIMD_SET1(hh_dbl[(ldh*2)+1]);
5798 
5799 #ifdef HAVE_AVX512_XEON_PHI
5800 #ifdef DOUBLE_PRECISION_COMPLEX
5801      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_real, (__m512i) sign);
5802      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h1_imag, (__m512i) sign);
5803 #endif
5804 #ifdef SINGLE_PRECISION_COMPLEX
5805      h1_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_real, (__m512i) sign);
5806      h1_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h1_imag, (__m512i) sign);
5807 #endif
5808 
5809 #ifdef DOUBLE_PRECISION_COMPLEX
5810      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
5811      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi64((__m512i) h2_imag, (__m512i) sign);
5812 #endif
5813 #ifdef SINGLE_PRECISION_COMPLEX
5814      h2_real = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_real, (__m512i) sign);
5815      h2_imag = (__SIMD_DATATYPE) _mm512_xor_epi32((__m512i) h2_imag, (__m512i) sign);
5816 #endif
5817 #endif
5818 #ifdef HAVE_AVX512_XEON
5819      h1_real = _SIMD_XOR(h1_real, sign);
5820      h1_imag = _SIMD_XOR(h1_imag, sign);
5821      h2_real = _SIMD_XOR(h2_real, sign);
5822      h2_imag = _SIMD_XOR(h2_imag, sign);
5823 #endif
5824 
5825 #endif /* VEC_SET == AVX_512 */
5826 
5827 #if VEC_SET != AVX_512
5828      h1_real = _SIMD_XOR(h1_real, sign);
5829      h1_imag = _SIMD_XOR(h1_imag, sign);
5830      h2_real = _SIMD_XOR(h2_real, sign);
5831      h2_imag = _SIMD_XOR(h2_imag, sign);
5832 #endif
5833 
5834 #if VEC_SET == SSE_128
5835 #ifdef SINGLE_PRECISION_COMPLEX
5836      tmp2 = _mm_castpd_ps(_mm_load_pd1((double *) s_dbl));
5837 #else
5838      tmp2 = _SIMD_LOADU(s_dbl);
5839 #endif
5840 #endif /* VEC_SET == SSE_128 */
5841 
5842 #if VEC_SET == AVX_256
5843 #ifdef DOUBLE_PRECISION_COMPLEX
5844      tmp2 = _mm256_set_pd(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5845 #endif
5846 #ifdef SINGLE_PRECISION_COMPLEX
5847      tmp2 = _mm256_set_ps(s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0],
5848                           s_dbl[1], s_dbl[0], s_dbl[1], s_dbl[0]);
5849 #endif
5850 #endif /* VEC_SET == AVX_256 */
5851 
5852 #if VEC_SET == AVX_512
5853 #ifdef DOUBLE_PRECISION_COMPLEX
5854      tmp2 = _SIMD_SET(s_dbl[1], s_dbl[0],
5855                           s_dbl[1], s_dbl[0],
5856                           s_dbl[1], s_dbl[0],
5857                           s_dbl[1], s_dbl[0]);
5858 #endif
5859 #ifdef SINGLE_PRECISION_COMPLEX
5860      tmp2 = (__SIMD_DATATYPE) _mm512_set1_pd(*(double*)(&s_dbl[0]));
5861 #endif
5862 
5863 #endif /* VEC_SET == AVX_512 */
5864 
5865      tmp1 = _SIMD_MUL(h2_imag, tmp2);
5866 #ifdef __ELPA_USE_FMA__
5867      tmp2 = _SIMD_FMADDSUB(h2_real, tmp2, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5868 #else
5869      tmp2 = _SIMD_ADDSUB( _SIMD_MUL(h2_real, tmp2), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5870 #endif
5871 
5872 #if VEC_SET == AVX_512
5873      _SIMD_MASK_STOREU(s_dbl, 0x01 + 0x02, tmp2);
5874 
5875      h2_real = _SIMD_SET1(s_dbl[0]);
5876      h2_imag = _SIMD_SET1(s_dbl[1]);
5877 #endif
5878 
5879 #if VEC_SET == SSE_128
5880 #ifdef DOUBLE_PRECISION_COMPLEX
5881      h2_real = _mm_movedup_pd(tmp2);
5882      h2_imag = _mm_set1_pd(tmp2[1]);
5883 #endif
5884 #ifdef SINGLE_PRECISION_COMPLEX
5885      h2_real = _mm_moveldup_ps(tmp2);
5886      h2_imag = _mm_movehdup_ps(tmp2);
5887 #endif
5888 #endif /*  VEC_SET == SSE_128 */
5889 
5890 #if VEC_SET == AVX_256
5891      h2_real = _SIMD_SET1(tmp2[0]);
5892      h2_imag = _SIMD_SET1(tmp2[1]);
5893 #endif /* VEC_SET == AVX_256 */
5894 
5895      tmp1 = _SIMD_MUL(h1_imag, y1);
5896 #ifdef __ELPA_USE_FMA__
5897      y1 = _SIMD_FMADDSUB(h1_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5898 #else
5899      y1 = _SIMD_ADDSUB( _SIMD_MUL(h1_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE));
5900 #endif
5901 
5902      tmp1 = _SIMD_MUL(h2_imag, x1);
5903 #ifdef __ELPA_USE_FMA__
5904      y1 = _SIMD_ADD(y1, _SIMD_FMADDSUB(h2_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5905 #else
5906      y1 = _SIMD_ADD(y1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5907 #endif
5908 
5909 #endif /* BLOCK2 */
5910 
5911      q1 = _SIMD_LOAD(&q_dbl[0]);
5912 
5913 #ifdef BLOCK1
5914      q1 = _SIMD_ADD(q1, x1);
5915 #endif
5916 
5917 #ifdef BLOCK2
5918      q1 = _SIMD_ADD(q1, y1);
5919 #endif
5920      _SIMD_STORE(&q_dbl[0], q1);
5921 
5922 #ifdef BLOCK2
5923 
5924 #if VEC_SET == SSE_128
5925 #ifdef DOUBLE_PRECISION_COMPLEX
5926      h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+1)*2]);
5927      h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+1)*2)+1]);
5928 #endif
5929 #ifdef SINGLE_PRECISION_COMPLEX
5930      h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
5931      h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
5932 #endif
5933 #endif /* VEC_SET == SSE_128 */
5934 
5935 #if VEC_SET == AVX_256
5936      h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+1)*2]);
5937      h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+1)*2)+1]);
5938 #endif /* VEC_SET == AVX_256 */
5939 
5940 #if VEC_SET == AVX_512
5941      h2_real = _SIMD_SET1(hh_dbl[(ldh+1)*2]);
5942      h2_imag = _SIMD_SET1(hh_dbl[((ldh+1)*2)+1]);
5943 #endif /* VEC_SET == AVX_512 */
5944 
5945      q1 = _SIMD_LOAD(&q_dbl[(ldq*2)+0]);
5946 
5947      q1 = _SIMD_ADD(q1, x1);
5948 
5949      tmp1 = _SIMD_MUL(h2_imag, y1);
5950 #ifdef __ELPA_USE_FMA__
5951      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5952 #else
5953      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5954 #endif
5955 
5956      _SIMD_STORE(&q_dbl[(ldq*2)+0], q1);
5957 
5958 #endif /* BLOCK2 */
5959 
5960      for (i = BLOCK; i < nb; i++)
5961      {
5962 #if VEC_SET == SSE_128
5963 #ifdef DOUBLE_PRECISION_COMPLEX
5964         h1_real = _mm_loaddup_pd(&hh_dbl[(i-BLOCK+1)*2]);
5965         h1_imag = _mm_loaddup_pd(&hh_dbl[((i-BLOCK+1)*2)+1]);
5966 #endif
5967 #ifdef SINGLE_PRECISION_COMPLEX
5968         h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-BLOCK+1)*2]) )));
5969         h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-BLOCK+1)*2)+1]) )));
5970 #endif
5971 #endif /* VEC_SET == SSE_128 */
5972 
5973 #if VEC_SET == AVX_256
5974         h1_real = _SIMD_BROADCAST(&hh_dbl[(i-BLOCK+1)*2]);
5975         h1_imag = _SIMD_BROADCAST(&hh_dbl[((i-BLOCK+1)*2)+1]);
5976 #endif /* VEC_SET == AVX_256 */
5977 
5978 #if VEC_SET == AVX_512
5979         h1_real = _SIMD_SET1(hh_dbl[(i-BLOCK+1)*2]);
5980         h1_imag = _SIMD_SET1(hh_dbl[((i-BLOCK+1)*2)+1]);
5981 #endif /* VEC_SET == AVX_512 */
5982 
5983         q1 = _SIMD_LOAD(&q_dbl[(2*i*ldq)+0]);
5984 
5985         tmp1 = _SIMD_MUL(h1_imag, x1);
5986 #ifdef __ELPA_USE_FMA__
5987         q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5988 #else
5989         q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
5990 #endif
5991 
5992 #ifdef BLOCK2
5993 
5994 #if VEC_SET == SSE_128
5995 #ifdef DOUBLE_PRECISION_COMPLEX
5996         h2_real = _mm_loaddup_pd(&hh_dbl[(ldh+i)*2]);
5997         h2_imag = _mm_loaddup_pd(&hh_dbl[((ldh+i)*2)+1]);
5998 #endif
5999 #ifdef SINGLE_PRECISION_COMPLEX
6000         h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
6001         h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
6002 #endif
6003 #endif /* VEC_SET == SSE_128 */
6004 
6005 #if VEC_SET == AVX_256
6006         h2_real = _SIMD_BROADCAST(&hh_dbl[(ldh+i)*2]);
6007         h2_imag = _SIMD_BROADCAST(&hh_dbl[((ldh+i)*2)+1]);
6008 #endif /* VEC_SET == AVX_256 */
6009 
6010 #if VEC_SET == AVX_512
6011         h2_real = _SIMD_SET1(hh_dbl[(ldh+i)*2]);
6012         h2_imag = _SIMD_SET1(hh_dbl[((ldh+i)*2)+1]);
6013 #endif /* VEC_SET == AVX_512 */
6014 
6015         tmp1 = _SIMD_MUL(h2_imag, y1);
6016 #ifdef __ELPA_USE_FMA__
6017         q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h2_real, y1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6018 #else
6019         q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h2_real, y1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6020 #endif
6021 #endif /* BLOCK2 */
6022 
6023         _SIMD_STORE(&q_dbl[(2*i*ldq)+0], q1);
6024     }
6025 #ifdef BLOCK2
6026 
6027 #if VEC_SET == SSE_128
6028 #ifdef DOUBLE_PRECISION_COMPLEX
6029      h1_real = _mm_loaddup_pd(&hh_dbl[(nb-1)*2]);
6030      h1_imag = _mm_loaddup_pd(&hh_dbl[((nb-1)*2)+1]);
6031 #endif
6032 #ifdef SINGLE_PRECISION_COMPLEX
6033      h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
6034      h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
6035 #endif
6036 #endif /* VEC_SET == SSE_128 */
6037 
6038 #if VEC_SET == AVX_256
6039      h1_real = _SIMD_BROADCAST(&hh_dbl[(nb-1)*2]);
6040      h1_imag = _SIMD_BROADCAST(&hh_dbl[((nb-1)*2)+1]);
6041 #endif /* VEC_SET == AVX_256 */
6042 
6043 #if VEC_SET == AVX_512
6044      h1_real = _SIMD_SET1(hh_dbl[(nb-1)*2]);
6045      h1_imag = _SIMD_SET1(hh_dbl[((nb-1)*2)+1]);
6046 #endif /* VEC_SET == AVX_512 */
6047 
6048      q1 = _SIMD_LOAD(&q_dbl[(2*nb*ldq)+0]);
6049 
6050      tmp1 = _SIMD_MUL(h1_imag, x1);
6051 #ifdef __ELPA_USE_FMA__
6052      q1 = _SIMD_ADD(q1, _SIMD_FMADDSUB(h1_real, x1, _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6053 #else
6054      q1 = _SIMD_ADD(q1, _SIMD_ADDSUB( _SIMD_MUL(h1_real, x1), _SIMD_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
6055 #endif
6056 
6057      _SIMD_STORE(&q_dbl[(2*nb*ldq)+0], q1);
6058 
6059 #endif /* BLOCK2 */
6060 
6061 }
6062