1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
FLA_Apply_G_rf_opt_var9(FLA_Obj G,FLA_Obj A)13 FLA_Error FLA_Apply_G_rf_opt_var9( FLA_Obj G, FLA_Obj A )
14 /*
15 Apply k sets of Givens rotations to a matrix A from the right,
16 where each set takes the form:
17
18 A := A ( G(n-1,k) ... G(1,k) G(0,k) )'
19 = A G(0,k)' G(1,k)' ... G(n-1,k)'
20
21 where Gik is the ith Givens rotation formed from the kth set,
22 stored in the (i,k) entries of of G:
23
24 Gik = / gamma_ik -sigma_ik \
25 \ sigma_ik gamma_ik /
26
27 This variant iterates in pipelined, overlapping fashion and
28 applies rotations to two columns at a time.
29
30 -FGVZ
31 */
32 {
33 FLA_Datatype datatype;
34 int k_G, m_A, n_A;
35 int rs_G, cs_G;
36 int rs_A, cs_A;
37
38 datatype = FLA_Obj_datatype( A );
39
40 k_G = FLA_Obj_width( G );
41 m_A = FLA_Obj_length( A );
42 n_A = FLA_Obj_width( A );
43
44 rs_G = FLA_Obj_row_stride( G );
45 cs_G = FLA_Obj_col_stride( G );
46
47 rs_A = FLA_Obj_row_stride( A );
48 cs_A = FLA_Obj_col_stride( A );
49
50 switch ( datatype )
51 {
52 case FLA_FLOAT:
53 {
54 scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G );
55 float* buff_A = ( float* ) FLA_FLOAT_PTR( A );
56
57 FLA_Apply_G_rf_ops_var9( k_G,
58 m_A,
59 n_A,
60 buff_G, rs_G, cs_G,
61 buff_A, rs_A, cs_A );
62
63 break;
64 }
65
66 case FLA_DOUBLE:
67 {
68 dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
69 double* buff_A = ( double* ) FLA_DOUBLE_PTR( A );
70
71 FLA_Apply_G_rf_opd_var9( k_G,
72 m_A,
73 n_A,
74 buff_G, rs_G, cs_G,
75 buff_A, rs_A, cs_A );
76
77 break;
78 }
79
80 case FLA_COMPLEX:
81 {
82 scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G );
83 scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A );
84
85 FLA_Apply_G_rf_opc_var9( k_G,
86 m_A,
87 n_A,
88 buff_G, rs_G, cs_G,
89 buff_A, rs_A, cs_A );
90
91 break;
92 }
93
94 case FLA_DOUBLE_COMPLEX:
95 {
96 dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
97 dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );
98
99 FLA_Apply_G_rf_opz_var9( k_G,
100 m_A,
101 n_A,
102 buff_G, rs_G, cs_G,
103 buff_A, rs_A, cs_A );
104
105 break;
106 }
107 }
108
109 return FLA_SUCCESS;
110 }
111
112
FLA_Apply_G_rf_ops_var9(int k_G,int m_A,int n_A,scomplex * buff_G,int rs_G,int cs_G,float * buff_A,int rs_A,int cs_A)113 FLA_Error FLA_Apply_G_rf_ops_var9( int k_G,
114 int m_A,
115 int n_A,
116 scomplex* buff_G, int rs_G, int cs_G,
117 float* buff_A, int rs_A, int cs_A )
118 {
119 float one = bl1_s1();
120 float zero = bl1_s0();
121 float gamma12;
122 float sigma12;
123 float gamma23;
124 float sigma23;
125 float* a1;
126 float* a2;
127 float* a3;
128 scomplex* g12;
129 scomplex* g23;
130 int i, j, g, k;
131 int nG, nG_app;
132 int n_iter;
133 int n_left;
134 int k_minus_1;
135 int n_fuse;
136 int is_ident12, is_ident23;
137
138 k_minus_1 = k_G - 1;
139 nG = n_A - 1;
140 n_fuse = 2;
141
142 // Use the simple variant for nG < (k - 1) or k == 1.
143 if ( nG < 2*k_minus_1 || k_G == 1 )
144 {
145 FLA_Apply_G_rf_ops_var1( k_G,
146 m_A,
147 n_A,
148 buff_G, rs_G, cs_G,
149 buff_A, rs_A, cs_A );
150 return FLA_SUCCESS;
151 }
152
153
154 // Start-up phase.
155
156 for ( j = -1; j < k_minus_1; j += n_fuse )
157 {
158 nG_app = j + 1;
159 n_iter = nG_app;
160 n_left = 1;
161
162 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
163 {
164 g12 = buff_G + (g )*rs_G + (k )*cs_G;
165 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
166 a1 = buff_A + (g )*cs_A;
167 a2 = buff_A + (g + 1)*cs_A;
168 a3 = buff_A + (g + 2)*cs_A;
169
170 gamma12 = g12->real;
171 sigma12 = g12->imag;
172 gamma23 = g23->real;
173 sigma23 = g23->imag;
174
175 is_ident12 = ( gamma12 == one && sigma12 == zero );
176 is_ident23 = ( gamma23 == one && sigma23 == zero );
177
178 if ( !is_ident12 && is_ident23 )
179 {
180 // Apply only to columns 1 and 2.
181
182 MAC_Apply_G_mx2_ops( m_A,
183 &gamma12,
184 &sigma12,
185 a1, rs_A,
186 a2, rs_A );
187 }
188 else if ( is_ident12 && !is_ident23 )
189 {
190 // Apply only to columns 2 and 3.
191
192 MAC_Apply_G_mx2_ops( m_A,
193 &gamma23,
194 &sigma23,
195 a2, rs_A,
196 a3, rs_A );
197 }
198 else if ( !is_ident12 && !is_ident23 )
199 {
200 // Apply to all three columns.
201
202 MAC_Apply_G_mx3_ops( m_A,
203 &gamma12,
204 &sigma12,
205 &gamma23,
206 &sigma23,
207 a1, rs_A,
208 a2, rs_A,
209 a3, rs_A );
210 }
211 }
212
213 if ( n_left == 1 )
214 {
215 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
216 a2 = buff_A + (g + 1)*cs_A;
217 a3 = buff_A + (g + 2)*cs_A;
218
219 gamma23 = g23->real;
220 sigma23 = g23->imag;
221
222 is_ident23 = ( gamma23 == one && sigma23 == zero );
223
224 if ( !is_ident23 )
225 MAC_Apply_G_mx2_ops( m_A,
226 &gamma23,
227 &sigma23,
228 a2, rs_A,
229 a3, rs_A );
230 }
231 }
232
233 // Pipeline stage
234
235 for ( ; j < nG - 1; j += n_fuse )
236 {
237 nG_app = k_G;
238 n_iter = nG_app;
239 n_left = 0;
240
241 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
242 {
243 g12 = buff_G + (g )*rs_G + (k )*cs_G;
244 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
245 a1 = buff_A + (g )*cs_A;
246 a2 = buff_A + (g + 1)*cs_A;
247 a3 = buff_A + (g + 2)*cs_A;
248
249 gamma12 = g12->real;
250 sigma12 = g12->imag;
251 gamma23 = g23->real;
252 sigma23 = g23->imag;
253
254 is_ident12 = ( gamma12 == one && sigma12 == zero );
255 is_ident23 = ( gamma23 == one && sigma23 == zero );
256
257 if ( !is_ident12 && is_ident23 )
258 {
259 // Apply only to columns 1 and 2.
260
261 MAC_Apply_G_mx2_ops( m_A,
262 &gamma12,
263 &sigma12,
264 a1, rs_A,
265 a2, rs_A );
266 }
267 else if ( is_ident12 && !is_ident23 )
268 {
269 // Apply only to columns 2 and 3.
270
271 MAC_Apply_G_mx2_ops( m_A,
272 &gamma23,
273 &sigma23,
274 a2, rs_A,
275 a3, rs_A );
276 }
277 else if ( !is_ident12 && !is_ident23 )
278 {
279 // Apply to all three columns.
280
281 MAC_Apply_G_mx3_ops( m_A,
282 &gamma12,
283 &sigma12,
284 &gamma23,
285 &sigma23,
286 a1, rs_A,
287 a2, rs_A,
288 a3, rs_A );
289 }
290 }
291 }
292
293 // Shutdown stage
294
295 for ( j = nG % n_fuse; j < k_G; j += n_fuse )
296 {
297 g = nG - 1;
298 k = j;
299
300 n_left = 1;
301 if ( n_left == 1 )
302 {
303 g12 = buff_G + (g )*rs_G + (k )*cs_G;
304 a1 = buff_A + (g )*cs_A;
305 a2 = buff_A + (g + 1)*cs_A;
306
307 gamma12 = g12->real;
308 sigma12 = g12->imag;
309
310 is_ident12 = ( gamma12 == one && sigma12 == zero );
311
312 if ( !is_ident12 )
313 MAC_Apply_G_mx2_ops( m_A,
314 &gamma12,
315 &sigma12,
316 a1, rs_A,
317 a2, rs_A );
318 ++k;
319 --g;
320 }
321
322 nG_app = k_minus_1 - j;
323 n_iter = nG_app;
324
325 for ( i = 0; i < n_iter; ++i, ++k, --g )
326 {
327 g12 = buff_G + (g )*rs_G + (k )*cs_G;
328 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
329 a1 = buff_A + (g )*cs_A;
330 a2 = buff_A + (g + 1)*cs_A;
331 a3 = buff_A + (g + 2)*cs_A;
332
333 gamma12 = g12->real;
334 sigma12 = g12->imag;
335 gamma23 = g23->real;
336 sigma23 = g23->imag;
337
338 is_ident12 = ( gamma12 == one && sigma12 == zero );
339 is_ident23 = ( gamma23 == one && sigma23 == zero );
340
341 if ( !is_ident12 && is_ident23 )
342 {
343 // Apply only to columns 1 and 2.
344
345 MAC_Apply_G_mx2_ops( m_A,
346 &gamma12,
347 &sigma12,
348 a1, rs_A,
349 a2, rs_A );
350 }
351 else if ( is_ident12 && !is_ident23 )
352 {
353 // Apply only to columns 2 and 3.
354
355 MAC_Apply_G_mx2_ops( m_A,
356 &gamma23,
357 &sigma23,
358 a2, rs_A,
359 a3, rs_A );
360 }
361 else if ( !is_ident12 && !is_ident23 )
362 {
363 // Apply to all three columns.
364
365 MAC_Apply_G_mx3_ops( m_A,
366 &gamma12,
367 &sigma12,
368 &gamma23,
369 &sigma23,
370 a1, rs_A,
371 a2, rs_A,
372 a3, rs_A );
373 }
374 }
375 }
376
377 return FLA_SUCCESS;
378 }
379
FLA_Apply_G_rf_opd_var9(int k_G,int m_A,int n_A,dcomplex * buff_G,int rs_G,int cs_G,double * buff_A,int rs_A,int cs_A)380 FLA_Error FLA_Apply_G_rf_opd_var9( int k_G,
381 int m_A,
382 int n_A,
383 dcomplex* buff_G, int rs_G, int cs_G,
384 double* buff_A, int rs_A, int cs_A )
385 {
386 double one = bl1_d1();
387 double zero = bl1_d0();
388 double gamma12;
389 double sigma12;
390 double gamma23;
391 double sigma23;
392 double* a1;
393 double* a2;
394 double* a3;
395 dcomplex* g12;
396 dcomplex* g23;
397 int i, j, g, k;
398 int nG, nG_app;
399 int n_iter;
400 int n_left;
401 int k_minus_1;
402 int n_fuse;
403 int is_ident12, is_ident23;
404
405 k_minus_1 = k_G - 1;
406 nG = n_A - 1;
407 n_fuse = 2;
408
409 // Use the simple variant for nG < (k - 1) or k == 1.
410 if ( nG < 2*k_minus_1 || k_G == 1 )
411 {
412 FLA_Apply_G_rf_opd_var1( k_G,
413 m_A,
414 n_A,
415 buff_G, rs_G, cs_G,
416 buff_A, rs_A, cs_A );
417 return FLA_SUCCESS;
418 }
419
420
421 // Start-up phase.
422
423 for ( j = -1; j < k_minus_1; j += n_fuse )
424 {
425 nG_app = j + 1;
426 n_iter = nG_app;
427 n_left = 1;
428
429 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
430 {
431 g12 = buff_G + (g )*rs_G + (k )*cs_G;
432 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
433 a1 = buff_A + (g )*cs_A;
434 a2 = buff_A + (g + 1)*cs_A;
435 a3 = buff_A + (g + 2)*cs_A;
436
437 gamma12 = g12->real;
438 sigma12 = g12->imag;
439 gamma23 = g23->real;
440 sigma23 = g23->imag;
441
442 is_ident12 = ( gamma12 == one && sigma12 == zero );
443 is_ident23 = ( gamma23 == one && sigma23 == zero );
444
445 if ( !is_ident12 && is_ident23 )
446 {
447 // Apply only to columns 1 and 2.
448
449 MAC_Apply_G_mx2_opd( m_A,
450 &gamma12,
451 &sigma12,
452 a1, rs_A,
453 a2, rs_A );
454 }
455 else if ( is_ident12 && !is_ident23 )
456 {
457 // Apply only to columns 2 and 3.
458
459 MAC_Apply_G_mx2_opd( m_A,
460 &gamma23,
461 &sigma23,
462 a2, rs_A,
463 a3, rs_A );
464 }
465 else if ( !is_ident12 && !is_ident23 )
466 {
467 // Apply to all three columns.
468
469 MAC_Apply_G_mx3_opd( m_A,
470 &gamma12,
471 &sigma12,
472 &gamma23,
473 &sigma23,
474 a1, rs_A,
475 a2, rs_A,
476 a3, rs_A );
477 }
478 }
479
480 if ( n_left == 1 )
481 {
482 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
483 a2 = buff_A + (g + 1)*cs_A;
484 a3 = buff_A + (g + 2)*cs_A;
485
486 gamma23 = g23->real;
487 sigma23 = g23->imag;
488
489 is_ident23 = ( gamma23 == one && sigma23 == zero );
490
491 if ( !is_ident23 )
492 MAC_Apply_G_mx2_opd( m_A,
493 &gamma23,
494 &sigma23,
495 a2, rs_A,
496 a3, rs_A );
497 }
498 }
499
500 // Pipeline stage
501
502 for ( ; j < nG - 1; j += n_fuse )
503 {
504 nG_app = k_G;
505 n_iter = nG_app;
506 n_left = 0;
507
508 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
509 {
510 g12 = buff_G + (g )*rs_G + (k )*cs_G;
511 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
512 a1 = buff_A + (g )*cs_A;
513 a2 = buff_A + (g + 1)*cs_A;
514 a3 = buff_A + (g + 2)*cs_A;
515
516 gamma12 = g12->real;
517 sigma12 = g12->imag;
518 gamma23 = g23->real;
519 sigma23 = g23->imag;
520
521 is_ident12 = ( gamma12 == one && sigma12 == zero );
522 is_ident23 = ( gamma23 == one && sigma23 == zero );
523
524 if ( !is_ident12 && is_ident23 )
525 {
526 // Apply only to columns 1 and 2.
527
528 MAC_Apply_G_mx2_opd( m_A,
529 &gamma12,
530 &sigma12,
531 a1, rs_A,
532 a2, rs_A );
533 }
534 else if ( is_ident12 && !is_ident23 )
535 {
536 // Apply only to columns 2 and 3.
537
538 MAC_Apply_G_mx2_opd( m_A,
539 &gamma23,
540 &sigma23,
541 a2, rs_A,
542 a3, rs_A );
543 }
544 else if ( !is_ident12 && !is_ident23 )
545 {
546 // Apply to all three columns.
547
548 MAC_Apply_G_mx3_opd( m_A,
549 &gamma12,
550 &sigma12,
551 &gamma23,
552 &sigma23,
553 a1, rs_A,
554 a2, rs_A,
555 a3, rs_A );
556 }
557 }
558 }
559
560 // Shutdown stage
561
562 for ( j = nG % n_fuse; j < k_G; j += n_fuse )
563 {
564 g = nG - 1;
565 k = j;
566
567 n_left = 1;
568 if ( n_left == 1 )
569 {
570 g12 = buff_G + (g )*rs_G + (k )*cs_G;
571 a1 = buff_A + (g )*cs_A;
572 a2 = buff_A + (g + 1)*cs_A;
573
574 gamma12 = g12->real;
575 sigma12 = g12->imag;
576
577 is_ident12 = ( gamma12 == one && sigma12 == zero );
578
579 if ( !is_ident12 )
580 MAC_Apply_G_mx2_opd( m_A,
581 &gamma12,
582 &sigma12,
583 a1, rs_A,
584 a2, rs_A );
585 ++k;
586 --g;
587 }
588
589 nG_app = k_minus_1 - j;
590 n_iter = nG_app;
591
592 for ( i = 0; i < n_iter; ++i, ++k, --g )
593 {
594 g12 = buff_G + (g )*rs_G + (k )*cs_G;
595 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
596 a1 = buff_A + (g )*cs_A;
597 a2 = buff_A + (g + 1)*cs_A;
598 a3 = buff_A + (g + 2)*cs_A;
599
600 gamma12 = g12->real;
601 sigma12 = g12->imag;
602 gamma23 = g23->real;
603 sigma23 = g23->imag;
604
605 is_ident12 = ( gamma12 == one && sigma12 == zero );
606 is_ident23 = ( gamma23 == one && sigma23 == zero );
607
608 if ( !is_ident12 && is_ident23 )
609 {
610 // Apply only to columns 1 and 2.
611
612 MAC_Apply_G_mx2_opd( m_A,
613 &gamma12,
614 &sigma12,
615 a1, rs_A,
616 a2, rs_A );
617 }
618 else if ( is_ident12 && !is_ident23 )
619 {
620 // Apply only to columns 2 and 3.
621
622 MAC_Apply_G_mx2_opd( m_A,
623 &gamma23,
624 &sigma23,
625 a2, rs_A,
626 a3, rs_A );
627 }
628 else if ( !is_ident12 && !is_ident23 )
629 {
630 // Apply to all three columns.
631
632 MAC_Apply_G_mx3_opd( m_A,
633 &gamma12,
634 &sigma12,
635 &gamma23,
636 &sigma23,
637 a1, rs_A,
638 a2, rs_A,
639 a3, rs_A );
640 }
641 }
642 }
643
644 return FLA_SUCCESS;
645 }
646
FLA_Apply_G_rf_opc_var9(int k_G,int m_A,int n_A,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_A,int rs_A,int cs_A)647 FLA_Error FLA_Apply_G_rf_opc_var9( int k_G,
648 int m_A,
649 int n_A,
650 scomplex* buff_G, int rs_G, int cs_G,
651 scomplex* buff_A, int rs_A, int cs_A )
652 {
653 float one = bl1_s1();
654 float zero = bl1_s0();
655 float gamma12;
656 float sigma12;
657 float gamma23;
658 float sigma23;
659 scomplex* a1;
660 scomplex* a2;
661 scomplex* a3;
662 scomplex* g12;
663 scomplex* g23;
664 int i, j, g, k;
665 int nG, nG_app;
666 int n_iter;
667 int n_left;
668 int k_minus_1;
669 int n_fuse;
670 int is_ident12, is_ident23;
671
672 k_minus_1 = k_G - 1;
673 nG = n_A - 1;
674 n_fuse = 2;
675
676 // Use the simple variant for nG < (k - 1) or k == 1.
677 if ( nG < 2*k_minus_1 || k_G == 1 )
678 {
679 FLA_Apply_G_rf_opc_var1( k_G,
680 m_A,
681 n_A,
682 buff_G, rs_G, cs_G,
683 buff_A, rs_A, cs_A );
684 return FLA_SUCCESS;
685 }
686
687
688 // Start-up phase.
689
690 for ( j = -1; j < k_minus_1; j += n_fuse )
691 {
692 nG_app = j + 1;
693 n_iter = nG_app;
694 n_left = 1;
695
696 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
697 {
698 g12 = buff_G + (g )*rs_G + (k )*cs_G;
699 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
700 a1 = buff_A + (g )*cs_A;
701 a2 = buff_A + (g + 1)*cs_A;
702 a3 = buff_A + (g + 2)*cs_A;
703
704 gamma12 = g12->real;
705 sigma12 = g12->imag;
706 gamma23 = g23->real;
707 sigma23 = g23->imag;
708
709 is_ident12 = ( gamma12 == one && sigma12 == zero );
710 is_ident23 = ( gamma23 == one && sigma23 == zero );
711
712 if ( !is_ident12 && is_ident23 )
713 {
714 // Apply only to columns 1 and 2.
715
716 MAC_Apply_G_mx2_opc( m_A,
717 &gamma12,
718 &sigma12,
719 a1, rs_A,
720 a2, rs_A );
721 }
722 else if ( is_ident12 && !is_ident23 )
723 {
724 // Apply only to columns 2 and 3.
725
726 MAC_Apply_G_mx2_opc( m_A,
727 &gamma23,
728 &sigma23,
729 a2, rs_A,
730 a3, rs_A );
731 }
732 else if ( !is_ident12 && !is_ident23 )
733 {
734 // Apply to all three columns.
735
736 MAC_Apply_G_mx3_opc( m_A,
737 &gamma12,
738 &sigma12,
739 &gamma23,
740 &sigma23,
741 a1, rs_A,
742 a2, rs_A,
743 a3, rs_A );
744 }
745 }
746
747 if ( n_left == 1 )
748 {
749 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
750 a2 = buff_A + (g + 1)*cs_A;
751 a3 = buff_A + (g + 2)*cs_A;
752
753 gamma23 = g23->real;
754 sigma23 = g23->imag;
755
756 is_ident23 = ( gamma23 == one && sigma23 == zero );
757
758 if ( !is_ident23 )
759 MAC_Apply_G_mx2_opc( m_A,
760 &gamma23,
761 &sigma23,
762 a2, rs_A,
763 a3, rs_A );
764 }
765 }
766
767 // Pipeline stage
768
769 for ( ; j < nG - 1; j += n_fuse )
770 {
771 nG_app = k_G;
772 n_iter = nG_app;
773 n_left = 0;
774
775 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
776 {
777 g12 = buff_G + (g )*rs_G + (k )*cs_G;
778 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
779 a1 = buff_A + (g )*cs_A;
780 a2 = buff_A + (g + 1)*cs_A;
781 a3 = buff_A + (g + 2)*cs_A;
782
783 gamma12 = g12->real;
784 sigma12 = g12->imag;
785 gamma23 = g23->real;
786 sigma23 = g23->imag;
787
788 is_ident12 = ( gamma12 == one && sigma12 == zero );
789 is_ident23 = ( gamma23 == one && sigma23 == zero );
790
791 if ( !is_ident12 && is_ident23 )
792 {
793 // Apply only to columns 1 and 2.
794
795 MAC_Apply_G_mx2_opc( m_A,
796 &gamma12,
797 &sigma12,
798 a1, rs_A,
799 a2, rs_A );
800 }
801 else if ( is_ident12 && !is_ident23 )
802 {
803 // Apply only to columns 2 and 3.
804
805 MAC_Apply_G_mx2_opc( m_A,
806 &gamma23,
807 &sigma23,
808 a2, rs_A,
809 a3, rs_A );
810 }
811 else if ( !is_ident12 && !is_ident23 )
812 {
813 // Apply to all three columns.
814
815 MAC_Apply_G_mx3_opc( m_A,
816 &gamma12,
817 &sigma12,
818 &gamma23,
819 &sigma23,
820 a1, rs_A,
821 a2, rs_A,
822 a3, rs_A );
823 }
824 }
825 }
826
827 // Shutdown stage
828
829 for ( j = nG % n_fuse; j < k_G; j += n_fuse )
830 {
831 g = nG - 1;
832 k = j;
833
834 n_left = 1;
835 if ( n_left == 1 )
836 {
837 g12 = buff_G + (g )*rs_G + (k )*cs_G;
838 a1 = buff_A + (g )*cs_A;
839 a2 = buff_A + (g + 1)*cs_A;
840
841 gamma12 = g12->real;
842 sigma12 = g12->imag;
843
844 is_ident12 = ( gamma12 == one && sigma12 == zero );
845
846 if ( !is_ident12 )
847 MAC_Apply_G_mx2_opc( m_A,
848 &gamma12,
849 &sigma12,
850 a1, rs_A,
851 a2, rs_A );
852 ++k;
853 --g;
854 }
855
856 nG_app = k_minus_1 - j;
857 n_iter = nG_app;
858
859 for ( i = 0; i < n_iter; ++i, ++k, --g )
860 {
861 g12 = buff_G + (g )*rs_G + (k )*cs_G;
862 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
863 a1 = buff_A + (g )*cs_A;
864 a2 = buff_A + (g + 1)*cs_A;
865 a3 = buff_A + (g + 2)*cs_A;
866
867 gamma12 = g12->real;
868 sigma12 = g12->imag;
869 gamma23 = g23->real;
870 sigma23 = g23->imag;
871
872 is_ident12 = ( gamma12 == one && sigma12 == zero );
873 is_ident23 = ( gamma23 == one && sigma23 == zero );
874
875 if ( !is_ident12 && is_ident23 )
876 {
877 // Apply only to columns 1 and 2.
878
879 MAC_Apply_G_mx2_opc( m_A,
880 &gamma12,
881 &sigma12,
882 a1, rs_A,
883 a2, rs_A );
884 }
885 else if ( is_ident12 && !is_ident23 )
886 {
887 // Apply only to columns 2 and 3.
888
889 MAC_Apply_G_mx2_opc( m_A,
890 &gamma23,
891 &sigma23,
892 a2, rs_A,
893 a3, rs_A );
894 }
895 else if ( !is_ident12 && !is_ident23 )
896 {
897 // Apply to all three columns.
898
899 MAC_Apply_G_mx3_opc( m_A,
900 &gamma12,
901 &sigma12,
902 &gamma23,
903 &sigma23,
904 a1, rs_A,
905 a2, rs_A,
906 a3, rs_A );
907 }
908 }
909 }
910
911 return FLA_SUCCESS;
912 }
913
FLA_Apply_G_rf_opz_var9(int k_G,int m_A,int n_A,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_A,int rs_A,int cs_A)914 FLA_Error FLA_Apply_G_rf_opz_var9( int k_G,
915 int m_A,
916 int n_A,
917 dcomplex* buff_G, int rs_G, int cs_G,
918 dcomplex* buff_A, int rs_A, int cs_A )
919 {
920 double one = bl1_d1();
921 double zero = bl1_d0();
922 double gamma12;
923 double sigma12;
924 double gamma23;
925 double sigma23;
926 dcomplex* a1;
927 dcomplex* a2;
928 dcomplex* a3;
929 dcomplex* g12;
930 dcomplex* g23;
931 int i, j, g, k;
932 int nG, nG_app;
933 int n_iter;
934 int n_left;
935 int k_minus_1;
936 int n_fuse;
937 int is_ident12, is_ident23;
938
939 k_minus_1 = k_G - 1;
940 nG = n_A - 1;
941 n_fuse = 2;
942
943 // Use the simple variant for nG < (k - 1) or k == 1.
944 if ( nG < 2*k_minus_1 || k_G == 1 )
945 {
946 FLA_Apply_G_rf_opz_var1( k_G,
947 m_A,
948 n_A,
949 buff_G, rs_G, cs_G,
950 buff_A, rs_A, cs_A );
951 return FLA_SUCCESS;
952 }
953
954
955 // Start-up phase.
956
957 for ( j = -1; j < k_minus_1; j += n_fuse )
958 {
959 nG_app = j + 1;
960 n_iter = nG_app;
961 n_left = 1;
962
963 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
964 {
965 g12 = buff_G + (g )*rs_G + (k )*cs_G;
966 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
967 a1 = buff_A + (g )*cs_A;
968 a2 = buff_A + (g + 1)*cs_A;
969 a3 = buff_A + (g + 2)*cs_A;
970
971 gamma12 = g12->real;
972 sigma12 = g12->imag;
973 gamma23 = g23->real;
974 sigma23 = g23->imag;
975
976 is_ident12 = ( gamma12 == one && sigma12 == zero );
977 is_ident23 = ( gamma23 == one && sigma23 == zero );
978
979 if ( !is_ident12 && is_ident23 )
980 {
981 // Apply only to columns 1 and 2.
982
983 MAC_Apply_G_mx2_opz( m_A,
984 &gamma12,
985 &sigma12,
986 a1, rs_A,
987 a2, rs_A );
988 }
989 else if ( is_ident12 && !is_ident23 )
990 {
991 // Apply only to columns 2 and 3.
992
993 MAC_Apply_G_mx2_opz( m_A,
994 &gamma23,
995 &sigma23,
996 a2, rs_A,
997 a3, rs_A );
998 }
999 else if ( !is_ident12 && !is_ident23 )
1000 {
1001 // Apply to all three columns.
1002
1003 MAC_Apply_G_mx3_opz( m_A,
1004 &gamma12,
1005 &sigma12,
1006 &gamma23,
1007 &sigma23,
1008 a1, rs_A,
1009 a2, rs_A,
1010 a3, rs_A );
1011 }
1012 }
1013
1014 if ( n_left == 1 )
1015 {
1016 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
1017 a2 = buff_A + (g + 1)*cs_A;
1018 a3 = buff_A + (g + 2)*cs_A;
1019
1020 gamma23 = g23->real;
1021 sigma23 = g23->imag;
1022
1023 is_ident23 = ( gamma23 == one && sigma23 == zero );
1024
1025 if ( !is_ident23 )
1026 MAC_Apply_G_mx2_opz( m_A,
1027 &gamma23,
1028 &sigma23,
1029 a2, rs_A,
1030 a3, rs_A );
1031 }
1032 }
1033
1034 // Pipeline stage
1035
1036 for ( ; j < nG - 1; j += n_fuse )
1037 {
1038 nG_app = k_G;
1039 n_iter = nG_app;
1040 n_left = 0;
1041
1042 for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
1043 {
1044 g12 = buff_G + (g )*rs_G + (k )*cs_G;
1045 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
1046 a1 = buff_A + (g )*cs_A;
1047 a2 = buff_A + (g + 1)*cs_A;
1048 a3 = buff_A + (g + 2)*cs_A;
1049
1050 gamma12 = g12->real;
1051 sigma12 = g12->imag;
1052 gamma23 = g23->real;
1053 sigma23 = g23->imag;
1054
1055 is_ident12 = ( gamma12 == one && sigma12 == zero );
1056 is_ident23 = ( gamma23 == one && sigma23 == zero );
1057
1058 if ( !is_ident12 && is_ident23 )
1059 {
1060 // Apply only to columns 1 and 2.
1061
1062 MAC_Apply_G_mx2_opz( m_A,
1063 &gamma12,
1064 &sigma12,
1065 a1, rs_A,
1066 a2, rs_A );
1067 }
1068 else if ( is_ident12 && !is_ident23 )
1069 {
1070 // Apply only to columns 2 and 3.
1071
1072 MAC_Apply_G_mx2_opz( m_A,
1073 &gamma23,
1074 &sigma23,
1075 a2, rs_A,
1076 a3, rs_A );
1077 }
1078 else if ( !is_ident12 && !is_ident23 )
1079 {
1080 // Apply to all three columns.
1081
1082 MAC_Apply_G_mx3_opz( m_A,
1083 &gamma12,
1084 &sigma12,
1085 &gamma23,
1086 &sigma23,
1087 a1, rs_A,
1088 a2, rs_A,
1089 a3, rs_A );
1090 }
1091 }
1092 }
1093
1094 // Shutdown stage
1095
1096 for ( j = nG % n_fuse; j < k_G; j += n_fuse )
1097 {
1098 g = nG - 1;
1099 k = j;
1100
1101 n_left = 1;
1102 if ( n_left == 1 )
1103 {
1104 g12 = buff_G + (g )*rs_G + (k )*cs_G;
1105 a1 = buff_A + (g )*cs_A;
1106 a2 = buff_A + (g + 1)*cs_A;
1107
1108 gamma12 = g12->real;
1109 sigma12 = g12->imag;
1110
1111 is_ident12 = ( gamma12 == one && sigma12 == zero );
1112
1113 if ( !is_ident12 )
1114 MAC_Apply_G_mx2_opz( m_A,
1115 &gamma12,
1116 &sigma12,
1117 a1, rs_A,
1118 a2, rs_A );
1119 ++k;
1120 --g;
1121 }
1122
1123 nG_app = k_minus_1 - j;
1124 n_iter = nG_app;
1125
1126 for ( i = 0; i < n_iter; ++i, ++k, --g )
1127 {
1128 g12 = buff_G + (g )*rs_G + (k )*cs_G;
1129 g23 = buff_G + (g + 1)*rs_G + (k )*cs_G;
1130 a1 = buff_A + (g )*cs_A;
1131 a2 = buff_A + (g + 1)*cs_A;
1132 a3 = buff_A + (g + 2)*cs_A;
1133
1134 gamma12 = g12->real;
1135 sigma12 = g12->imag;
1136 gamma23 = g23->real;
1137 sigma23 = g23->imag;
1138
1139 is_ident12 = ( gamma12 == one && sigma12 == zero );
1140 is_ident23 = ( gamma23 == one && sigma23 == zero );
1141
1142 if ( !is_ident12 && is_ident23 )
1143 {
1144 // Apply only to columns 1 and 2.
1145
1146 MAC_Apply_G_mx2_opz( m_A,
1147 &gamma12,
1148 &sigma12,
1149 a1, rs_A,
1150 a2, rs_A );
1151 }
1152 else if ( is_ident12 && !is_ident23 )
1153 {
1154 // Apply only to columns 2 and 3.
1155
1156 MAC_Apply_G_mx2_opz( m_A,
1157 &gamma23,
1158 &sigma23,
1159 a2, rs_A,
1160 a3, rs_A );
1161 }
1162 else if ( !is_ident12 && !is_ident23 )
1163 {
1164 // Apply to all three columns.
1165
1166 MAC_Apply_G_mx3_opz( m_A,
1167 &gamma12,
1168 &sigma12,
1169 &gamma23,
1170 &sigma23,
1171 a1, rs_A,
1172 a2, rs_A,
1173 a3, rs_A );
1174 }
1175 }
1176 }
1177
1178 return FLA_SUCCESS;
1179 }
1180
1181