1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
FLA_Apply_G_rf_opt_var9(FLA_Obj G,FLA_Obj A)13 FLA_Error FLA_Apply_G_rf_opt_var9( FLA_Obj G, FLA_Obj A )
14 /*
15   Apply k sets of Givens rotations to a matrix A from the right,
16   where each set takes the form:
17 
18     A := A ( G(n-1,k) ... G(1,k) G(0,k) )'
19        = A G(0,k)' G(1,k)' ... G(n-1,k)'
20 
21   where Gik is the ith Givens rotation formed from the kth set,
22   stored in the (i,k) entries of of G:
23 
24     Gik  =  / gamma_ik  -sigma_ik \
25             \ sigma_ik   gamma_ik /
26 
27   This variant iterates in pipelined, overlapping fashion and
28   applies rotations to two columns at a time.
29 
30   -FGVZ
31 */
32 {
33 	FLA_Datatype datatype;
34 	int          k_G, m_A, n_A;
35 	int          rs_G, cs_G;
36 	int          rs_A, cs_A;
37 
38 	datatype = FLA_Obj_datatype( A );
39 
40 	k_G      = FLA_Obj_width( G );
41 	m_A      = FLA_Obj_length( A );
42 	n_A      = FLA_Obj_width( A );
43 
44 	rs_G     = FLA_Obj_row_stride( G );
45 	cs_G     = FLA_Obj_col_stride( G );
46 
47 	rs_A     = FLA_Obj_row_stride( A );
48 	cs_A     = FLA_Obj_col_stride( A );
49 
50 	switch ( datatype )
51 	{
52 		case FLA_FLOAT:
53 		{
54 			scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G );
55 			float*    buff_A = ( float*    ) FLA_FLOAT_PTR( A );
56 
57 			FLA_Apply_G_rf_ops_var9( k_G,
58 			                         m_A,
59 			                         n_A,
60 			                         buff_G, rs_G, cs_G,
61 			                         buff_A, rs_A, cs_A );
62 
63 			break;
64 		}
65 
66 		case FLA_DOUBLE:
67 		{
68 			dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
69 			double*   buff_A = ( double*   ) FLA_DOUBLE_PTR( A );
70 
71 			FLA_Apply_G_rf_opd_var9( k_G,
72 			                         m_A,
73 			                         n_A,
74 			                         buff_G, rs_G, cs_G,
75 			                         buff_A, rs_A, cs_A );
76 
77 			break;
78 		}
79 
80 		case FLA_COMPLEX:
81 		{
82 			scomplex* buff_G = ( scomplex* ) FLA_COMPLEX_PTR( G );
83 			scomplex* buff_A = ( scomplex* ) FLA_COMPLEX_PTR( A );
84 
85 			FLA_Apply_G_rf_opc_var9( k_G,
86 			                         m_A,
87 			                         n_A,
88 			                         buff_G, rs_G, cs_G,
89 			                         buff_A, rs_A, cs_A );
90 
91 			break;
92 		}
93 
94 		case FLA_DOUBLE_COMPLEX:
95 		{
96 			dcomplex* buff_G = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( G );
97 			dcomplex* buff_A = ( dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( A );
98 
99 			FLA_Apply_G_rf_opz_var9( k_G,
100 			                         m_A,
101 			                         n_A,
102 			                         buff_G, rs_G, cs_G,
103 			                         buff_A, rs_A, cs_A );
104 
105 			break;
106 		}
107 	}
108 
109 	return FLA_SUCCESS;
110 }
111 
112 
FLA_Apply_G_rf_ops_var9(int k_G,int m_A,int n_A,scomplex * buff_G,int rs_G,int cs_G,float * buff_A,int rs_A,int cs_A)113 FLA_Error FLA_Apply_G_rf_ops_var9( int       k_G,
114                                    int       m_A,
115                                    int       n_A,
116                                    scomplex* buff_G, int rs_G, int cs_G,
117                                    float*    buff_A, int rs_A, int cs_A )
118 {
119 	float     one  = bl1_s1();
120 	float     zero = bl1_s0();
121 	float     gamma12;
122 	float     sigma12;
123 	float     gamma23;
124 	float     sigma23;
125 	float*    a1;
126 	float*    a2;
127 	float*    a3;
128 	scomplex* g12;
129 	scomplex* g23;
130 	int       i, j, g, k;
131 	int       nG, nG_app;
132 	int       n_iter;
133 	int       n_left;
134 	int       k_minus_1;
135 	int       n_fuse;
136 	int       is_ident12, is_ident23;
137 
138 	k_minus_1 = k_G - 1;
139 	nG        = n_A - 1;
140 	n_fuse    = 2;
141 
142 	// Use the simple variant for nG < (k - 1) or k == 1.
143 	if ( nG < 2*k_minus_1 || k_G == 1 )
144 	{
145 		FLA_Apply_G_rf_ops_var1( k_G,
146 		                         m_A,
147 		                         n_A,
148 		                         buff_G, rs_G, cs_G,
149 		                         buff_A, rs_A, cs_A );
150 		return FLA_SUCCESS;
151 	}
152 
153 
154 	// Start-up phase.
155 
156 	for ( j = -1; j < k_minus_1; j += n_fuse )
157 	{
158 		nG_app = j + 1;
159 		n_iter = nG_app;
160 		n_left = 1;
161 
162 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
163 		{
164 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
165 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
166 			a1    = buff_A + (g    )*cs_A;
167 			a2    = buff_A + (g + 1)*cs_A;
168 			a3    = buff_A + (g + 2)*cs_A;
169 
170 			gamma12 = g12->real;
171 			sigma12 = g12->imag;
172 			gamma23 = g23->real;
173 			sigma23 = g23->imag;
174 
175 			is_ident12 = ( gamma12 == one && sigma12 == zero );
176 			is_ident23 = ( gamma23 == one && sigma23 == zero );
177 
178 			if      ( !is_ident12 && is_ident23 )
179 			{
180 				// Apply only to columns 1 and 2.
181 
182 				MAC_Apply_G_mx2_ops( m_A,
183 				                     &gamma12,
184 				                     &sigma12,
185 				                     a1, rs_A,
186 				                     a2, rs_A );
187 			}
188 			else if ( is_ident12 && !is_ident23 )
189 			{
190 				// Apply only to columns 2 and 3.
191 
192 				MAC_Apply_G_mx2_ops( m_A,
193 				                     &gamma23,
194 				                     &sigma23,
195 				                     a2, rs_A,
196 				                     a3, rs_A );
197 			}
198 			else if ( !is_ident12 && !is_ident23 )
199 			{
200 				// Apply to all three columns.
201 
202 				MAC_Apply_G_mx3_ops( m_A,
203 				                     &gamma12,
204 				                     &sigma12,
205 				                     &gamma23,
206 				                     &sigma23,
207 				                     a1, rs_A,
208 				                     a2, rs_A,
209 				                     a3, rs_A );
210 			}
211 		}
212 
213 		if ( n_left == 1 )
214 		{
215 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
216 			a2    = buff_A + (g + 1)*cs_A;
217 			a3    = buff_A + (g + 2)*cs_A;
218 
219 			gamma23 = g23->real;
220 			sigma23 = g23->imag;
221 
222 			is_ident23 = ( gamma23 == one && sigma23 == zero );
223 
224 			if ( !is_ident23 )
225 				MAC_Apply_G_mx2_ops( m_A,
226 				                     &gamma23,
227 				                     &sigma23,
228 				                     a2, rs_A,
229 				                     a3, rs_A );
230 		}
231 	}
232 
233 	// Pipeline stage
234 
235 	for ( ; j < nG - 1; j += n_fuse )
236 	{
237 		nG_app = k_G;
238 		n_iter = nG_app;
239 		n_left = 0;
240 
241 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
242 		{
243 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
244 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
245 			a1    = buff_A + (g    )*cs_A;
246 			a2    = buff_A + (g + 1)*cs_A;
247 			a3    = buff_A + (g + 2)*cs_A;
248 
249 			gamma12 = g12->real;
250 			sigma12 = g12->imag;
251 			gamma23 = g23->real;
252 			sigma23 = g23->imag;
253 
254 			is_ident12 = ( gamma12 == one && sigma12 == zero );
255 			is_ident23 = ( gamma23 == one && sigma23 == zero );
256 
257 			if      ( !is_ident12 && is_ident23 )
258 			{
259 				// Apply only to columns 1 and 2.
260 
261 				MAC_Apply_G_mx2_ops( m_A,
262 				                     &gamma12,
263 				                     &sigma12,
264 				                     a1, rs_A,
265 				                     a2, rs_A );
266 			}
267 			else if ( is_ident12 && !is_ident23 )
268 			{
269 				// Apply only to columns 2 and 3.
270 
271 				MAC_Apply_G_mx2_ops( m_A,
272 				                     &gamma23,
273 				                     &sigma23,
274 				                     a2, rs_A,
275 				                     a3, rs_A );
276 			}
277 			else if ( !is_ident12 && !is_ident23 )
278 			{
279 				// Apply to all three columns.
280 
281 				MAC_Apply_G_mx3_ops( m_A,
282 				                     &gamma12,
283 				                     &sigma12,
284 				                     &gamma23,
285 				                     &sigma23,
286 				                     a1, rs_A,
287 				                     a2, rs_A,
288 				                     a3, rs_A );
289 			}
290 		}
291 	}
292 
293 	// Shutdown stage
294 
295 	for ( j = nG % n_fuse; j < k_G; j += n_fuse )
296 	{
297 		g = nG - 1;
298 		k = j;
299 
300 		n_left = 1;
301 		if ( n_left == 1 )
302 		{
303 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
304 			a1    = buff_A + (g    )*cs_A;
305 			a2    = buff_A + (g + 1)*cs_A;
306 
307 			gamma12 = g12->real;
308 			sigma12 = g12->imag;
309 
310 			is_ident12 = ( gamma12 == one && sigma12 == zero );
311 
312 			if ( !is_ident12 )
313 				MAC_Apply_G_mx2_ops( m_A,
314 				                     &gamma12,
315 				                     &sigma12,
316 				                     a1, rs_A,
317 				                     a2, rs_A );
318 			++k;
319 			--g;
320 		}
321 
322 		nG_app = k_minus_1 - j;
323 		n_iter = nG_app;
324 
325 		for ( i = 0; i < n_iter; ++i, ++k, --g )
326 		{
327 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
328 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
329 			a1    = buff_A + (g    )*cs_A;
330 			a2    = buff_A + (g + 1)*cs_A;
331 			a3    = buff_A + (g + 2)*cs_A;
332 
333 			gamma12 = g12->real;
334 			sigma12 = g12->imag;
335 			gamma23 = g23->real;
336 			sigma23 = g23->imag;
337 
338 			is_ident12 = ( gamma12 == one && sigma12 == zero );
339 			is_ident23 = ( gamma23 == one && sigma23 == zero );
340 
341 			if      ( !is_ident12 && is_ident23 )
342 			{
343 				// Apply only to columns 1 and 2.
344 
345 				MAC_Apply_G_mx2_ops( m_A,
346 				                     &gamma12,
347 				                     &sigma12,
348 				                     a1, rs_A,
349 				                     a2, rs_A );
350 			}
351 			else if ( is_ident12 && !is_ident23 )
352 			{
353 				// Apply only to columns 2 and 3.
354 
355 				MAC_Apply_G_mx2_ops( m_A,
356 				                     &gamma23,
357 				                     &sigma23,
358 				                     a2, rs_A,
359 				                     a3, rs_A );
360 			}
361 			else if ( !is_ident12 && !is_ident23 )
362 			{
363 				// Apply to all three columns.
364 
365 				MAC_Apply_G_mx3_ops( m_A,
366 				                     &gamma12,
367 				                     &sigma12,
368 				                     &gamma23,
369 				                     &sigma23,
370 				                     a1, rs_A,
371 				                     a2, rs_A,
372 				                     a3, rs_A );
373 			}
374 		}
375 	}
376 
377 	return FLA_SUCCESS;
378 }
379 
FLA_Apply_G_rf_opd_var9(int k_G,int m_A,int n_A,dcomplex * buff_G,int rs_G,int cs_G,double * buff_A,int rs_A,int cs_A)380 FLA_Error FLA_Apply_G_rf_opd_var9( int       k_G,
381                                    int       m_A,
382                                    int       n_A,
383                                    dcomplex* buff_G, int rs_G, int cs_G,
384                                    double*   buff_A, int rs_A, int cs_A )
385 {
386 	double    one  = bl1_d1();
387 	double    zero = bl1_d0();
388 	double    gamma12;
389 	double    sigma12;
390 	double    gamma23;
391 	double    sigma23;
392 	double*   a1;
393 	double*   a2;
394 	double*   a3;
395 	dcomplex* g12;
396 	dcomplex* g23;
397 	int       i, j, g, k;
398 	int       nG, nG_app;
399 	int       n_iter;
400 	int       n_left;
401 	int       k_minus_1;
402 	int       n_fuse;
403 	int       is_ident12, is_ident23;
404 
405 	k_minus_1 = k_G - 1;
406 	nG        = n_A - 1;
407 	n_fuse    = 2;
408 
409 	// Use the simple variant for nG < (k - 1) or k == 1.
410 	if ( nG < 2*k_minus_1 || k_G == 1 )
411 	{
412 		FLA_Apply_G_rf_opd_var1( k_G,
413 		                         m_A,
414 		                         n_A,
415 		                         buff_G, rs_G, cs_G,
416 		                         buff_A, rs_A, cs_A );
417 		return FLA_SUCCESS;
418 	}
419 
420 
421 	// Start-up phase.
422 
423 	for ( j = -1; j < k_minus_1; j += n_fuse )
424 	{
425 		nG_app = j + 1;
426 		n_iter = nG_app;
427 		n_left = 1;
428 
429 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
430 		{
431 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
432 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
433 			a1    = buff_A + (g    )*cs_A;
434 			a2    = buff_A + (g + 1)*cs_A;
435 			a3    = buff_A + (g + 2)*cs_A;
436 
437 			gamma12 = g12->real;
438 			sigma12 = g12->imag;
439 			gamma23 = g23->real;
440 			sigma23 = g23->imag;
441 
442 			is_ident12 = ( gamma12 == one && sigma12 == zero );
443 			is_ident23 = ( gamma23 == one && sigma23 == zero );
444 
445 			if      ( !is_ident12 && is_ident23 )
446 			{
447 				// Apply only to columns 1 and 2.
448 
449 				MAC_Apply_G_mx2_opd( m_A,
450 				                     &gamma12,
451 				                     &sigma12,
452 				                     a1, rs_A,
453 				                     a2, rs_A );
454 			}
455 			else if ( is_ident12 && !is_ident23 )
456 			{
457 				// Apply only to columns 2 and 3.
458 
459 				MAC_Apply_G_mx2_opd( m_A,
460 				                     &gamma23,
461 				                     &sigma23,
462 				                     a2, rs_A,
463 				                     a3, rs_A );
464 			}
465 			else if ( !is_ident12 && !is_ident23 )
466 			{
467 				// Apply to all three columns.
468 
469 				MAC_Apply_G_mx3_opd( m_A,
470 				                     &gamma12,
471 				                     &sigma12,
472 				                     &gamma23,
473 				                     &sigma23,
474 				                     a1, rs_A,
475 				                     a2, rs_A,
476 				                     a3, rs_A );
477 			}
478 		}
479 
480 		if ( n_left == 1 )
481 		{
482 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
483 			a2    = buff_A + (g + 1)*cs_A;
484 			a3    = buff_A + (g + 2)*cs_A;
485 
486 			gamma23 = g23->real;
487 			sigma23 = g23->imag;
488 
489 			is_ident23 = ( gamma23 == one && sigma23 == zero );
490 
491 			if ( !is_ident23 )
492 				MAC_Apply_G_mx2_opd( m_A,
493 				                     &gamma23,
494 				                     &sigma23,
495 				                     a2, rs_A,
496 				                     a3, rs_A );
497 		}
498 	}
499 
500 	// Pipeline stage
501 
502 	for ( ; j < nG - 1; j += n_fuse )
503 	{
504 		nG_app = k_G;
505 		n_iter = nG_app;
506 		n_left = 0;
507 
508 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
509 		{
510 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
511 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
512 			a1    = buff_A + (g    )*cs_A;
513 			a2    = buff_A + (g + 1)*cs_A;
514 			a3    = buff_A + (g + 2)*cs_A;
515 
516 			gamma12 = g12->real;
517 			sigma12 = g12->imag;
518 			gamma23 = g23->real;
519 			sigma23 = g23->imag;
520 
521 			is_ident12 = ( gamma12 == one && sigma12 == zero );
522 			is_ident23 = ( gamma23 == one && sigma23 == zero );
523 
524 			if      ( !is_ident12 && is_ident23 )
525 			{
526 				// Apply only to columns 1 and 2.
527 
528 				MAC_Apply_G_mx2_opd( m_A,
529 				                     &gamma12,
530 				                     &sigma12,
531 				                     a1, rs_A,
532 				                     a2, rs_A );
533 			}
534 			else if ( is_ident12 && !is_ident23 )
535 			{
536 				// Apply only to columns 2 and 3.
537 
538 				MAC_Apply_G_mx2_opd( m_A,
539 				                     &gamma23,
540 				                     &sigma23,
541 				                     a2, rs_A,
542 				                     a3, rs_A );
543 			}
544 			else if ( !is_ident12 && !is_ident23 )
545 			{
546 				// Apply to all three columns.
547 
548 				MAC_Apply_G_mx3_opd( m_A,
549 				                     &gamma12,
550 				                     &sigma12,
551 				                     &gamma23,
552 				                     &sigma23,
553 				                     a1, rs_A,
554 				                     a2, rs_A,
555 				                     a3, rs_A );
556 			}
557 		}
558 	}
559 
560 	// Shutdown stage
561 
562 	for ( j = nG % n_fuse; j < k_G; j += n_fuse )
563 	{
564 		g = nG - 1;
565 		k = j;
566 
567 		n_left = 1;
568 		if ( n_left == 1 )
569 		{
570 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
571 			a1    = buff_A + (g    )*cs_A;
572 			a2    = buff_A + (g + 1)*cs_A;
573 
574 			gamma12 = g12->real;
575 			sigma12 = g12->imag;
576 
577 			is_ident12 = ( gamma12 == one && sigma12 == zero );
578 
579 			if ( !is_ident12 )
580 				MAC_Apply_G_mx2_opd( m_A,
581 				                     &gamma12,
582 				                     &sigma12,
583 				                     a1, rs_A,
584 				                     a2, rs_A );
585 			++k;
586 			--g;
587 		}
588 
589 		nG_app = k_minus_1 - j;
590 		n_iter = nG_app;
591 
592 		for ( i = 0; i < n_iter; ++i, ++k, --g )
593 		{
594 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
595 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
596 			a1    = buff_A + (g    )*cs_A;
597 			a2    = buff_A + (g + 1)*cs_A;
598 			a3    = buff_A + (g + 2)*cs_A;
599 
600 			gamma12 = g12->real;
601 			sigma12 = g12->imag;
602 			gamma23 = g23->real;
603 			sigma23 = g23->imag;
604 
605 			is_ident12 = ( gamma12 == one && sigma12 == zero );
606 			is_ident23 = ( gamma23 == one && sigma23 == zero );
607 
608 			if      ( !is_ident12 && is_ident23 )
609 			{
610 				// Apply only to columns 1 and 2.
611 
612 				MAC_Apply_G_mx2_opd( m_A,
613 				                     &gamma12,
614 				                     &sigma12,
615 				                     a1, rs_A,
616 				                     a2, rs_A );
617 			}
618 			else if ( is_ident12 && !is_ident23 )
619 			{
620 				// Apply only to columns 2 and 3.
621 
622 				MAC_Apply_G_mx2_opd( m_A,
623 				                     &gamma23,
624 				                     &sigma23,
625 				                     a2, rs_A,
626 				                     a3, rs_A );
627 			}
628 			else if ( !is_ident12 && !is_ident23 )
629 			{
630 				// Apply to all three columns.
631 
632 				MAC_Apply_G_mx3_opd( m_A,
633 				                     &gamma12,
634 				                     &sigma12,
635 				                     &gamma23,
636 				                     &sigma23,
637 				                     a1, rs_A,
638 				                     a2, rs_A,
639 				                     a3, rs_A );
640 			}
641 		}
642 	}
643 
644 	return FLA_SUCCESS;
645 }
646 
FLA_Apply_G_rf_opc_var9(int k_G,int m_A,int n_A,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_A,int rs_A,int cs_A)647 FLA_Error FLA_Apply_G_rf_opc_var9( int       k_G,
648                                    int       m_A,
649                                    int       n_A,
650                                    scomplex* buff_G, int rs_G, int cs_G,
651                                    scomplex* buff_A, int rs_A, int cs_A )
652 {
653 	float     one  = bl1_s1();
654 	float     zero = bl1_s0();
655 	float     gamma12;
656 	float     sigma12;
657 	float     gamma23;
658 	float     sigma23;
659 	scomplex* a1;
660 	scomplex* a2;
661 	scomplex* a3;
662 	scomplex* g12;
663 	scomplex* g23;
664 	int       i, j, g, k;
665 	int       nG, nG_app;
666 	int       n_iter;
667 	int       n_left;
668 	int       k_minus_1;
669 	int       n_fuse;
670 	int       is_ident12, is_ident23;
671 
672 	k_minus_1 = k_G - 1;
673 	nG        = n_A - 1;
674 	n_fuse    = 2;
675 
676 	// Use the simple variant for nG < (k - 1) or k == 1.
677 	if ( nG < 2*k_minus_1 || k_G == 1 )
678 	{
679 		FLA_Apply_G_rf_opc_var1( k_G,
680 		                         m_A,
681 		                         n_A,
682 		                         buff_G, rs_G, cs_G,
683 		                         buff_A, rs_A, cs_A );
684 		return FLA_SUCCESS;
685 	}
686 
687 
688 	// Start-up phase.
689 
690 	for ( j = -1; j < k_minus_1; j += n_fuse )
691 	{
692 		nG_app = j + 1;
693 		n_iter = nG_app;
694 		n_left = 1;
695 
696 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
697 		{
698 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
699 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
700 			a1    = buff_A + (g    )*cs_A;
701 			a2    = buff_A + (g + 1)*cs_A;
702 			a3    = buff_A + (g + 2)*cs_A;
703 
704 			gamma12 = g12->real;
705 			sigma12 = g12->imag;
706 			gamma23 = g23->real;
707 			sigma23 = g23->imag;
708 
709 			is_ident12 = ( gamma12 == one && sigma12 == zero );
710 			is_ident23 = ( gamma23 == one && sigma23 == zero );
711 
712 			if      ( !is_ident12 && is_ident23 )
713 			{
714 				// Apply only to columns 1 and 2.
715 
716 				MAC_Apply_G_mx2_opc( m_A,
717 				                     &gamma12,
718 				                     &sigma12,
719 				                     a1, rs_A,
720 				                     a2, rs_A );
721 			}
722 			else if ( is_ident12 && !is_ident23 )
723 			{
724 				// Apply only to columns 2 and 3.
725 
726 				MAC_Apply_G_mx2_opc( m_A,
727 				                     &gamma23,
728 				                     &sigma23,
729 				                     a2, rs_A,
730 				                     a3, rs_A );
731 			}
732 			else if ( !is_ident12 && !is_ident23 )
733 			{
734 				// Apply to all three columns.
735 
736 				MAC_Apply_G_mx3_opc( m_A,
737 				                     &gamma12,
738 				                     &sigma12,
739 				                     &gamma23,
740 				                     &sigma23,
741 				                     a1, rs_A,
742 				                     a2, rs_A,
743 				                     a3, rs_A );
744 			}
745 		}
746 
747 		if ( n_left == 1 )
748 		{
749 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
750 			a2    = buff_A + (g + 1)*cs_A;
751 			a3    = buff_A + (g + 2)*cs_A;
752 
753 			gamma23 = g23->real;
754 			sigma23 = g23->imag;
755 
756 			is_ident23 = ( gamma23 == one && sigma23 == zero );
757 
758 			if ( !is_ident23 )
759 				MAC_Apply_G_mx2_opc( m_A,
760 				                     &gamma23,
761 				                     &sigma23,
762 				                     a2, rs_A,
763 				                     a3, rs_A );
764 		}
765 	}
766 
767 	// Pipeline stage
768 
769 	for ( ; j < nG - 1; j += n_fuse )
770 	{
771 		nG_app = k_G;
772 		n_iter = nG_app;
773 		n_left = 0;
774 
775 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
776 		{
777 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
778 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
779 			a1    = buff_A + (g    )*cs_A;
780 			a2    = buff_A + (g + 1)*cs_A;
781 			a3    = buff_A + (g + 2)*cs_A;
782 
783 			gamma12 = g12->real;
784 			sigma12 = g12->imag;
785 			gamma23 = g23->real;
786 			sigma23 = g23->imag;
787 
788 			is_ident12 = ( gamma12 == one && sigma12 == zero );
789 			is_ident23 = ( gamma23 == one && sigma23 == zero );
790 
791 			if      ( !is_ident12 && is_ident23 )
792 			{
793 				// Apply only to columns 1 and 2.
794 
795 				MAC_Apply_G_mx2_opc( m_A,
796 				                     &gamma12,
797 				                     &sigma12,
798 				                     a1, rs_A,
799 				                     a2, rs_A );
800 			}
801 			else if ( is_ident12 && !is_ident23 )
802 			{
803 				// Apply only to columns 2 and 3.
804 
805 				MAC_Apply_G_mx2_opc( m_A,
806 				                     &gamma23,
807 				                     &sigma23,
808 				                     a2, rs_A,
809 				                     a3, rs_A );
810 			}
811 			else if ( !is_ident12 && !is_ident23 )
812 			{
813 				// Apply to all three columns.
814 
815 				MAC_Apply_G_mx3_opc( m_A,
816 				                     &gamma12,
817 				                     &sigma12,
818 				                     &gamma23,
819 				                     &sigma23,
820 				                     a1, rs_A,
821 				                     a2, rs_A,
822 				                     a3, rs_A );
823 			}
824 		}
825 	}
826 
827 	// Shutdown stage
828 
829 	for ( j = nG % n_fuse; j < k_G; j += n_fuse )
830 	{
831 		g = nG - 1;
832 		k = j;
833 
834 		n_left = 1;
835 		if ( n_left == 1 )
836 		{
837 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
838 			a1    = buff_A + (g    )*cs_A;
839 			a2    = buff_A + (g + 1)*cs_A;
840 
841 			gamma12 = g12->real;
842 			sigma12 = g12->imag;
843 
844 			is_ident12 = ( gamma12 == one && sigma12 == zero );
845 
846 			if ( !is_ident12 )
847 				MAC_Apply_G_mx2_opc( m_A,
848 				                     &gamma12,
849 				                     &sigma12,
850 				                     a1, rs_A,
851 				                     a2, rs_A );
852 			++k;
853 			--g;
854 		}
855 
856 		nG_app = k_minus_1 - j;
857 		n_iter = nG_app;
858 
859 		for ( i = 0; i < n_iter; ++i, ++k, --g )
860 		{
861 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
862 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
863 			a1    = buff_A + (g    )*cs_A;
864 			a2    = buff_A + (g + 1)*cs_A;
865 			a3    = buff_A + (g + 2)*cs_A;
866 
867 			gamma12 = g12->real;
868 			sigma12 = g12->imag;
869 			gamma23 = g23->real;
870 			sigma23 = g23->imag;
871 
872 			is_ident12 = ( gamma12 == one && sigma12 == zero );
873 			is_ident23 = ( gamma23 == one && sigma23 == zero );
874 
875 			if      ( !is_ident12 && is_ident23 )
876 			{
877 				// Apply only to columns 1 and 2.
878 
879 				MAC_Apply_G_mx2_opc( m_A,
880 				                     &gamma12,
881 				                     &sigma12,
882 				                     a1, rs_A,
883 				                     a2, rs_A );
884 			}
885 			else if ( is_ident12 && !is_ident23 )
886 			{
887 				// Apply only to columns 2 and 3.
888 
889 				MAC_Apply_G_mx2_opc( m_A,
890 				                     &gamma23,
891 				                     &sigma23,
892 				                     a2, rs_A,
893 				                     a3, rs_A );
894 			}
895 			else if ( !is_ident12 && !is_ident23 )
896 			{
897 				// Apply to all three columns.
898 
899 				MAC_Apply_G_mx3_opc( m_A,
900 				                     &gamma12,
901 				                     &sigma12,
902 				                     &gamma23,
903 				                     &sigma23,
904 				                     a1, rs_A,
905 				                     a2, rs_A,
906 				                     a3, rs_A );
907 			}
908 		}
909 	}
910 
911 	return FLA_SUCCESS;
912 }
913 
FLA_Apply_G_rf_opz_var9(int k_G,int m_A,int n_A,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_A,int rs_A,int cs_A)914 FLA_Error FLA_Apply_G_rf_opz_var9( int       k_G,
915                                    int       m_A,
916                                    int       n_A,
917                                    dcomplex* buff_G, int rs_G, int cs_G,
918                                    dcomplex* buff_A, int rs_A, int cs_A )
919 {
920 	double    one  = bl1_d1();
921 	double    zero = bl1_d0();
922 	double    gamma12;
923 	double    sigma12;
924 	double    gamma23;
925 	double    sigma23;
926 	dcomplex* a1;
927 	dcomplex* a2;
928 	dcomplex* a3;
929 	dcomplex* g12;
930 	dcomplex* g23;
931 	int       i, j, g, k;
932 	int       nG, nG_app;
933 	int       n_iter;
934 	int       n_left;
935 	int       k_minus_1;
936 	int       n_fuse;
937 	int       is_ident12, is_ident23;
938 
939 	k_minus_1 = k_G - 1;
940 	nG        = n_A - 1;
941 	n_fuse    = 2;
942 
943 	// Use the simple variant for nG < (k - 1) or k == 1.
944 	if ( nG < 2*k_minus_1 || k_G == 1 )
945 	{
946 		FLA_Apply_G_rf_opz_var1( k_G,
947 		                         m_A,
948 		                         n_A,
949 		                         buff_G, rs_G, cs_G,
950 		                         buff_A, rs_A, cs_A );
951 		return FLA_SUCCESS;
952 	}
953 
954 
955 	// Start-up phase.
956 
957 	for ( j = -1; j < k_minus_1; j += n_fuse )
958 	{
959 		nG_app = j + 1;
960 		n_iter = nG_app;
961 		n_left = 1;
962 
963 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
964 		{
965 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
966 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
967 			a1    = buff_A + (g    )*cs_A;
968 			a2    = buff_A + (g + 1)*cs_A;
969 			a3    = buff_A + (g + 2)*cs_A;
970 
971 			gamma12 = g12->real;
972 			sigma12 = g12->imag;
973 			gamma23 = g23->real;
974 			sigma23 = g23->imag;
975 
976 			is_ident12 = ( gamma12 == one && sigma12 == zero );
977 			is_ident23 = ( gamma23 == one && sigma23 == zero );
978 
979 			if      ( !is_ident12 && is_ident23 )
980 			{
981 				// Apply only to columns 1 and 2.
982 
983 				MAC_Apply_G_mx2_opz( m_A,
984 				                     &gamma12,
985 				                     &sigma12,
986 				                     a1, rs_A,
987 				                     a2, rs_A );
988 			}
989 			else if ( is_ident12 && !is_ident23 )
990 			{
991 				// Apply only to columns 2 and 3.
992 
993 				MAC_Apply_G_mx2_opz( m_A,
994 				                     &gamma23,
995 				                     &sigma23,
996 				                     a2, rs_A,
997 				                     a3, rs_A );
998 			}
999 			else if ( !is_ident12 && !is_ident23 )
1000 			{
1001 				// Apply to all three columns.
1002 
1003 				MAC_Apply_G_mx3_opz( m_A,
1004 				                     &gamma12,
1005 				                     &sigma12,
1006 				                     &gamma23,
1007 				                     &sigma23,
1008 				                     a1, rs_A,
1009 				                     a2, rs_A,
1010 				                     a3, rs_A );
1011 			}
1012 		}
1013 
1014 		if ( n_left == 1 )
1015 		{
1016 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
1017 			a2    = buff_A + (g + 1)*cs_A;
1018 			a3    = buff_A + (g + 2)*cs_A;
1019 
1020 			gamma23 = g23->real;
1021 			sigma23 = g23->imag;
1022 
1023 			is_ident23 = ( gamma23 == one && sigma23 == zero );
1024 
1025 			if ( !is_ident23 )
1026 				MAC_Apply_G_mx2_opz( m_A,
1027 				                     &gamma23,
1028 				                     &sigma23,
1029 				                     a2, rs_A,
1030 				                     a3, rs_A );
1031 		}
1032 	}
1033 
1034 	// Pipeline stage
1035 
1036 	for ( ; j < nG - 1; j += n_fuse )
1037 	{
1038 		nG_app = k_G;
1039 		n_iter = nG_app;
1040 		n_left = 0;
1041 
1042 		for ( i = 0, k = 0, g = j; i < n_iter; ++i, ++k, --g )
1043 		{
1044 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
1045 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
1046 			a1    = buff_A + (g    )*cs_A;
1047 			a2    = buff_A + (g + 1)*cs_A;
1048 			a3    = buff_A + (g + 2)*cs_A;
1049 
1050 			gamma12 = g12->real;
1051 			sigma12 = g12->imag;
1052 			gamma23 = g23->real;
1053 			sigma23 = g23->imag;
1054 
1055 			is_ident12 = ( gamma12 == one && sigma12 == zero );
1056 			is_ident23 = ( gamma23 == one && sigma23 == zero );
1057 
1058 			if      ( !is_ident12 && is_ident23 )
1059 			{
1060 				// Apply only to columns 1 and 2.
1061 
1062 				MAC_Apply_G_mx2_opz( m_A,
1063 				                     &gamma12,
1064 				                     &sigma12,
1065 				                     a1, rs_A,
1066 				                     a2, rs_A );
1067 			}
1068 			else if ( is_ident12 && !is_ident23 )
1069 			{
1070 				// Apply only to columns 2 and 3.
1071 
1072 				MAC_Apply_G_mx2_opz( m_A,
1073 				                     &gamma23,
1074 				                     &sigma23,
1075 				                     a2, rs_A,
1076 				                     a3, rs_A );
1077 			}
1078 			else if ( !is_ident12 && !is_ident23 )
1079 			{
1080 				// Apply to all three columns.
1081 
1082 				MAC_Apply_G_mx3_opz( m_A,
1083 				                     &gamma12,
1084 				                     &sigma12,
1085 				                     &gamma23,
1086 				                     &sigma23,
1087 				                     a1, rs_A,
1088 				                     a2, rs_A,
1089 				                     a3, rs_A );
1090 			}
1091 		}
1092 	}
1093 
1094 	// Shutdown stage
1095 
1096 	for ( j = nG % n_fuse; j < k_G; j += n_fuse )
1097 	{
1098 		g = nG - 1;
1099 		k = j;
1100 
1101 		n_left = 1;
1102 		if ( n_left == 1 )
1103 		{
1104 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
1105 			a1    = buff_A + (g    )*cs_A;
1106 			a2    = buff_A + (g + 1)*cs_A;
1107 
1108 			gamma12 = g12->real;
1109 			sigma12 = g12->imag;
1110 
1111 			is_ident12 = ( gamma12 == one && sigma12 == zero );
1112 
1113 			if ( !is_ident12 )
1114 				MAC_Apply_G_mx2_opz( m_A,
1115 				                     &gamma12,
1116 				                     &sigma12,
1117 				                     a1, rs_A,
1118 				                     a2, rs_A );
1119 			++k;
1120 			--g;
1121 		}
1122 
1123 		nG_app = k_minus_1 - j;
1124 		n_iter = nG_app;
1125 
1126 		for ( i = 0; i < n_iter; ++i, ++k, --g )
1127 		{
1128 			g12   = buff_G + (g    )*rs_G + (k    )*cs_G;
1129 			g23   = buff_G + (g + 1)*rs_G + (k    )*cs_G;
1130 			a1    = buff_A + (g    )*cs_A;
1131 			a2    = buff_A + (g + 1)*cs_A;
1132 			a3    = buff_A + (g + 2)*cs_A;
1133 
1134 			gamma12 = g12->real;
1135 			sigma12 = g12->imag;
1136 			gamma23 = g23->real;
1137 			sigma23 = g23->imag;
1138 
1139 			is_ident12 = ( gamma12 == one && sigma12 == zero );
1140 			is_ident23 = ( gamma23 == one && sigma23 == zero );
1141 
1142 			if      ( !is_ident12 && is_ident23 )
1143 			{
1144 				// Apply only to columns 1 and 2.
1145 
1146 				MAC_Apply_G_mx2_opz( m_A,
1147 				                     &gamma12,
1148 				                     &sigma12,
1149 				                     a1, rs_A,
1150 				                     a2, rs_A );
1151 			}
1152 			else if ( is_ident12 && !is_ident23 )
1153 			{
1154 				// Apply only to columns 2 and 3.
1155 
1156 				MAC_Apply_G_mx2_opz( m_A,
1157 				                     &gamma23,
1158 				                     &sigma23,
1159 				                     a2, rs_A,
1160 				                     a3, rs_A );
1161 			}
1162 			else if ( !is_ident12 && !is_ident23 )
1163 			{
1164 				// Apply to all three columns.
1165 
1166 				MAC_Apply_G_mx3_opz( m_A,
1167 				                     &gamma12,
1168 				                     &sigma12,
1169 				                     &gamma23,
1170 				                     &sigma23,
1171 				                     a1, rs_A,
1172 				                     a2, rs_A,
1173 				                     a3, rs_A );
1174 			}
1175 		}
1176 	}
1177 
1178 	return FLA_SUCCESS;
1179 }
1180 
1181