1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 #ifdef FLA_ENABLE_NON_CRITICAL_CODE
14 
FLA_Sylv_hn_blk_var1(FLA_Obj isgn,FLA_Obj A,FLA_Obj B,FLA_Obj C,FLA_Obj scale,fla_sylv_t * cntl)15 FLA_Error FLA_Sylv_hn_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl )
16 {
17   FLA_Obj ATL,   ATR,      A00, A01, A02,
18           ABL,   ABR,      A10, A11, A12,
19                            A20, A21, A22;
20 
21   FLA_Obj BTL,   BTR,      B00, B01, B02,
22           BBL,   BBR,      B10, B11, B12,
23                            B20, B21, B22;
24 
25   FLA_Obj CTL,   CTR,      C00, C01, C02,
26           CBL,   CBR,      C10, C11, C12,
27                            C20, C21, C22;
28 
29   dim_t b;
30 
31   FLA_Part_2x2( A,    &ATL, &ATR,
32                       &ABL, &ABR,     0, 0, FLA_TL );
33 
34   FLA_Part_2x2( B,    &BTL, &BTR,
35                       &BBL, &BBR,     0, 0, FLA_TL );
36 
37   FLA_Part_2x2( C,    &CTL, &CTR,
38                       &CBL, &CBR,     0, 0, FLA_TL );
39 
40   while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
41 
42     b = FLA_Determine_blocksize( CBR, FLA_BR, FLA_Cntl_blocksize( cntl ) );
43 
44     FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
45                         /* ************* */   /* ******************** */
46                                                 &A10, /**/ &A11, &A12,
47                            ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
48                            b, b, FLA_BR );
49 
50     FLA_Repart_2x2_to_3x3( BTL, /**/ BTR,       &B00, /**/ &B01, &B02,
51                         /* ************* */   /* ******************** */
52                                                 &B10, /**/ &B11, &B12,
53                            BBL, /**/ BBR,       &B20, /**/ &B21, &B22,
54                            b, b, FLA_BR );
55 
56     FLA_Repart_2x2_to_3x3( CTL, /**/ CTR,       &C00, /**/ &C01, &C02,
57                         /* ************* */   /* ******************** */
58                                                 &C10, /**/ &C11, &C12,
59                            CBL, /**/ CBR,       &C20, /**/ &C21, &C22,
60                            b, b, FLA_BR );
61 
62     // Loop Invariant:
63     // CTL = sylv( ATL', BTL, CTL )
64     // CTR = CTR
65     // CBL = CBL
66     // CBR = CBR
67 
68     /*------------------------------------------------------------*/
69 
70     // C10 = sylv( A11', B00, C10 - A01' * C00 );
71     FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
72                        FLA_MINUS_ONE, A01, C00, FLA_ONE, C10,
73                        FLA_Cntl_sub_gemm1( cntl ) );
74 
75     FLA_Sylv_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
76                        isgn, A11, B00, C10, scale,
77                        FLA_Cntl_sub_sylv1( cntl ) );
78 
79     // C01 = sylv( A00', B11, C01 -/+ C00 * B01 );
80     FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
81                        FLA_NEGATE( isgn ), C00, B01, FLA_ONE, C01,
82                        FLA_Cntl_sub_gemm2( cntl ) );
83 
84     FLA_Sylv_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
85                        isgn, A00, B11, C01, scale,
86                        FLA_Cntl_sub_sylv2( cntl ) );
87 
88     // C11 = sylv( A11', B11, C11 - A01' * C01 -/+ C10 * B01 );
89     FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
90                        FLA_NEGATE( isgn ), C10, B01, FLA_ONE, C11,
91                        FLA_Cntl_sub_gemm3( cntl ) );
92 
93     FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
94                        FLA_MINUS_ONE, A01, C01, FLA_ONE, C11,
95                        FLA_Cntl_sub_gemm4( cntl ) );
96 
97     FLA_Sylv_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
98                        isgn, A11, B11, C11, scale,
99                        FLA_Cntl_sub_sylv3( cntl ) );
100 
101     /*------------------------------------------------------------*/
102 
103     FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
104                                                      A10, A11, /**/ A12,
105                             /* ************** */  /* ****************** */
106                               &ABL, /**/ &ABR,       A20, A21, /**/ A22,
107                               FLA_TL );
108 
109     FLA_Cont_with_3x3_to_2x2( &BTL, /**/ &BTR,       B00, B01, /**/ B02,
110                                                      B10, B11, /**/ B12,
111                             /* ************** */  /* ****************** */
112                               &BBL, /**/ &BBR,       B20, B21, /**/ B22,
113                               FLA_TL );
114 
115     FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR,       C00, C01, /**/ C02,
116                                                      C10, C11, /**/ C12,
117                             /* ************** */  /* ****************** */
118                               &CBL, /**/ &CBR,       C20, C21, /**/ C22,
119                               FLA_TL );
120 
121   }
122 
123   return FLA_SUCCESS;
124 }
125 
126 #endif
127