1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13 #define FLA_ALG_REFERENCE 0
14 #define FLA_ALG_BLOCKED 1
15 #define FLA_ALG_UNBLOCKED 2
16 #define FLA_ALG_UNB_OPT 3
17
18
19 FLA_Error REF_Trinv_lu( FLA_Obj A );
20 void time_Trinv_lu(
21 int variant, int type, int nrepeats, int m, int nb_alg,
22 FLA_Obj A, FLA_Obj x, FLA_Obj b, FLA_Obj norm,
23 double *dtime, double *diff, double *gflops );
24
25
time_Trinv_lu(int variant,int type,int nrepeats,int m,int nb_alg,FLA_Obj A,FLA_Obj x,FLA_Obj b,FLA_Obj norm,double * dtime,double * diff,double * gflops)26 void time_Trinv_lu(
27 int variant, int type, int nrepeats, int m, int nb_alg,
28 FLA_Obj A, FLA_Obj x, FLA_Obj b, FLA_Obj norm,
29 double *dtime, double *diff, double *gflops )
30 {
31 int
32 irep;
33
34 double
35 dtime_old = 1.0e9;
36
37 FLA_Obj
38 A_save, b_save;
39
40 fla_blocksize_t*
41 bp;
42 fla_trinv_t*
43 cntl_trinv_var;
44 fla_trinv_t*
45 cntl_trinv_unb;
46 fla_gemm_t*
47 cntl_gemm_blas;
48 fla_trmm_t*
49 cntl_trmm_blas;
50 fla_trsm_t*
51 cntl_trsm_blas;
52
53
54 bp = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
55 cntl_trinv_unb = FLA_Cntl_trinv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT3, NULL, NULL, NULL, NULL, NULL, NULL );
56 cntl_trmm_blas = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
57 cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
58 cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
59 cntl_trinv_var = FLA_Cntl_trinv_obj_create( FLA_FLAT, variant, bp, cntl_trinv_unb, cntl_trmm_blas, cntl_trsm_blas, cntl_trsm_blas, cntl_gemm_blas );
60
61 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
62 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
63
64 FLA_Copy_external( A, A_save );
65 FLA_Copy_external( b, b_save );
66
67 for ( irep = 0 ; irep < nrepeats; irep++ )
68 {
69 FLA_Copy_external( A_save, A );
70
71 *dtime = FLA_Clock();
72
73 switch( variant ){
74
75 // Time reference
76 case 0:
77 REF_Trinv_lu( A );
78 break;
79
80 // Time variant 1
81 case 1:{
82 switch( type ){
83 case FLA_ALG_UNBLOCKED:
84 FLA_Trinv_lu_unb_var1( A );
85 break;
86 case FLA_ALG_UNB_OPT:
87 FLA_Trinv_lu_opt_var1( A );
88 break;
89 case FLA_ALG_BLOCKED:
90 FLA_Trinv_lu_blk_var1( A, cntl_trinv_var );
91 break;
92 default:
93 printf("trouble\n");
94 }
95
96 break;
97 }
98
99 // Time variant 2
100 case 2:{
101 switch( type ){
102 case FLA_ALG_UNBLOCKED:
103 FLA_Trinv_lu_unb_var2( A );
104 break;
105 case FLA_ALG_UNB_OPT:
106 FLA_Trinv_lu_opt_var2( A );
107 break;
108 case FLA_ALG_BLOCKED:
109 FLA_Trinv_lu_blk_var2( A, cntl_trinv_var );
110 break;
111 default:
112 printf("trouble\n");
113 }
114
115 break;
116 }
117
118 // Time variant 3
119 case 3:{
120 switch( type ){
121 case FLA_ALG_UNBLOCKED:
122 FLA_Trinv_lu_unb_var3( A );
123 break;
124 case FLA_ALG_UNB_OPT:
125 FLA_Trinv_lu_opt_var3( A );
126 break;
127 case FLA_ALG_BLOCKED:
128 FLA_Trinv_lu_blk_var3( A, cntl_trinv_var );
129 break;
130 default:
131 printf("trouble\n");
132 }
133
134 break;
135 }
136
137 // Time variant 4
138 case 4:{
139 switch( type ){
140 case FLA_ALG_UNBLOCKED:
141 FLA_Trinv_lu_unb_var4( A );
142 break;
143 case FLA_ALG_UNB_OPT:
144 FLA_Trinv_lu_opt_var4( A );
145 break;
146 case FLA_ALG_BLOCKED:
147 FLA_Trinv_lu_blk_var4( A, cntl_trinv_var );
148 break;
149 default:
150 printf("trouble\n");
151 }
152
153 break;
154 }
155
156 }
157
158 *dtime = FLA_Clock() - *dtime;
159 dtime_old = min( *dtime, dtime_old );
160 }
161
162 FLA_Cntl_obj_free( cntl_trinv_var );
163 FLA_Cntl_obj_free( cntl_trinv_unb );
164 FLA_Cntl_obj_free( cntl_gemm_blas );
165 FLA_Cntl_obj_free( cntl_trmm_blas );
166 FLA_Cntl_obj_free( cntl_trsm_blas );
167 FLA_Blocksize_free( bp );
168
169 {
170 FLA_Copy_external( b, x );
171 FLA_Trmv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
172 FLA_UNIT_DIAG, A, x );
173
174 FLA_Trmv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
175 FLA_UNIT_DIAG, A_save, x );
176
177 FLA_Axpy_external( FLA_MINUS_ONE, x, b );
178
179 FLA_Nrm2_external( b, norm );
180 FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm,
181 1, 1, diff, 1, 1 );
182 }
183
184 *gflops = 1.0 / 3.0 *
185 FLA_Obj_length( A ) *
186 FLA_Obj_length( A ) *
187 FLA_Obj_length( A ) /
188 dtime_old / 1e9;
189
190 if ( FLA_Obj_is_complex( A ) )
191 *gflops *= 4.0;
192
193 *dtime = dtime_old;
194
195 FLA_Copy_external( A_save, A );
196 FLA_Copy_external( b_save, b );
197
198 FLA_Obj_free( &A_save );
199 FLA_Obj_free( &b_save );
200 }
201
202