1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13 #define FLA_ALG_REFERENCE 0
14 #define FLA_ALG_UNBLOCKED 1
15 #define FLA_ALG_UNB_OPT 2
16 #define FLA_ALG_BLOCKED 3
17
18 FLA_Error REF_Hevd_lv( FLA_Obj A, FLA_Obj l,
19 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
20 FLA_Error REF_Hevdd_lv( FLA_Obj A, FLA_Obj l,
21 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
22 FLA_Error REF_Hevdr_lv( FLA_Obj A, FLA_Obj l, FLA_Obj Z,
23 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
24 FLA_Error REF_Hevd_lv_components( FLA_Obj A, FLA_Obj l,
25 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
26 FLA_Error REF_Hevdd_lv_components( FLA_Obj A, FLA_Obj l,
27 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
28 FLA_Error REF_Hevdr_lv_components( FLA_Obj A, FLA_Obj l, FLA_Obj Z,
29 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
30 FLA_Error FLA_Hevd_lv_var1_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
31 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
32 FLA_Error FLA_Hevd_lv_var2_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
33 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
34 FLA_Error FLA_Hevd_lv_var3_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
35 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
36 FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
37 double* dtime_tred, double* dtime_tevd, double* dtime_appq );
38
39 void time_Hevd_lv_components(
40 int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
41 FLA_Obj A, FLA_Obj l,
42 double* dtime, double* diff1, double* diff2, double* gflops,
43 double* dtime_tred, double* gflops_tred,
44 double* dtime_tevd, double* gflops_tevd,
45 double* dtime_appq, double* gflops_appq, int* k_perf );
46
47
time_Hevd_lv_components(int variant,int type,int n_repeats,int m,int n_iter_max,int k_accum,int b_alg,FLA_Obj A,FLA_Obj l,double * dtime,double * diff1,double * diff2,double * gflops,double * dtime_tred,double * gflops_tred,double * dtime_tevd,double * gflops_tevd,double * dtime_appq,double * gflops_appq,int * k_perf)48 void time_Hevd_lv_components(
49 int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
50 FLA_Obj A, FLA_Obj l,
51 double* dtime, double* diff1, double* diff2, double* gflops,
52 double* dtime_tred, double* gflops_tred,
53 double* dtime_tevd, double* gflops_tevd,
54 double* dtime_appq, double* gflops_appq, int* k_perf )
55 {
56 int i;
57 double k;
58 double dtime_save = 1.0e9;
59 double dtime_tred_save = 1.0e9;
60 double dtime_tevd_save = 1.0e9;
61 double dtime_appq_save = 1.0e9;
62 double flops_tred;
63 double flops_tevd;
64 double flops_appq;
65 double mult_tred;
66 double mult_tevd;
67 double mult_appq;
68
69 FLA_Obj A_save, Z;
70
71 if (
72 ( variant == -3 ) ||
73 ( variant == -4 ) ||
74 ( variant == -5 ) ||
75 //( variant == 0 ) ||
76 //( variant == -1 ) ||
77 //( variant == -2 ) ||
78 //( variant == 1 ) ||
79 //( variant == 2 ) ||
80 //( variant == 3 ) ||
81 //( variant == 4 ) ||
82 FALSE
83 )
84 {
85 *gflops = 0.0;
86 *dtime = 0.0;
87 *diff1 = 0.0;
88 *diff2 = 0.0;
89 *dtime_tred = 0.0;
90 *dtime_tevd = 0.0;
91 *dtime_appq = 0.0;
92 *gflops_tred = 0.0;
93 *gflops_tevd = 0.0;
94 *gflops_appq = 0.0;
95 *k_perf = 0;
96 return;
97 }
98
99 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
100 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );
101
102 FLA_Copy_external( A, A_save );
103
104 for ( i = 0 ; i < n_repeats; i++ ){
105
106 FLA_Copy_external( A_save, A );
107
108 *dtime = FLA_Clock();
109
110 switch( variant ){
111
112 case -3:
113 {
114 *k_perf = 0;
115 REF_Hevd_lv( A, l,
116 dtime_tred, dtime_tevd, dtime_appq );
117 break;
118 }
119
120 case -4:
121 {
122 *k_perf = 0;
123 REF_Hevdd_lv( A, l,
124 dtime_tred, dtime_tevd, dtime_appq );
125 break;
126 }
127
128 case -5:
129 {
130 *k_perf = 0;
131 REF_Hevdr_lv( A, l, Z,
132 dtime_tred, dtime_tevd, dtime_appq );
133 break;
134 }
135
136 case 0:
137 {
138 *k_perf = 0;
139 REF_Hevd_lv_components( A, l,
140 dtime_tred, dtime_tevd, dtime_appq );
141 break;
142 }
143
144 case -1:
145 {
146 *k_perf = 0;
147 REF_Hevdd_lv_components( A, l,
148 dtime_tred, dtime_tevd, dtime_appq );
149 break;
150 }
151
152 case -2:
153 {
154 *k_perf = 0;
155 REF_Hevdr_lv_components( A, l, Z,
156 dtime_tred, dtime_tevd, dtime_appq );
157 break;
158 }
159
160 // Time variant 1
161 case 1:
162 {
163 *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
164 dtime_tred, dtime_tevd, dtime_appq );
165 break;
166 }
167
168 // Time variant 2
169 case 2:
170 {
171 *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
172 dtime_tred, dtime_tevd, dtime_appq );
173 break;
174 }
175
176 }
177
178 *dtime = FLA_Clock() - *dtime;
179 if ( *dtime < dtime_save )
180 {
181 dtime_save = *dtime;
182 dtime_tred_save = *dtime_tred;
183 dtime_tevd_save = *dtime_tevd;
184 dtime_appq_save = *dtime_appq;
185 }
186 }
187
188 *dtime = dtime_save;
189 *dtime_tred = dtime_tred_save;
190 *dtime_tevd = dtime_tevd_save;
191 *dtime_appq = dtime_appq_save;
192
193 //if ( variant == -3 || variant == 0 )
194 //printf( "\ndtime is %9.3e\n", *dtime );
195
196 {
197 FLA_Obj V, A_rev_evd, norm, eye;
198
199 if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );
200
201 FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
202 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
203 FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
204 FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );
205
206 FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
207
208 FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
209 FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
210 FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );
211
212 //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
213
214 FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd );
215 FLA_Norm_frob( A_rev_evd, norm );
216 FLA_Obj_extract_real_scalar( norm, diff1 );
217
218 FLA_Set_to_identity( eye );
219 FLA_Copy( V, A_rev_evd );
220 FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
221 FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
222 FLA_Norm_frob( eye, norm );
223 FLA_Obj_extract_real_scalar( norm, diff2 );
224
225 FLA_Obj_free( &V );
226 FLA_Obj_free( &A_rev_evd );
227 FLA_Obj_free( &eye );
228 FLA_Obj_free( &norm );
229 }
230
231 k = 2.00;
232
233 flops_tred = ( ( 4.0 / 3.0 ) * m * m * m );
234 flops_tevd = ( 4.5 * k * m * m +
235 3.0 * k * m * m * m );
236
237 if ( variant == -1 || variant == -2 ||
238 variant == -4 || variant == -5 )
239 flops_appq = ( 2.0 * m * m * m );
240 else
241 flops_appq = ( 4.0 / 3.0 * m * m * m );
242
243 /*
244 if ( FLA_Obj_is_complex( A ) )
245 {
246 *gflops = ( 4.0 * flops_tred +
247 2.0 * flops_tevd +
248 4.0 * flops_appq ) / *dtime / 1e9;
249
250 *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9;
251 *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9;
252 *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9;
253 }
254 else
255 {
256 *gflops = ( 1.0 * flops_tred +
257 1.0 * flops_tevd +
258 1.0 * flops_appq ) / *dtime / 1e9;
259
260 *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9;
261 *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9;
262 *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9;
263 }
264 */
265
266 if ( FLA_Obj_is_complex( A ) )
267 {
268 mult_tred = 4.0;
269 mult_tevd = 2.0;
270 mult_appq = 4.0;
271 }
272 else
273 {
274 mult_tred = 1.0;
275 mult_tevd = 1.0;
276 mult_appq = 1.0;
277 }
278
279 *gflops = ( mult_tred * flops_tred +
280 mult_tevd * flops_tevd +
281 mult_appq * flops_appq ) / *dtime / 1e9;
282
283 *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
284 *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
285 *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;
286
287 FLA_Copy_external( A_save, A );
288
289 FLA_Obj_free( &A_save );
290 FLA_Obj_free( &Z );
291 }
292
293