1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 #define FLA_ALG_REFERENCE     0
14 #define FLA_ALG_UNBLOCKED     1
15 #define FLA_ALG_UNB_OPT       2
16 #define FLA_ALG_BLOCKED       3
17 
18 FLA_Error REF_Hevd_lv( FLA_Obj A, FLA_Obj l,
19                        double* dtime_tred, double* dtime_tevd, double* dtime_appq );
20 FLA_Error REF_Hevdd_lv( FLA_Obj A, FLA_Obj l,
21                         double* dtime_tred, double* dtime_tevd, double* dtime_appq );
22 FLA_Error REF_Hevdr_lv( FLA_Obj A, FLA_Obj l, FLA_Obj Z,
23                         double* dtime_tred, double* dtime_tevd, double* dtime_appq );
24 FLA_Error REF_Hevd_lv_components( FLA_Obj A, FLA_Obj l,
25                                   double* dtime_tred, double* dtime_tevd, double* dtime_appq );
26 FLA_Error REF_Hevdd_lv_components( FLA_Obj A, FLA_Obj l,
27                                    double* dtime_tred, double* dtime_tevd, double* dtime_appq );
28 FLA_Error REF_Hevdr_lv_components( FLA_Obj A, FLA_Obj l, FLA_Obj Z,
29                                    double* dtime_tred, double* dtime_tevd, double* dtime_appq );
30 FLA_Error FLA_Hevd_lv_var1_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
31                                        double* dtime_tred, double* dtime_tevd, double* dtime_appq );
32 FLA_Error FLA_Hevd_lv_var2_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
33                                        double* dtime_tred, double* dtime_tevd, double* dtime_appq );
34 FLA_Error FLA_Hevd_lv_var3_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
35                                        double* dtime_tred, double* dtime_tevd, double* dtime_appq );
36 FLA_Error FLA_Hevd_lv_var4_components( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg,
37                                        double* dtime_tred, double* dtime_tevd, double* dtime_appq );
38 
39 void time_Hevd_lv_components(
40                int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
41                FLA_Obj A, FLA_Obj l,
42                double* dtime, double* diff1, double* diff2, double* gflops,
43                double* dtime_tred, double* gflops_tred,
44                double* dtime_tevd, double* gflops_tevd,
45                double* dtime_appq, double* gflops_appq, int* k_perf );
46 
47 
time_Hevd_lv_components(int variant,int type,int n_repeats,int m,int n_iter_max,int k_accum,int b_alg,FLA_Obj A,FLA_Obj l,double * dtime,double * diff1,double * diff2,double * gflops,double * dtime_tred,double * gflops_tred,double * dtime_tevd,double * gflops_tevd,double * dtime_appq,double * gflops_appq,int * k_perf)48 void time_Hevd_lv_components(
49                int variant, int type, int n_repeats, int m, int n_iter_max, int k_accum, int b_alg,
50                FLA_Obj A, FLA_Obj l,
51                double* dtime, double* diff1, double* diff2, double* gflops,
52                double* dtime_tred, double* gflops_tred,
53                double* dtime_tevd, double* gflops_tevd,
54                double* dtime_appq, double* gflops_appq, int* k_perf )
55 {
56   int     i;
57   double  k;
58   double  dtime_save      = 1.0e9;
59   double  dtime_tred_save = 1.0e9;
60   double  dtime_tevd_save = 1.0e9;
61   double  dtime_appq_save = 1.0e9;
62   double  flops_tred;
63   double  flops_tevd;
64   double  flops_appq;
65   double  mult_tred;
66   double  mult_tevd;
67   double  mult_appq;
68 
69   FLA_Obj A_save, Z;
70 
71   if (
72        ( variant == -3 ) ||
73        ( variant == -4 ) ||
74        ( variant == -5 ) ||
75        //( variant == 0 ) ||
76        //( variant == -1 ) ||
77        //( variant == -2 ) ||
78        //( variant == 1 ) ||
79        //( variant == 2 ) ||
80        //( variant == 3 ) ||
81        //( variant == 4 ) ||
82        FALSE
83      )
84   {
85     *gflops      = 0.0;
86     *dtime       = 0.0;
87     *diff1       = 0.0;
88     *diff2       = 0.0;
89     *dtime_tred  = 0.0;
90     *dtime_tevd  = 0.0;
91     *dtime_appq  = 0.0;
92     *gflops_tred = 0.0;
93     *gflops_tevd = 0.0;
94     *gflops_appq = 0.0;
95     *k_perf      = 0;
96     return;
97   }
98 
99   FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
100   FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );
101 
102   FLA_Copy_external( A, A_save );
103 
104   for ( i = 0 ; i < n_repeats; i++ ){
105 
106     FLA_Copy_external( A_save, A );
107 
108     *dtime = FLA_Clock();
109 
110     switch( variant ){
111 
112     case -3:
113     {
114       *k_perf = 0;
115       REF_Hevd_lv( A, l,
116                    dtime_tred, dtime_tevd, dtime_appq );
117       break;
118     }
119 
120     case -4:
121     {
122       *k_perf = 0;
123       REF_Hevdd_lv( A, l,
124                     dtime_tred, dtime_tevd, dtime_appq );
125       break;
126     }
127 
128     case -5:
129     {
130       *k_perf = 0;
131       REF_Hevdr_lv( A, l, Z,
132                     dtime_tred, dtime_tevd, dtime_appq );
133       break;
134     }
135 
136     case 0:
137     {
138       *k_perf = 0;
139       REF_Hevd_lv_components( A, l,
140                               dtime_tred, dtime_tevd, dtime_appq );
141       break;
142     }
143 
144     case -1:
145     {
146       *k_perf = 0;
147       REF_Hevdd_lv_components( A, l,
148                                dtime_tred, dtime_tevd, dtime_appq );
149       break;
150     }
151 
152     case -2:
153     {
154       *k_perf = 0;
155       REF_Hevdr_lv_components( A, l, Z,
156                                dtime_tred, dtime_tevd, dtime_appq );
157       break;
158     }
159 
160     // Time variant 1
161     case 1:
162     {
163       *k_perf = FLA_Hevd_lv_var1_components( n_iter_max, A, l, k_accum, b_alg,
164                                              dtime_tred, dtime_tevd, dtime_appq );
165       break;
166     }
167 
168     // Time variant 2
169     case 2:
170     {
171       *k_perf = FLA_Hevd_lv_var2_components( n_iter_max, A, l, k_accum, b_alg,
172                                              dtime_tred, dtime_tevd, dtime_appq );
173       break;
174     }
175 
176     }
177 
178     *dtime = FLA_Clock() - *dtime;
179     if ( *dtime < dtime_save )
180     {
181       dtime_save      = *dtime;
182       dtime_tred_save = *dtime_tred;
183       dtime_tevd_save = *dtime_tevd;
184       dtime_appq_save = *dtime_appq;
185     }
186   }
187 
188   *dtime      = dtime_save;
189   *dtime_tred = dtime_tred_save;
190   *dtime_tevd = dtime_tevd_save;
191   *dtime_appq = dtime_appq_save;
192 
193 //if ( variant == -3 || variant == 0 )
194 //printf( "\ndtime is %9.3e\n", *dtime );
195 
196   {
197     FLA_Obj V, A_rev_evd, norm, eye;
198 
199     if ( variant == -2 || variant == -5 ) FLA_Copy( Z, A );
200 
201     FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &V );
202     FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_rev_evd );
203     FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &eye );
204     FLA_Obj_create( FLA_Obj_datatype_proj_to_real( A ), 1, 1, 0, 0, &norm );
205 
206     FLA_Apply_diag_matrix( FLA_RIGHT, FLA_NO_CONJUGATE, l, A );
207 
208     FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
209               FLA_ONE, A, V, FLA_ZERO, A_rev_evd );
210     FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_NONUNIT_DIAG, A_rev_evd );
211 
212 //FLA_Obj_show( "A_rev_evd", A_rev_evd, "%9.2e + %9.2e ", "" );
213 
214     FLA_Axpy( FLA_MINUS_ONE, A_save, A_rev_evd );
215     FLA_Norm_frob( A_rev_evd, norm );
216     FLA_Obj_extract_real_scalar( norm, diff1 );
217 
218     FLA_Set_to_identity( eye );
219 	FLA_Copy( V, A_rev_evd );
220     FLA_Gemm( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
221               FLA_ONE, V, A_rev_evd, FLA_MINUS_ONE, eye );
222     FLA_Norm_frob( eye, norm );
223     FLA_Obj_extract_real_scalar( norm, diff2 );
224 
225     FLA_Obj_free( &V );
226     FLA_Obj_free( &A_rev_evd );
227     FLA_Obj_free( &eye );
228     FLA_Obj_free( &norm );
229   }
230 
231   k = 2.00;
232 
233   flops_tred = ( ( 4.0 / 3.0 )   * m * m * m );
234   flops_tevd = (   4.5           * k * m * m     +
235                    3.0           * k * m * m * m );
236 
237   if ( variant == -1 || variant == -2 ||
238        variant == -4 || variant == -5 )
239     flops_appq = ( 2.0           * m * m * m );
240   else
241     flops_appq = ( 4.0 / 3.0     * m * m * m );
242 
243 /*
244   if ( FLA_Obj_is_complex( A ) )
245   {
246     *gflops      = ( 4.0 * flops_tred +
247                      2.0 * flops_tevd +
248                      4.0 * flops_appq ) / *dtime      / 1e9;
249 
250     *gflops_tred = ( 4.0 * flops_tred ) / *dtime_tred / 1e9;
251     *gflops_tevd = ( 2.0 * flops_tevd ) / *dtime_tevd / 1e9;
252     *gflops_appq = ( 4.0 * flops_appq ) / *dtime_appq / 1e9;
253   }
254   else
255   {
256     *gflops      = ( 1.0 * flops_tred +
257                      1.0 * flops_tevd +
258                      1.0 * flops_appq ) / *dtime      / 1e9;
259 
260     *gflops_tred = ( 1.0 * flops_tred ) / *dtime_tred / 1e9;
261     *gflops_tevd = ( 1.0 * flops_tevd ) / *dtime_tevd / 1e9;
262     *gflops_appq = ( 1.0 * flops_appq ) / *dtime_appq / 1e9;
263   }
264 */
265 
266   if ( FLA_Obj_is_complex( A ) )
267   {
268     mult_tred = 4.0;
269     mult_tevd = 2.0;
270     mult_appq = 4.0;
271   }
272   else
273   {
274     mult_tred = 1.0;
275     mult_tevd = 1.0;
276     mult_appq = 1.0;
277   }
278 
279   *gflops = ( mult_tred * flops_tred +
280               mult_tevd * flops_tevd +
281               mult_appq * flops_appq ) / *dtime / 1e9;
282 
283   *gflops_tred = ( mult_tred * flops_tred ) / *dtime_tred / 1e9;
284   *gflops_tevd = ( mult_tevd * flops_tevd ) / *dtime_tevd / 1e9;
285   *gflops_appq = ( mult_appq * flops_appq ) / *dtime_appq / 1e9;
286 
287   FLA_Copy_external( A_save, A );
288 
289   FLA_Obj_free( &A_save );
290   FLA_Obj_free( &Z );
291 }
292 
293