1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
/*
   Implementation-type codes. time_Trinv_uu() compares its `type` argument
   against the BLOCKED, UNBLOCKED and UNB_OPT values to choose which
   flavour of a variant to run.
*/
#define FLA_ALG_REFERENCE  0
#define FLA_ALG_BLOCKED    1
#define FLA_ALG_UNBLOCKED  2
#define FLA_ALG_UNB_OPT    3
17
18
19 FLA_Error REF_Trinv_uu( FLA_Obj A );
20 void time_Trinv_uu(
21 int variant, int type, int nrepeats, int m, int nb_alg,
22 FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
23 double *dtime, double *diff, double *gflops );
24
25
/*
   time_Trinv_uu

   Times one implementation of the unit-diagonal, upper-triangular inverse
   (Trinv_uu) applied in place to A, and computes a residual for the result.

   Parameters
     variant   0 runs REF_Trinv_uu(); 1..4 select FLA_Trinv_uu_*_var1..4.
               Any other value runs nothing and prints "trouble".
     type      For variants 1..4: FLA_ALG_UNBLOCKED, FLA_ALG_UNB_OPT or
               FLA_ALG_BLOCKED picks the unb/opt/blk flavour. Any other
               value prints "trouble". Ignored for variant 0.
     nrepeats  Number of timed repetitions; the smallest time is kept.
               If nrepeats <= 0 nothing is run and the reported time is
               the 1.0e9 initial sentinel.
     m         Not referenced by this function.
     nb_alg    Block size handed to FLA_Blocksize_create() for the blocked
               variants' control tree.
     A         Matrix to invert. Reloaded from a private copy before every
               repetition and restored to its original contents on return.
     b, b_orig Vectors used for the residual check; both are restored to
               their original contents on return.
     norm      Object that receives the 2-norm of the residual.
     dtime     [out] best (minimum) time over all repetitions.
     diff      [out] value of norm copied into a plain double.
     gflops    [out] (1/3) * n^3 / dtime / 1e9 with n = FLA_Obj_length( A ),
               multiplied by 4 when A is complex.

   Residual: b := A_orig * ( A_result * b ) - b_orig, where A_result is what
   the last repetition left in A. NOTE(review): this is only a meaningful
   error measure if the caller passes b equal to b_orig -- confirm in the
   driver.
*/
void time_Trinv_uu(
       int variant, int type, int nrepeats, int m, int nb_alg,
       FLA_Obj A, FLA_Obj b, FLA_Obj b_orig, FLA_Obj norm,
       double *dtime, double *diff, double *gflops )
{
  int
    irep;

  double
    dtime_old = 1.0e9;   /* running minimum; large sentinel to start */

  FLA_Obj
    A_save, b_save, b_orig_save;

  fla_blocksize_t*
    bp;
  fla_trinv_t*
    cntl_trinv_var;
  fla_trinv_t*
    cntl_trinv_unb;
  fla_gemm_t*
    cntl_gemm_blas;
  fla_trmm_t*
    cntl_trmm_blas;
  fla_trsm_t*
    cntl_trsm_blas;


  /* Control tree for the blocked variants: the requested variant on top,
     with an unblocked-optimized variant 3 and BLAS-level subproblems
     (trmm, trsm, gemm) as its children. */
  bp             = FLA_Blocksize_create( nb_alg, nb_alg, nb_alg, nb_alg );
  cntl_trinv_unb = FLA_Cntl_trinv_obj_create( FLA_FLAT, FLA_UNB_OPT_VARIANT3, NULL, NULL, NULL, NULL, NULL, NULL );
  cntl_trmm_blas = FLA_Cntl_trmm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_trsm_blas = FLA_Cntl_trsm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL, NULL );
  cntl_gemm_blas = FLA_Cntl_gemm_obj_create( FLA_FLAT, FLA_SUBPROBLEM, NULL, NULL );
  cntl_trinv_var = FLA_Cntl_trinv_obj_create( FLA_FLAT, variant, bp, cntl_trinv_unb, cntl_trmm_blas, cntl_trsm_blas, cntl_trsm_blas, cntl_gemm_blas );

  /* Keep pristine copies so the operands can be reset before each run
     and handed back unchanged to the caller. */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b, &b_save );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, b_orig, &b_orig_save );

  FLA_Copy_external( A, A_save );
  FLA_Copy_external( b, b_save );
  FLA_Copy_external( b_orig, b_orig_save );


  for ( irep = 0 ; irep < nrepeats; irep++ )
  {
    /* The operation overwrites A, so reload it outside the timed region. */
    FLA_Copy_external( A_save, A );

    *dtime = FLA_Clock();

    switch( variant ){

    // Time reference
    case 0:
      REF_Trinv_uu( A );
      break;

    // Time variant 1
    case 1:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Trinv_uu_unb_var1( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Trinv_uu_opt_var1( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Trinv_uu_blk_var1( A, cntl_trinv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 2
    case 2:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Trinv_uu_unb_var2( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Trinv_uu_opt_var2( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Trinv_uu_blk_var2( A, cntl_trinv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 3
    case 3:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Trinv_uu_unb_var3( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Trinv_uu_opt_var3( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Trinv_uu_blk_var3( A, cntl_trinv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Time variant 4
    case 4:{
      switch( type ){
      case FLA_ALG_UNBLOCKED:
        FLA_Trinv_uu_unb_var4( A );
        break;
      case FLA_ALG_UNB_OPT:
        FLA_Trinv_uu_opt_var4( A );
        break;
      case FLA_ALG_BLOCKED:
        FLA_Trinv_uu_blk_var4( A, cntl_trinv_var );
        break;
      default:
        printf("trouble\n");
      }

      break;
    }

    // Unsupported variant: report it the same way an unsupported type is
    // reported, instead of silently timing an empty region.
    default:
      printf("trouble\n");
      break;

    }

    *dtime = FLA_Clock() - *dtime;
    dtime_old = min( *dtime, dtime_old );
  }

  FLA_Cntl_obj_free( cntl_trinv_var );
  FLA_Cntl_obj_free( cntl_trinv_unb );
  FLA_Cntl_obj_free( cntl_gemm_blas );
  FLA_Cntl_obj_free( cntl_trmm_blas );
  FLA_Cntl_obj_free( cntl_trsm_blas );
  FLA_Blocksize_free( bp );

  /* Residual check: apply the computed result and then the original matrix
     to b, subtract b_orig, and report the 2-norm of what is left. */
  {
    FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_UNIT_DIAG, A, b );

    FLA_Trmv_external( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                       FLA_UNIT_DIAG, A_save, b );

    FLA_Axpy_external( FLA_MINUS_ONE, b_orig, b );

    FLA_Nrm2_external( b, norm );
    FLA_Copy_object_to_buffer( FLA_NO_TRANSPOSE, 0, 0, norm,
                               1, 1, diff, 1, 1 );
  }

  /* Rate based on a (1/3) n^3 operation count and the best time seen. */
  *gflops = 1.0 / 3.0 *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) *
            FLA_Obj_length( A ) /
            dtime_old / 1e9;

  if ( FLA_Obj_is_complex( A ) )
    *gflops *= 4.0;

  *dtime = dtime_old;

  /* Hand the operands back exactly as they were received. */
  FLA_Copy_external( A_save, A );
  FLA_Copy_external( b_save, b );
  FLA_Copy_external( b_orig_save, b_orig );

  FLA_Obj_free( &A_save );
  FLA_Obj_free( &b_save );
  FLA_Obj_free( &b_orig_save );
}
205
206