1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
FLA_Tevd_n_opt_var1(dim_t n_iter_max,FLA_Obj d,FLA_Obj e,FLA_Obj G,FLA_Obj U)13 FLA_Error FLA_Tevd_n_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj U )
14 {
15 	FLA_Error    r_val = FLA_SUCCESS;
16 	FLA_Datatype datatype;
17 	int          m_A, m_U, n_G;
18 	int          inc_d;
19 	int          inc_e;
20 	int          rs_G, cs_G;
21 
22 	datatype = FLA_Obj_datatype( U );
23 
24 	m_A       = FLA_Obj_vector_dim( d );
25 	m_U       = FLA_Obj_vector_dim( d );
26 	n_G       = FLA_Obj_width( G );
27 
28 	inc_d     = FLA_Obj_vector_inc( d );
29 	inc_e     = FLA_Obj_vector_inc( e );
30 
31 	rs_G      = FLA_Obj_row_stride( G );
32 	cs_G      = FLA_Obj_col_stride( G );
33 
34 /*
35 FLA_Obj de, deL, deR, deLT, deLB;
36 FLA_Obj_create( FLA_DOUBLE, m_A, 2, 0, 0, &de );
37 FLA_Part_1x2( de,  &deL, &deR,   1, FLA_LEFT );
38 FLA_Part_2x1( deL, &deLT,
39                    &deLB,   1, FLA_BOTTOM );
40 FLA_Copy( d, deR );
41 FLA_Copy( e, deLT );
42 FLA_Set( FLA_ZERO, deLB );
43 //FLA_Obj_show( "de", de, "%21.17e", "" );
44 */
45 
46 	switch ( datatype )
47 	{
48 		case FLA_FLOAT:
49 		{
50 			float*    buff_d = FLA_FLOAT_PTR( d );
51 			float*    buff_e = FLA_FLOAT_PTR( e );
52 			scomplex* buff_G = FLA_COMPLEX_PTR( G );
53 
54 			r_val = FLA_Tevd_n_ops_var1( m_A,
55 			                             m_U,
56 			                             n_G,
57 			                             n_iter_max,
58 			                             buff_d, inc_d,
59 			                             buff_e, inc_e,
60 			                             buff_G, rs_G, cs_G );
61 
62 			break;
63 		}
64 
65 		case FLA_DOUBLE:
66 		{
67 			double*   buff_d = FLA_DOUBLE_PTR( d );
68 			double*   buff_e = FLA_DOUBLE_PTR( e );
69 			dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
70 
71 			r_val = FLA_Tevd_n_opd_var1( m_A,
72 			                             m_U,
73 			                             n_G,
74 			                             n_iter_max,
75 			                             buff_d, inc_d,
76 			                             buff_e, inc_e,
77 			                             buff_G, rs_G, cs_G );
78 
79 			break;
80 		}
81 
82 		case FLA_COMPLEX:
83 		{
84 			float*    buff_d = FLA_FLOAT_PTR( d );
85 			float*    buff_e = FLA_FLOAT_PTR( e );
86 			scomplex* buff_G = FLA_COMPLEX_PTR( G );
87 
88 			r_val = FLA_Tevd_n_opc_var1( m_A,
89 			                             m_U,
90 			                             n_G,
91 			                             n_iter_max,
92 			                             buff_d, inc_d,
93 			                             buff_e, inc_e,
94 			                             buff_G, rs_G, cs_G );
95 
96 			break;
97 		}
98 
99 		case FLA_DOUBLE_COMPLEX:
100 		{
101 			double*   buff_d = FLA_DOUBLE_PTR( d );
102 			double*   buff_e = FLA_DOUBLE_PTR( e );
103 			dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
104 
105 			r_val = FLA_Tevd_n_opz_var1( m_A,
106 			                             m_U,
107 			                             n_G,
108 			                             n_iter_max,
109 			                             buff_d, inc_d,
110 			                             buff_e, inc_e,
111 			                             buff_G, rs_G, cs_G );
112 
113 			break;
114 		}
115 	}
116 /*
117 FLA_Copy( d, deR );
118 FLA_Copy( e, deLT );
119 FLA_Set( FLA_ZERO, deLB );
120 FLA_Sort( FLA_FORWARD, deR );
121 FLA_Obj_show( "de after", de, "%21.17e", "" );
122 double eps   = FLA_Mach_params_opd( FLA_MACH_EPS );
123 printf( "epsilon = %21.17e\n", eps );
124 FLA_Obj_free( &de );
125 */
126 	return r_val;
127 }
128 
129 
130 
FLA_Tevd_n_ops_var1(int m_A,int m_U,int n_G,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G)131 FLA_Error FLA_Tevd_n_ops_var1( int       m_A,
132                                int       m_U,
133                                int       n_G,
134                                int       n_iter_max,
135                                float*    buff_d, int inc_d,
136                                float*    buff_e, int inc_e,
137                                scomplex* buff_G, int rs_G, int cs_G )
138 {
139 	return FLA_SUCCESS;
140 }
141 
142 
143 
FLA_Tevd_n_opd_var1(int m_A,int m_U,int n_G,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G)144 FLA_Error FLA_Tevd_n_opd_var1( int       m_A,
145                                int       m_U,
146                                int       n_G,
147                                int       n_iter_max,
148                                double*   buff_d, int inc_d,
149                                double*   buff_e, int inc_e,
150                                dcomplex* buff_G, int rs_G, int cs_G )
151 {
152 	return FLA_SUCCESS;
153 }
154 
FLA_Tevd_n_opc_var1(int m_A,int m_U,int n_G,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G)155 FLA_Error FLA_Tevd_n_opc_var1( int       m_A,
156                                int       m_U,
157                                int       n_G,
158                                int       n_iter_max,
159                                float*    buff_d, int inc_d,
160                                float*    buff_e, int inc_e,
161                                scomplex* buff_G, int rs_G, int cs_G )
162 {
163 	return FLA_SUCCESS;
164 }
165 
166 //#define PRINTF
167 
FLA_Tevd_n_opz_var1(int m_A,int m_U,int n_G,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G)168 FLA_Error FLA_Tevd_n_opz_var1( int       m_A,
169                                int       m_U,
170                                int       n_G,
171                                int       n_iter_max,
172                                double*   buff_d, int inc_d,
173                                double*   buff_e, int inc_e,
174                                dcomplex* buff_G, int rs_G, int cs_G )
175 {
176 	dcomplex  one  = bl1_z1();
177 	double    rone = bl1_d1();
178 
179 	double    eps;
180 	double    eps2;
181 	double    safmin;
182 	double    ssfmin;
183 	double    safmax;
184 	double    ssfmax;
185 
186 	dcomplex* G;
187 	double*   d1;
188 	double*   e1;
189 	int       r_val;
190 	int       done;
191 	int       m_G_sweep_max;
192 	int       ij_begin;
193 	int       ijTL, ijBR;
194 	int       m_A11;
195 	int       n_iter_perf;
196 	int       total_deflations;
197 	int       n_deflations;
198 	int       n_iter_prev;
199 	int       n_iter_perf_sweep_max;
200 
201 	// Initialize some numerical constants.
202 	eps    = FLA_Mach_params_opd( FLA_MACH_EPS );
203 	eps2   = FLA_Mach_params_opd( FLA_MACH_EPS2 );
204 	safmin = FLA_Mach_params_opd( FLA_MACH_SFMIN );
205 	safmax = rone / safmin;
206 	ssfmax = sqrt( safmax ) / 3.0;
207 	ssfmin = sqrt( safmin ) / eps2;
208 
209 	// Initialize our completion flag.
210 	done = FALSE;
211 
212 	// Initialize a counter that holds the maximum number of rows of G
213 	// that we would need to initialize for the next sweep.
214 	m_G_sweep_max = m_A - 1;
215 
216 	// Initialize a counter for the total number of iterations performed.
217 	n_iter_prev = 0;
218 
219 	// Iterate until the matrix has completely deflated.
220 	for ( total_deflations = 0; done != TRUE; )
221 	{
222 
223 		// Initialize G to contain only identity rotations.
224 		bl1_zsetm( m_G_sweep_max,
225 		           n_G,
226 		           &one,
227 		           buff_G, rs_G, cs_G );
228 
229 		// Keep track of the maximum number of iterations performed in the
230 		// current sweep. This is used when applying the sweep's Givens
231 		// rotations.
232 		n_iter_perf_sweep_max = 0;
233 
234 		// Perform a sweep: Move through the matrix and perform a tridiagonal
235 		// EVD on each non-zero submatrix that is encountered. During the
236 		// first time through, ijTL will be 0 and ijBR will be m_A - 1.
237 		for ( ij_begin = 0; ij_begin < m_A; )
238 		{
239 
240 #ifdef PRINTF
241 if ( ij_begin == 0 )
242 printf( "FLA_Tevd_n_opz_var1: beginning new sweep (ij_begin = %d)\n", ij_begin );
243 #endif
244 
245 			// Search for the first submatrix along the diagonal that is
246 			// bounded by zeroes (or endpoints of the matrix). If no
247 			// submatrix is found (ie: if the entire subdiagonal is zero
248 			// then FLA_FAILURE is returned. This function also inspects
249 			// subdiagonal elements for proximity to zero. If a given
250 			// element is close enough to zero, then it is deemed
251 			// converged and manually set to zero.
252 			r_val = FLA_Tevd_find_submatrix_opd( m_A,
253 			                                     ij_begin,
254 			                                     buff_d, inc_d,
255 			                                     buff_e, inc_e,
256 			                                     &ijTL,
257 			                                     &ijBR );
258 
259 			// Verify that a submatrix was found. If one was not found,
260 			// then we are done with the current sweep. Furthermore, if
261 			// a submatrix was not found AND we began our search at the
262 			// beginning of the matrix (ie: ij_begin == 0), then the
263 			// matrix has completely deflated and so we are done with
264 			// Francis step iteration.
265 			if ( r_val == FLA_FAILURE )
266 			{
267 				if ( ij_begin == 0 )
268 				{
269 #ifdef PRINTF
270 printf( "FLA_Tevd_n_opz_var1: subdiagonal is completely zero.\n" );
271 printf( "FLA_Tevd_n_opz_var1: Francis iteration is done!\n" );
272 #endif
273 					done = TRUE;
274 				}
275 
276 				// Break out of the current sweep so we can apply the last
277 				// remaining Givens rotations.
278 				break;
279 			}
280 
281 			// If we got this far, then:
282 			//   (a) ijTL refers to the index of the first non-zero
283 			//       subdiagonal along the diagonal, and
284 			//   (b) ijBR refers to either:
285 			//       - the first zero element that occurs after ijTL, or
286 			//       - the the last diagonal element.
287 			// Note that ijTL and ijBR also correspond to the first and
288 			// last diagonal elements of the submatrix of interest. Thus,
289 			// we may compute the dimension of this submatrix as:
290 			m_A11 = ijBR - ijTL + 1;
291 
292 #ifdef PRINTF
293 printf( "FLA_Tevd_n_opz_var1: ij_begin = %d\n", ij_begin );
294 printf( "FLA_Tevd_n_opz_var1: ijTL     = %d\n", ijTL );
295 printf( "FLA_Tevd_n_opz_var1: ijBR     = %d\n", ijBR );
296 printf( "FLA_Tevd_n_opz_var1: m_A11    = %d\n", m_A11 );
297 #endif
298 
299 			// Adjust ij_begin, which gets us ready for the next submatrix
300 			// search in the current sweep.
301 			ij_begin = ijBR + 1;
302 
303 			// Index to the submatrices upon which we will operate.
304 			d1 = buff_d + ijTL * inc_d;
305 			e1 = buff_e + ijTL * inc_e;
306 			G  = buff_G + ijTL * rs_G;
307 
308 			// Compute the 1-norm (which equals the infinity norm since the
309 			// matrix is tridiagonal and symmetric) and perform appropriate
310 			// scaling if necessary.
311 /*
312 			FLA_Norm1_tridiag( m_A11,
313 			                   d1, inc_d,
314 			                   e1, inc_e,
315 			                   &norm );
316 */
317 
318 			// Search for a batch of eigenvalues, recursing on deflated
319 			// subproblems whenever a split occurs. Iteration continues
320 			// as long as:
321 			//   (a) there is still matrix left to operate on, and
322 			//   (b) the number of iterations performed in this batch is
323 			//       less than n_G.
324 			// If/when either of the two above conditions fails to hold,
325 			// the function returns.
326 			n_deflations = FLA_Tevd_iteracc_n_opd_var1( m_A11,
327 			                                            n_G,
328 			                                            ijTL,
329 			                                            d1, inc_d,
330 			                                            e1, inc_e,
331 			                                            &n_iter_perf );
332 
333 			// Record the number of deflations that were observed.
334 			total_deflations += n_deflations;
335 
336 			// Update the maximum number of iterations performed in the
337 			// current sweep.
338 			n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
339 
340 #ifdef PRINTF
341 printf( "FLA_Tevd_n_opz_var1: deflations observed       = %d\n", n_deflations );
342 printf( "FLA_Tevd_n_opz_var1: total deflations observed = %d\n", total_deflations );
343 printf( "FLA_Tevd_n_opz_var1: num iterations performed  = %d\n", n_iter_perf );
344 #endif
345 
346 			// Store the most recent value of ijBR in m_G_sweep_max.
347 			// When the sweep is done, this value will contain the minimum
348 			// number of rows of G we can apply and safely include all
349 			// non-identity rotations that were computed during the
350 			// eigenvalue searches.
351 			m_G_sweep_max = ijBR;
352 
353 			// Make sure we haven't exceeded our maximum iteration count.
354 			if ( n_iter_prev >= m_A * n_iter_max )
355 			{
356 #ifdef PRINTF
357 printf( "FLA_Tevd_n_opz_var1: reached maximum total number of iterations: %d\n", n_iter_prev );
358 #endif
359 				FLA_Abort();
360 				//return FLA_FAILURE;
361 			}
362 		}
363 
364 		// The sweep is complete.
365 
366 		// Increment the total number of iterations previously performed.
367 		n_iter_prev += n_iter_perf_sweep_max;
368 
369 #ifdef PRINTF
370 printf( "FLA_Tevd_n_opz_var1: total number of iterations performed: %d\n", n_iter_prev );
371 #endif
372 	}
373 
374 	//return FLA_SUCCESS;
375 	return n_iter_prev;
376 }
377 
378