1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13 // Note that this operation is designed for tall rectangular matrix A.
14 // If m_A < n_A then, U and V should be swapped before entering this function.
FLA_Bsvd_v_opt_var1(dim_t n_iter_max,FLA_Obj d,FLA_Obj e,FLA_Obj G,FLA_Obj H,FLA_Obj U,FLA_Obj V,dim_t b_alg)15 FLA_Error FLA_Bsvd_v_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, FLA_Obj U, FLA_Obj V, dim_t b_alg )
16 {
17 FLA_Error r_val = FLA_SUCCESS;
18 FLA_Datatype datatype;
19 int m_U, m_V, n_GH;
20 int inc_d;
21 int inc_e;
22 int rs_G, cs_G;
23 int rs_H, cs_H;
24 int rs_U, cs_U;
25 int rs_V, cs_V;
26
27 datatype = FLA_Obj_datatype( U );
28
29 m_U = FLA_Obj_length( U );
30 m_V = FLA_Obj_length( V );
31 n_GH = FLA_Obj_width( G );
32
33 inc_d = FLA_Obj_vector_inc( d );
34 inc_e = FLA_Obj_vector_inc( e );
35
36 rs_G = FLA_Obj_row_stride( G );
37 cs_G = FLA_Obj_col_stride( G );
38
39 rs_H = FLA_Obj_row_stride( H );
40 cs_H = FLA_Obj_col_stride( H );
41
42 rs_U = FLA_Obj_row_stride( U );
43 cs_U = FLA_Obj_col_stride( U );
44
45 rs_V = FLA_Obj_row_stride( V );
46 cs_V = FLA_Obj_col_stride( V );
47
48
49 switch ( datatype )
50 {
51 case FLA_FLOAT:
52 {
53 float* buff_d = FLA_FLOAT_PTR( d );
54 float* buff_e = FLA_FLOAT_PTR( e );
55 scomplex* buff_G = FLA_COMPLEX_PTR( G );
56 scomplex* buff_H = FLA_COMPLEX_PTR( H );
57 float* buff_U = FLA_FLOAT_PTR( U );
58 float* buff_V = FLA_FLOAT_PTR( V );
59
60 r_val = FLA_Bsvd_v_ops_var1( min( m_U, m_V ),
61 m_U,
62 m_V,
63 n_GH,
64 n_iter_max,
65 buff_d, inc_d,
66 buff_e, inc_e,
67 buff_G, rs_G, cs_G,
68 buff_H, rs_H, cs_H,
69 buff_U, rs_U, cs_U,
70 buff_V, rs_V, cs_V,
71 b_alg );
72
73 break;
74 }
75
76 case FLA_DOUBLE:
77 {
78 double* buff_d = FLA_DOUBLE_PTR( d );
79 double* buff_e = FLA_DOUBLE_PTR( e );
80 dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
81 dcomplex* buff_H = FLA_DOUBLE_COMPLEX_PTR( H );
82 double* buff_U = FLA_DOUBLE_PTR( U );
83 double* buff_V = FLA_DOUBLE_PTR( V );
84
85 r_val = FLA_Bsvd_v_opd_var1( min( m_U, m_V ),
86 m_U,
87 m_V,
88 n_GH,
89 n_iter_max,
90 buff_d, inc_d,
91 buff_e, inc_e,
92 buff_G, rs_G, cs_G,
93 buff_H, rs_H, cs_H,
94 buff_U, rs_U, cs_U,
95 buff_V, rs_V, cs_V,
96 b_alg );
97
98 break;
99 }
100
101 case FLA_COMPLEX:
102 {
103 float* buff_d = FLA_FLOAT_PTR( d );
104 float* buff_e = FLA_FLOAT_PTR( e );
105 scomplex* buff_G = FLA_COMPLEX_PTR( G );
106 scomplex* buff_H = FLA_COMPLEX_PTR( H );
107 scomplex* buff_U = FLA_COMPLEX_PTR( U );
108 scomplex* buff_V = FLA_COMPLEX_PTR( V );
109
110 r_val = FLA_Bsvd_v_opc_var1( min( m_U, m_V ),
111 m_U,
112 m_V,
113 n_GH,
114 n_iter_max,
115 buff_d, inc_d,
116 buff_e, inc_e,
117 buff_G, rs_G, cs_G,
118 buff_H, rs_H, cs_H,
119 buff_U, rs_U, cs_U,
120 buff_V, rs_V, cs_V,
121 b_alg );
122
123 break;
124 }
125
126 case FLA_DOUBLE_COMPLEX:
127 {
128 double* buff_d = FLA_DOUBLE_PTR( d );
129 double* buff_e = FLA_DOUBLE_PTR( e );
130 dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
131 dcomplex* buff_H = FLA_DOUBLE_COMPLEX_PTR( H );
132 dcomplex* buff_U = FLA_DOUBLE_COMPLEX_PTR( U );
133 dcomplex* buff_V = FLA_DOUBLE_COMPLEX_PTR( V );
134
135 r_val = FLA_Bsvd_v_opz_var1( min( m_U, m_V ),
136 m_U,
137 m_V,
138 n_GH,
139 n_iter_max,
140 buff_d, inc_d,
141 buff_e, inc_e,
142 buff_G, rs_G, cs_G,
143 buff_H, rs_H, cs_H,
144 buff_U, rs_U, cs_U,
145 buff_V, rs_V, cs_V,
146 b_alg );
147
148 break;
149 }
150 }
151
152 return r_val;
153 }
154
155
156
FLA_Bsvd_v_ops_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_H,int rs_H,int cs_H,float * buff_U,int rs_U,int cs_U,float * buff_V,int rs_V,int cs_V,int b_alg)157 FLA_Error FLA_Bsvd_v_ops_var1( int min_m_n,
158 int m_U,
159 int m_V,
160 int n_GH,
161 int n_iter_max,
162 float* buff_d, int inc_d,
163 float* buff_e, int inc_e,
164 scomplex* buff_G, int rs_G, int cs_G,
165 scomplex* buff_H, int rs_H, int cs_H,
166 float* buff_U, int rs_U, int cs_U,
167 float* buff_V, int rs_V, int cs_V,
168 int b_alg )
169 {
170 scomplex one = bl1_c1();
171 float rzero = bl1_s0();
172
173 int maxitr = 6;
174
175 float eps;
176 float tolmul;
177 float tol;
178 float thresh;
179
180 scomplex* G;
181 scomplex* H;
182 float* d1;
183 float* e1;
184 int r_val;
185 int done;
186 int m_GH_sweep_max;
187 int ij_begin;
188 int ijTL, ijBR;
189 int m_A11;
190 int n_iter_perf;
191 int n_UV_apply;
192 int total_deflations;
193 int n_deflations;
194 int n_iter_prev;
195 int n_iter_perf_sweep_max;
196
197 // Compute some convergence constants.
198 eps = FLA_Mach_params_ops( FLA_MACH_EPS );
199 tolmul = max( 10.0F, min( 100.0F, powf( eps, -0.125F ) ) );
200 FLA_Bsvd_compute_tol_thresh_ops( min_m_n,
201 tolmul,
202 maxitr,
203 buff_d, inc_d,
204 buff_e, inc_e,
205 &tol,
206 &thresh );
207
208 // Initialize our completion flag.
209 done = FALSE;
210
211 // Initialize a counter that holds the maximum number of rows of G
212 // and H that we would need to initialize for the next sweep.
213 m_GH_sweep_max = min_m_n - 1;
214
215 // Initialize a counter for the total number of iterations performed.
216 n_iter_prev = 0;
217
218 // Iterate until the matrix has completely deflated.
219 for ( total_deflations = 0; done != TRUE; )
220 {
221
222 // Initialize G and H to contain only identity rotations.
223 bl1_csetm( m_GH_sweep_max,
224 n_GH,
225 &one,
226 buff_G, rs_G, cs_G );
227 bl1_csetm( m_GH_sweep_max,
228 n_GH,
229 &one,
230 buff_H, rs_H, cs_H );
231
232 // Keep track of the maximum number of iterations performed in the
233 // current sweep. This is used when applying the sweep's Givens
234 // rotations.
235 n_iter_perf_sweep_max = 0;
236
237 // Perform a sweep: Move through the matrix and perform a bidiagonal
238 // SVD on each non-zero submatrix that is encountered. During the
239 // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
240 for ( ij_begin = 0; ij_begin < min_m_n; )
241 {
242
243 // Search for the first submatrix along the diagonal that is
244 // bounded by zeroes (or endpoints of the matrix). If no
245 // submatrix is found (ie: if the entire superdiagonal is zero
246 // then FLA_FAILURE is returned. This function also inspects
247 // superdiagonal elements for proximity to zero. If a given
248 // element is close enough to zero, then it is deemed
249 // converged and manually set to zero.
250 r_val = FLA_Bsvd_find_submatrix_ops( min_m_n,
251 ij_begin,
252 buff_d, inc_d,
253 buff_e, inc_e,
254 &ijTL,
255 &ijBR );
256
257 // Verify that a submatrix was found. If one was not found,
258 // then we are done with the current sweep. Furthermore, if
259 // a submatrix was not found AND we began our search at the
260 // beginning of the matrix (ie: ij_begin == 0), then the
261 // matrix has completely deflated and so we are done with
262 // Francis step iteration.
263 if ( r_val == FLA_FAILURE )
264 {
265 if ( ij_begin == 0 )
266 {
267 done = TRUE;
268 }
269
270 // Break out of the current sweep so we can apply the last
271 // remaining Givens rotations.
272 break;
273 }
274
275 // If we got this far, then:
276 // (a) ijTL refers to the index of the first non-zero
277 // superdiagonal along the diagonal, and
278 // (b) ijBR refers to either:
279 // - the first zero element that occurs after ijTL, or
280 // - the the last diagonal element.
281 // Note that ijTL and ijBR also correspond to the first and
282 // last diagonal elements of the submatrix of interest. Thus,
283 // we may compute the dimension of this submatrix as:
284 m_A11 = ijBR - ijTL + 1;
285
286 // Adjust ij_begin, which gets us ready for the next submatrix
287 // search in the current sweep.
288 ij_begin = ijBR + 1;
289
290 // Index to the submatrices upon which we will operate.
291 d1 = buff_d + ijTL * inc_d;
292 e1 = buff_e + ijTL * inc_e;
293 G = buff_G + ijTL * rs_G;
294 H = buff_H + ijTL * rs_H;
295
296 // Search for a batch of singular values, recursing on deflated
297 // subproblems whenever a split occurs. Iteration continues as
298 // long as
299 // (a) there is still matrix left to operate on, and
300 // (b) the number of iterations performed in this batch is
301 // less than n_G.
302 // If/when either of the two above conditions fails to hold,
303 // the function returns.
304 n_deflations = FLA_Bsvd_iteracc_v_ops_var1( m_A11,
305 n_GH,
306 ijTL,
307 tol,
308 thresh,
309 d1, inc_d,
310 e1, inc_e,
311 G, rs_G, cs_G,
312 H, rs_H, cs_H,
313 &n_iter_perf );
314
315 // Record the number of deflations that were observed.
316 total_deflations += n_deflations;
317
318 // Update the maximum number of iterations performed in the
319 // current sweep.
320 n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
321
322 // Store the most recent value of ijBR in m_G_sweep_max.
323 // When the sweep is done, this value will contain the minimum
324 // number of rows of G and H we can apply and safely include all
325 // non-identity rotations that were computed during the
326 // singular value searches.
327 m_GH_sweep_max = ijBR;
328
329 // Make sure we haven't exceeded our maximum iteration count.
330 if ( n_iter_prev >= n_iter_max * min_m_n )
331 {
332 FLA_Abort();
333 //return FLA_FAILURE;
334 }
335 }
336
337 // The sweep is complete. Now we must apply the Givens rotations
338 // that were accumulated during the sweep.
339
340 // Recall that the number of columns of U and V to which we apply
341 // rotations is one more than the number of rotations.
342 n_UV_apply = m_GH_sweep_max + 1;
343
344 // Apply the Givens rotations. Note that we only apply k sets of
345 // rotations, where k = n_iter_perf_sweep_max. Also note that we only
346 // apply to n_UV_apply columns of U and V since this is the most we
347 // need to touch given the most recent value stored to m_GH_sweep_max.
348 //FLA_Apply_G_rf_bls_var5( n_iter_perf_sweep_max,
349 FLA_Apply_G_rf_bls_var3( n_iter_perf_sweep_max,
350 //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
351 //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
352 m_U,
353 n_UV_apply,
354 buff_G, rs_G, cs_G,
355 buff_U, rs_U, cs_U,
356 b_alg );
357 //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
358 FLA_Apply_G_rf_bls_var3( n_iter_perf_sweep_max,
359 //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
360 //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
361 m_V,
362 n_UV_apply,
363 buff_H, rs_H, cs_H,
364 buff_V, rs_V, cs_V,
365 b_alg );
366
367 // Increment the total number of iterations previously performed.
368 n_iter_prev += n_iter_perf_sweep_max;
369
370 }
371
372 // Make all the singular values positive.
373 {
374 int i;
375 float minus_one = bl1_sm1();
376
377 for ( i = 0; i < min_m_n; ++i )
378 {
379 if ( buff_d[ (i )*inc_d ] < rzero )
380 {
381 buff_d[ (i )*inc_d ] = -buff_d[ (i )*inc_d ];
382
383 // Scale the right singular vectors.
384 bl1_sscalv( BLIS1_NO_CONJUGATE,
385 m_V,
386 &minus_one,
387 buff_V + (i )*cs_V, rs_V );
388 }
389 }
390 }
391
392 return n_iter_prev;
393 }
394
395
396
FLA_Bsvd_v_opd_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_H,int rs_H,int cs_H,double * buff_U,int rs_U,int cs_U,double * buff_V,int rs_V,int cs_V,int b_alg)397 FLA_Error FLA_Bsvd_v_opd_var1( int min_m_n,
398 int m_U,
399 int m_V,
400 int n_GH,
401 int n_iter_max,
402 double* buff_d, int inc_d,
403 double* buff_e, int inc_e,
404 dcomplex* buff_G, int rs_G, int cs_G,
405 dcomplex* buff_H, int rs_H, int cs_H,
406 double* buff_U, int rs_U, int cs_U,
407 double* buff_V, int rs_V, int cs_V,
408 int b_alg )
409 {
410 dcomplex one = bl1_z1();
411 double rzero = bl1_d0();
412
413 int maxitr = 6;
414
415 double eps;
416 double tolmul;
417 double tol;
418 double thresh;
419
420 dcomplex* G;
421 dcomplex* H;
422 double* d1;
423 double* e1;
424 int r_val;
425 int done;
426 int m_GH_sweep_max;
427 int ij_begin;
428 int ijTL, ijBR;
429 int m_A11;
430 int n_iter_perf;
431 int n_UV_apply;
432 int total_deflations;
433 int n_deflations;
434 int n_iter_prev;
435 int n_iter_perf_sweep_max;
436
437 // Compute some convergence constants.
438 eps = FLA_Mach_params_opd( FLA_MACH_EPS );
439 tolmul = max( 10.0, min( 100.0, pow( eps, -0.125 ) ) );
440 FLA_Bsvd_compute_tol_thresh_opd( min_m_n,
441 tolmul,
442 maxitr,
443 buff_d, inc_d,
444 buff_e, inc_e,
445 &tol,
446 &thresh );
447 #ifdef PRINTF
448 printf( "FLA_Bsvd_v_opd_var1: tolmul = %12.6e\n", tolmul );
449 printf( "FLA_Bsvd_v_opd_var1: tol = %12.6e\n", tol );
450 printf( "FLA_Bsvd_v_opd_var1: thresh = %12.6e\n", thresh );
451 #endif
452
453 // Initialize our completion flag.
454 done = FALSE;
455
456 // Initialize a counter that holds the maximum number of rows of G
457 // and H that we would need to initialize for the next sweep.
458 m_GH_sweep_max = min_m_n - 1;
459
460 // Initialize a counter for the total number of iterations performed.
461 n_iter_prev = 0;
462
463 // Iterate until the matrix has completely deflated.
464 for ( total_deflations = 0; done != TRUE; )
465 {
466
467 // Initialize G and H to contain only identity rotations.
468 bl1_zsetm( m_GH_sweep_max,
469 n_GH,
470 &one,
471 buff_G, rs_G, cs_G );
472 bl1_zsetm( m_GH_sweep_max,
473 n_GH,
474 &one,
475 buff_H, rs_H, cs_H );
476
477 // Keep track of the maximum number of iterations performed in the
478 // current sweep. This is used when applying the sweep's Givens
479 // rotations.
480 n_iter_perf_sweep_max = 0;
481
482 // Perform a sweep: Move through the matrix and perform a bidiagonal
483 // SVD on each non-zero submatrix that is encountered. During the
484 // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
485 for ( ij_begin = 0; ij_begin < min_m_n; )
486 {
487
488 #ifdef PRINTF
489 if ( ij_begin == 0 )
490 printf( "FLA_Bsvd_v_opd_var1: beginning new sweep (ij_begin = %d)\n", ij_begin );
491 #endif
492
493 // Search for the first submatrix along the diagonal that is
494 // bounded by zeroes (or endpoints of the matrix). If no
495 // submatrix is found (ie: if the entire superdiagonal is zero
496 // then FLA_FAILURE is returned. This function also inspects
497 // superdiagonal elements for proximity to zero. If a given
498 // element is close enough to zero, then it is deemed
499 // converged and manually set to zero.
500 r_val = FLA_Bsvd_find_submatrix_opd( min_m_n,
501 ij_begin,
502 buff_d, inc_d,
503 buff_e, inc_e,
504 &ijTL,
505 &ijBR );
506
507 // Verify that a submatrix was found. If one was not found,
508 // then we are done with the current sweep. Furthermore, if
509 // a submatrix was not found AND we began our search at the
510 // beginning of the matrix (ie: ij_begin == 0), then the
511 // matrix has completely deflated and so we are done with
512 // Francis step iteration.
513 if ( r_val == FLA_FAILURE )
514 {
515 if ( ij_begin == 0 )
516 {
517 #ifdef PRINTF
518 printf( "FLA_Bsvd_v_opd_var1: superdiagonal is completely zero.\n" );
519 printf( "FLA_Bsvd_v_opd_var1: Francis iteration is done!\n" );
520 #endif
521 done = TRUE;
522 }
523
524 // Break out of the current sweep so we can apply the last
525 // remaining Givens rotations.
526 break;
527 }
528
529 // If we got this far, then:
530 // (a) ijTL refers to the index of the first non-zero
531 // superdiagonal along the diagonal, and
532 // (b) ijBR refers to either:
533 // - the first zero element that occurs after ijTL, or
534 // - the the last diagonal element.
535 // Note that ijTL and ijBR also correspond to the first and
536 // last diagonal elements of the submatrix of interest. Thus,
537 // we may compute the dimension of this submatrix as:
538 m_A11 = ijBR - ijTL + 1;
539
540 #ifdef PRINTF
541 printf( "FLA_Bsvd_v_opd_var1: ij_begin = %d\n", ij_begin );
542 printf( "FLA_Bsvd_v_opd_var1: ijTL = %d\n", ijTL );
543 printf( "FLA_Bsvd_v_opd_var1: ijBR = %d\n", ijBR );
544 printf( "FLA_Bsvd_v_opd_var1: m_A11 = %d\n", m_A11 );
545 #endif
546
547 // Adjust ij_begin, which gets us ready for the next submatrix
548 // search in the current sweep.
549 ij_begin = ijBR + 1;
550
551 // Index to the submatrices upon which we will operate.
552 d1 = buff_d + ijTL * inc_d;
553 e1 = buff_e + ijTL * inc_e;
554 G = buff_G + ijTL * rs_G;
555 H = buff_H + ijTL * rs_H;
556
557 // Search for a batch of singular values, recursing on deflated
558 // subproblems whenever a split occurs. Iteration continues as
559 // long as
560 // (a) there is still matrix left to operate on, and
561 // (b) the number of iterations performed in this batch is
562 // less than n_G.
563 // If/when either of the two above conditions fails to hold,
564 // the function returns.
565 n_deflations = FLA_Bsvd_iteracc_v_opd_var1( m_A11,
566 n_GH,
567 ijTL,
568 tol,
569 thresh,
570 d1, inc_d,
571 e1, inc_e,
572 G, rs_G, cs_G,
573 H, rs_H, cs_H,
574 &n_iter_perf );
575
576 // Record the number of deflations that were observed.
577 total_deflations += n_deflations;
578
579 // Update the maximum number of iterations performed in the
580 // current sweep.
581 n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
582
583 #ifdef PRINTF
584 printf( "FLA_Bsvd_v_opd_var1: deflations observed = %d\n", n_deflations );
585 printf( "FLA_Bsvd_v_opd_var1: total deflations observed = %d\n", total_deflations );
586 printf( "FLA_Bsvd_v_opd_var1: num iterations performed = %d\n", n_iter_perf );
587 #endif
588
589 // Store the most recent value of ijBR in m_G_sweep_max.
590 // When the sweep is done, this value will contain the minimum
591 // number of rows of G and H we can apply and safely include all
592 // non-identity rotations that were computed during the
593 // singular value searches.
594 m_GH_sweep_max = ijBR;
595
596 // Make sure we haven't exceeded our maximum iteration count.
597 if ( n_iter_prev >= n_iter_max * min_m_n )
598 {
599 #ifdef PRINTF
600 printf( "FLA_Bsvd_v_opd_var1: reached maximum total number of iterations: %d\n", n_iter_prev );
601 #endif
602 FLA_Abort();
603 //return FLA_FAILURE;
604 }
605 }
606
607 // The sweep is complete. Now we must apply the Givens rotations
608 // that were accumulated during the sweep.
609
610 // Recall that the number of columns of U and V to which we apply
611 // rotations is one more than the number of rotations.
612 n_UV_apply = m_GH_sweep_max + 1;
613
614 #ifdef PRINTF
615 printf( "FLA_Bsvd_v_opd_var1: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max );
616 printf( "FLA_Bsvd_v_opd_var1: m_U = %d\n", m_U );
617 printf( "FLA_Bsvd_v_opd_var1: m_V = %d\n", m_V );
618 printf( "FLA_Bsvd_v_opd_var1: napp= %d\n", n_UV_apply );
619 #endif
620 // Apply the Givens rotations. Note that we only apply k sets of
621 // rotations, where k = n_iter_perf_sweep_max. Also note that we only
622 // apply to n_UV_apply columns of U and V since this is the most we
623 // need to touch given the most recent value stored to m_GH_sweep_max.
624 //FLA_Apply_G_rf_bld_var5( n_iter_perf_sweep_max,
625 FLA_Apply_G_rf_bld_var3( n_iter_perf_sweep_max,
626 //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
627 //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
628 m_U,
629 n_UV_apply,
630 buff_G, rs_G, cs_G,
631 buff_U, rs_U, cs_U,
632 b_alg );
633 //FLA_Apply_G_rf_bld_var5( n_iter_perf_sweep_max,
634 FLA_Apply_G_rf_bld_var3( n_iter_perf_sweep_max,
635 //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
636 //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
637 m_V,
638 n_UV_apply,
639 buff_H, rs_H, cs_H,
640 buff_V, rs_V, cs_V,
641 b_alg );
642
643 // Increment the total number of iterations previously performed.
644 n_iter_prev += n_iter_perf_sweep_max;
645
646 #ifdef PRINTF
647 printf( "FLA_Bsvd_v_opd_var1: total number of iterations performed: %d\n", n_iter_prev );
648 #endif
649 }
650
651 // Make all the singular values positive.
652 {
653 int i;
654 double minus_one = bl1_dm1();
655
656 for ( i = 0; i < min_m_n; ++i )
657 {
658 if ( buff_d[ (i )*inc_d ] < rzero )
659 {
660 buff_d[ (i )*inc_d ] = -buff_d[ (i )*inc_d ];
661
662 // Scale the right singular vectors.
663 bl1_dscalv( BLIS1_NO_CONJUGATE,
664 m_V,
665 &minus_one,
666 buff_V + (i )*cs_V, rs_V );
667 }
668 }
669 }
670
671 return n_iter_prev;
672 }
673
FLA_Bsvd_v_opc_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_H,int rs_H,int cs_H,scomplex * buff_U,int rs_U,int cs_U,scomplex * buff_V,int rs_V,int cs_V,int b_alg)674 FLA_Error FLA_Bsvd_v_opc_var1( int min_m_n,
675 int m_U,
676 int m_V,
677 int n_GH,
678 int n_iter_max,
679 float* buff_d, int inc_d,
680 float* buff_e, int inc_e,
681 scomplex* buff_G, int rs_G, int cs_G,
682 scomplex* buff_H, int rs_H, int cs_H,
683 scomplex* buff_U, int rs_U, int cs_U,
684 scomplex* buff_V, int rs_V, int cs_V,
685 int b_alg )
686 {
687 scomplex one = bl1_c1();
688 float rzero = bl1_s0();
689
690 int maxitr = 6;
691
692 float eps;
693 float tolmul;
694 float tol;
695 float thresh;
696
697 scomplex* G;
698 scomplex* H;
699 float* d1;
700 float* e1;
701 int r_val;
702 int done;
703 int m_GH_sweep_max;
704 int ij_begin;
705 int ijTL, ijBR;
706 int m_A11;
707 int n_iter_perf;
708 int n_UV_apply;
709 int total_deflations;
710 int n_deflations;
711 int n_iter_prev;
712 int n_iter_perf_sweep_max;
713
714 // Compute some convergence constants.
715 eps = FLA_Mach_params_ops( FLA_MACH_EPS );
716 tolmul = max( 10.0F, min( 100.0F, powf( eps, -0.125F ) ) );
717 FLA_Bsvd_compute_tol_thresh_ops( min_m_n,
718 tolmul,
719 maxitr,
720 buff_d, inc_d,
721 buff_e, inc_e,
722 &tol,
723 &thresh );
724
725 // Initialize our completion flag.
726 done = FALSE;
727
728 // Initialize a counter that holds the maximum number of rows of G
729 // and H that we would need to initialize for the next sweep.
730 m_GH_sweep_max = min_m_n - 1;
731
732 // Initialize a counter for the total number of iterations performed.
733 n_iter_prev = 0;
734
735 // Iterate until the matrix has completely deflated.
736 for ( total_deflations = 0; done != TRUE; )
737 {
738
739 // Initialize G and H to contain only identity rotations.
740 bl1_csetm( m_GH_sweep_max,
741 n_GH,
742 &one,
743 buff_G, rs_G, cs_G );
744 bl1_csetm( m_GH_sweep_max,
745 n_GH,
746 &one,
747 buff_H, rs_H, cs_H );
748
749 // Keep track of the maximum number of iterations performed in the
750 // current sweep. This is used when applying the sweep's Givens
751 // rotations.
752 n_iter_perf_sweep_max = 0;
753
754 // Perform a sweep: Move through the matrix and perform a bidiagonal
755 // SVD on each non-zero submatrix that is encountered. During the
756 // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
757 for ( ij_begin = 0; ij_begin < min_m_n; )
758 {
759 // Search for the first submatrix along the diagonal that is
760 // bounded by zeroes (or endpoints of the matrix). If no
761 // submatrix is found (ie: if the entire superdiagonal is zero
762 // then FLA_FAILURE is returned. This function also inspects
763 // superdiagonal elements for proximity to zero. If a given
764 // element is close enough to zero, then it is deemed
765 // converged and manually set to zero.
766 r_val = FLA_Bsvd_find_submatrix_ops( min_m_n,
767 ij_begin,
768 buff_d, inc_d,
769 buff_e, inc_e,
770 &ijTL,
771 &ijBR );
772
773 // Verify that a submatrix was found. If one was not found,
774 // then we are done with the current sweep. Furthermore, if
775 // a submatrix was not found AND we began our search at the
776 // beginning of the matrix (ie: ij_begin == 0), then the
777 // matrix has completely deflated and so we are done with
778 // Francis step iteration.
779 if ( r_val == FLA_FAILURE )
780 {
781 if ( ij_begin == 0 )
782 {
783 done = TRUE;
784 }
785
786 // Break out of the current sweep so we can apply the last
787 // remaining Givens rotations.
788 break;
789 }
790
791 // If we got this far, then:
792 // (a) ijTL refers to the index of the first non-zero
793 // superdiagonal along the diagonal, and
794 // (b) ijBR refers to either:
795 // - the first zero element that occurs after ijTL, or
796 // - the the last diagonal element.
797 // Note that ijTL and ijBR also correspond to the first and
798 // last diagonal elements of the submatrix of interest. Thus,
799 // we may compute the dimension of this submatrix as:
800 m_A11 = ijBR - ijTL + 1;
801
802 // Adjust ij_begin, which gets us ready for the next submatrix
803 // search in the current sweep.
804 ij_begin = ijBR + 1;
805
806 // Index to the submatrices upon which we will operate.
807 d1 = buff_d + ijTL * inc_d;
808 e1 = buff_e + ijTL * inc_e;
809 G = buff_G + ijTL * rs_G;
810 H = buff_H + ijTL * rs_H;
811
812 // Search for a batch of singular values, recursing on deflated
813 // subproblems whenever a split occurs. Iteration continues as
814 // long as
815 // (a) there is still matrix left to operate on, and
816 // (b) the number of iterations performed in this batch is
817 // less than n_G.
818 // If/when either of the two above conditions fails to hold,
819 // the function returns.
820 n_deflations = FLA_Bsvd_iteracc_v_ops_var1( m_A11,
821 n_GH,
822 ijTL,
823 tol,
824 thresh,
825 d1, inc_d,
826 e1, inc_e,
827 G, rs_G, cs_G,
828 H, rs_H, cs_H,
829 &n_iter_perf );
830
831 // Record the number of deflations that were observed.
832 total_deflations += n_deflations;
833
834 // Update the maximum number of iterations performed in the
835 // current sweep.
836 n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
837
838 // Store the most recent value of ijBR in m_G_sweep_max.
839 // When the sweep is done, this value will contain the minimum
840 // number of rows of G and H we can apply and safely include all
841 // non-identity rotations that were computed during the
842 // singular value searches.
843 m_GH_sweep_max = ijBR;
844
845 // Make sure we haven't exceeded our maximum iteration count.
846 if ( n_iter_prev >= n_iter_max * min_m_n )
847 {
848 FLA_Abort();
849 //return FLA_FAILURE;
850 }
851 }
852
853 // The sweep is complete. Now we must apply the Givens rotations
854 // that were accumulated during the sweep.
855
856 // Recall that the number of columns of U and V to which we apply
857 // rotations is one more than the number of rotations.
858 n_UV_apply = m_GH_sweep_max + 1;
859
860 // Apply the Givens rotations. Note that we only apply k sets of
861 // rotations, where k = n_iter_perf_sweep_max. Also note that we only
862 // apply to n_UV_apply columns of U and V since this is the most we
863 // need to touch given the most recent value stored to m_GH_sweep_max.
864 //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
865 FLA_Apply_G_rf_blc_var3( n_iter_perf_sweep_max,
866 //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
867 //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
868 m_U,
869 n_UV_apply,
870 buff_G, rs_G, cs_G,
871 buff_U, rs_U, cs_U,
872 b_alg );
873 //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
874 FLA_Apply_G_rf_blc_var3( n_iter_perf_sweep_max,
875 //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
876 //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
877 m_V,
878 n_UV_apply,
879 buff_H, rs_H, cs_H,
880 buff_V, rs_V, cs_V,
881 b_alg );
882
883 // Increment the total number of iterations previously performed.
884 n_iter_prev += n_iter_perf_sweep_max;
885 }
886
887 // Make all the singular values positive.
888 {
889 int i;
890 float minus_one = bl1_sm1();
891
892 for ( i = 0; i < min_m_n; ++i )
893 {
894 if ( buff_d[ (i )*inc_d ] < rzero )
895 {
896 buff_d[ (i )*inc_d ] = -buff_d[ (i )*inc_d ];
897
898 // Scale the right singular vectors.
899 bl1_csscalv( BLIS1_NO_CONJUGATE,
900 m_V,
901 &minus_one,
902 buff_V + (i )*cs_V, rs_V );
903 }
904 }
905 }
906
907 return FLA_SUCCESS;
908 }
909
910 //#define PRINTF
911
FLA_Bsvd_v_opz_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_H,int rs_H,int cs_H,dcomplex * buff_U,int rs_U,int cs_U,dcomplex * buff_V,int rs_V,int cs_V,int b_alg)912 FLA_Error FLA_Bsvd_v_opz_var1( int min_m_n,
913 int m_U,
914 int m_V,
915 int n_GH,
916 int n_iter_max,
917 double* buff_d, int inc_d,
918 double* buff_e, int inc_e,
919 dcomplex* buff_G, int rs_G, int cs_G,
920 dcomplex* buff_H, int rs_H, int cs_H,
921 dcomplex* buff_U, int rs_U, int cs_U,
922 dcomplex* buff_V, int rs_V, int cs_V,
923 int b_alg )
924 {
925 dcomplex one = bl1_z1();
926 double rzero = bl1_d0();
927
928 int maxitr = 6;
929
930 double eps;
931 double tolmul;
932 double tol;
933 double thresh;
934
935 dcomplex* G;
936 dcomplex* H;
937 double* d1;
938 double* e1;
939 int r_val;
940 int done;
941 int m_GH_sweep_max;
942 int ij_begin;
943 int ijTL, ijBR;
944 int m_A11;
945 int n_iter_perf;
946 int n_UV_apply;
947 int total_deflations;
948 int n_deflations;
949 int n_iter_prev;
950 int n_iter_perf_sweep_max;
951
952 // Compute some convergence constants.
953 eps = FLA_Mach_params_opd( FLA_MACH_EPS );
954 tolmul = max( 10.0, min( 100.0, pow( eps, -0.125 ) ) );
955 FLA_Bsvd_compute_tol_thresh_opd( min_m_n,
956 tolmul,
957 maxitr,
958 buff_d, inc_d,
959 buff_e, inc_e,
960 &tol,
961 &thresh );
962 #ifdef PRINTF
963 printf( "FLA_Bsvd_v_opz_var1: tolmul = %12.6e\n", tolmul );
964 printf( "FLA_Bsvd_v_opz_var1: tol = %12.6e\n", tol );
965 printf( "FLA_Bsvd_v_opz_var1: thresh = %12.6e\n", thresh );
966 #endif
967
968 // Initialize our completion flag.
969 done = FALSE;
970
971 // Initialize a counter that holds the maximum number of rows of G
972 // and H that we would need to initialize for the next sweep.
973 m_GH_sweep_max = min_m_n - 1;
974
975 // Initialize a counter for the total number of iterations performed.
976 n_iter_prev = 0;
977
978 // Iterate until the matrix has completely deflated.
979 for ( total_deflations = 0; done != TRUE; )
980 {
981
982 // Initialize G and H to contain only identity rotations.
983 bl1_zsetm( m_GH_sweep_max,
984 n_GH,
985 &one,
986 buff_G, rs_G, cs_G );
987 bl1_zsetm( m_GH_sweep_max,
988 n_GH,
989 &one,
990 buff_H, rs_H, cs_H );
991
992 // Keep track of the maximum number of iterations performed in the
993 // current sweep. This is used when applying the sweep's Givens
994 // rotations.
995 n_iter_perf_sweep_max = 0;
996
997 // Perform a sweep: Move through the matrix and perform a bidiagonal
998 // SVD on each non-zero submatrix that is encountered. During the
999 // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
1000 for ( ij_begin = 0; ij_begin < min_m_n; )
1001 {
1002
1003 #ifdef PRINTF
1004 if ( ij_begin == 0 )
1005 printf( "FLA_Bsvd_v_opz_var1: beginning new sweep (ij_begin = %d)\n", ij_begin );
1006 #endif
1007
1008 // Search for the first submatrix along the diagonal that is
1009 // bounded by zeroes (or endpoints of the matrix). If no
1010 // submatrix is found (ie: if the entire superdiagonal is zero
1011 // then FLA_FAILURE is returned. This function also inspects
1012 // superdiagonal elements for proximity to zero. If a given
1013 // element is close enough to zero, then it is deemed
1014 // converged and manually set to zero.
1015 r_val = FLA_Bsvd_find_submatrix_opd( min_m_n,
1016 ij_begin,
1017 buff_d, inc_d,
1018 buff_e, inc_e,
1019 &ijTL,
1020 &ijBR );
1021
1022 // Verify that a submatrix was found. If one was not found,
1023 // then we are done with the current sweep. Furthermore, if
1024 // a submatrix was not found AND we began our search at the
1025 // beginning of the matrix (ie: ij_begin == 0), then the
1026 // matrix has completely deflated and so we are done with
1027 // Francis step iteration.
1028 if ( r_val == FLA_FAILURE )
1029 {
1030 if ( ij_begin == 0 )
1031 {
1032 #ifdef PRINTF
1033 printf( "FLA_Bsvd_v_opz_var1: superdiagonal is completely zero.\n" );
1034 printf( "FLA_Bsvd_v_opz_var1: Francis iteration is done!\n" );
1035 #endif
1036 done = TRUE;
1037 }
1038
1039 // Break out of the current sweep so we can apply the last
1040 // remaining Givens rotations.
1041 break;
1042 }
1043
1044 // If we got this far, then:
1045 // (a) ijTL refers to the index of the first non-zero
1046 // superdiagonal along the diagonal, and
1047 // (b) ijBR refers to either:
1048 // - the first zero element that occurs after ijTL, or
1049 // - the the last diagonal element.
1050 // Note that ijTL and ijBR also correspond to the first and
1051 // last diagonal elements of the submatrix of interest. Thus,
1052 // we may compute the dimension of this submatrix as:
1053 m_A11 = ijBR - ijTL + 1;
1054
1055 #ifdef PRINTF
1056 printf( "FLA_Bsvd_v_opz_var1: ij_begin = %d\n", ij_begin );
1057 printf( "FLA_Bsvd_v_opz_var1: ijTL = %d\n", ijTL );
1058 printf( "FLA_Bsvd_v_opz_var1: ijBR = %d\n", ijBR );
1059 printf( "FLA_Bsvd_v_opz_var1: m_A11 = %d\n", m_A11 );
1060 #endif
1061
1062 // Adjust ij_begin, which gets us ready for the next submatrix
1063 // search in the current sweep.
1064 ij_begin = ijBR + 1;
1065
1066 // Index to the submatrices upon which we will operate.
1067 d1 = buff_d + ijTL * inc_d;
1068 e1 = buff_e + ijTL * inc_e;
1069 G = buff_G + ijTL * rs_G;
1070 H = buff_H + ijTL * rs_H;
1071
1072 // Search for a batch of singular values, recursing on deflated
1073 // subproblems whenever a split occurs. Iteration continues as
1074 // long as
1075 // (a) there is still matrix left to operate on, and
1076 // (b) the number of iterations performed in this batch is
1077 // less than n_G.
1078 // If/when either of the two above conditions fails to hold,
1079 // the function returns.
1080 n_deflations = FLA_Bsvd_iteracc_v_opd_var1( m_A11,
1081 n_GH,
1082 ijTL,
1083 tol,
1084 thresh,
1085 d1, inc_d,
1086 e1, inc_e,
1087 G, rs_G, cs_G,
1088 H, rs_H, cs_H,
1089 &n_iter_perf );
1090
1091 // Record the number of deflations that were observed.
1092 total_deflations += n_deflations;
1093
1094 // Update the maximum number of iterations performed in the
1095 // current sweep.
1096 n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
1097
1098 #ifdef PRINTF
1099 printf( "FLA_Bsvd_v_opz_var1: deflations observed = %d\n", n_deflations );
1100 printf( "FLA_Bsvd_v_opz_var1: total deflations observed = %d\n", total_deflations );
1101 printf( "FLA_Bsvd_v_opz_var1: num iterations performed = %d\n", n_iter_perf );
1102 #endif
1103
1104 // Store the most recent value of ijBR in m_G_sweep_max.
1105 // When the sweep is done, this value will contain the minimum
1106 // number of rows of G and H we can apply and safely include all
1107 // non-identity rotations that were computed during the
1108 // singular value searches.
1109 m_GH_sweep_max = ijBR;
1110
1111 // Make sure we haven't exceeded our maximum iteration count.
1112 if ( n_iter_prev >= n_iter_max * min_m_n )
1113 {
1114 #ifdef PRINTF
1115 printf( "FLA_Bsvd_v_opz_var1: reached maximum total number of iterations: %d\n", n_iter_prev );
1116 #endif
1117 FLA_Abort();
1118 //return FLA_FAILURE;
1119 }
1120 }
1121
1122 // The sweep is complete. Now we must apply the Givens rotations
1123 // that were accumulated during the sweep.
1124
1125 // Recall that the number of columns of U and V to which we apply
1126 // rotations is one more than the number of rotations.
1127 n_UV_apply = m_GH_sweep_max + 1;
1128
1129 #ifdef PRINTF
1130 printf( "FLA_Bsvd_v_opz_var1: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max );
1131 printf( "FLA_Bsvd_v_opz_var1: m_U = %d\n", m_U );
1132 printf( "FLA_Bsvd_v_opz_var1: m_V = %d\n", m_V );
1133 printf( "FLA_Bsvd_v_opz_var1: napp= %d\n", n_UV_apply );
1134 #endif
1135 // Apply the Givens rotations. Note that we only apply k sets of
1136 // rotations, where k = n_iter_perf_sweep_max. Also note that we only
1137 // apply to n_UV_apply columns of U and V since this is the most we
1138 // need to touch given the most recent value stored to m_GH_sweep_max.
1139 //FLA_Apply_G_rf_blz_var5( n_iter_perf_sweep_max,
1140 FLA_Apply_G_rf_blz_var3( n_iter_perf_sweep_max,
1141 //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
1142 //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
1143 m_U,
1144 n_UV_apply,
1145 buff_G, rs_G, cs_G,
1146 buff_U, rs_U, cs_U,
1147 b_alg );
1148 //FLA_Apply_G_rf_blz_var5( n_iter_perf_sweep_max,
1149 FLA_Apply_G_rf_blz_var3( n_iter_perf_sweep_max,
1150 //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
1151 //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
1152 m_V,
1153 n_UV_apply,
1154 buff_H, rs_H, cs_H,
1155 buff_V, rs_V, cs_V,
1156 b_alg );
1157
1158 // Increment the total number of iterations previously performed.
1159 n_iter_prev += n_iter_perf_sweep_max;
1160
1161 #ifdef PRINTF
1162 printf( "FLA_Bsvd_v_opz_var1: total number of iterations performed: %d\n", n_iter_prev );
1163 #endif
1164 }
1165
1166 // Make all the singular values positive.
1167 {
1168 int i;
1169 double minus_one = bl1_dm1();
1170
1171 for ( i = 0; i < min_m_n; ++i )
1172 {
1173 if ( buff_d[ (i )*inc_d ] < rzero )
1174 {
1175 buff_d[ (i )*inc_d ] = -buff_d[ (i )*inc_d ];
1176
1177 // Scale the right singular vectors.
1178 bl1_zdscalv( BLIS1_NO_CONJUGATE,
1179 m_V,
1180 &minus_one,
1181 buff_V + (i )*cs_V, rs_V );
1182 }
1183 }
1184 }
1185
1186 return n_iter_prev;
1187 }
1188
1189