1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 // Note that this operation is designed for tall rectangular matrix A.
14 // If m_A < n_A then, U and V should be swapped before entering this function.
FLA_Bsvd_v_opt_var1(dim_t n_iter_max,FLA_Obj d,FLA_Obj e,FLA_Obj G,FLA_Obj H,FLA_Obj U,FLA_Obj V,dim_t b_alg)15 FLA_Error FLA_Bsvd_v_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, FLA_Obj U, FLA_Obj V, dim_t b_alg )
16 {
17     FLA_Error    r_val = FLA_SUCCESS;
18     FLA_Datatype datatype;
19     int          m_U, m_V, n_GH;
20     int          inc_d;
21     int          inc_e;
22     int          rs_G, cs_G;
23     int          rs_H, cs_H;
24     int          rs_U, cs_U;
25     int          rs_V, cs_V;
26 
27     datatype = FLA_Obj_datatype( U );
28 
29     m_U        = FLA_Obj_length( U );
30     m_V        = FLA_Obj_length( V );
31     n_GH       = FLA_Obj_width( G );
32 
33     inc_d      = FLA_Obj_vector_inc( d );
34     inc_e      = FLA_Obj_vector_inc( e );
35 
36     rs_G       = FLA_Obj_row_stride( G );
37     cs_G       = FLA_Obj_col_stride( G );
38 
39     rs_H       = FLA_Obj_row_stride( H );
40     cs_H       = FLA_Obj_col_stride( H );
41 
42     rs_U       = FLA_Obj_row_stride( U );
43     cs_U       = FLA_Obj_col_stride( U );
44 
45     rs_V       = FLA_Obj_row_stride( V );
46     cs_V       = FLA_Obj_col_stride( V );
47 
48 
49     switch ( datatype )
50     {
51     case FLA_FLOAT:
52     {
53         float*    buff_d = FLA_FLOAT_PTR( d );
54         float*    buff_e = FLA_FLOAT_PTR( e );
55         scomplex* buff_G = FLA_COMPLEX_PTR( G );
56         scomplex* buff_H = FLA_COMPLEX_PTR( H );
57         float*    buff_U = FLA_FLOAT_PTR( U );
58         float*    buff_V = FLA_FLOAT_PTR( V );
59 
60         r_val = FLA_Bsvd_v_ops_var1( min( m_U, m_V ),
61                                      m_U,
62                                      m_V,
63                                      n_GH,
64                                      n_iter_max,
65                                      buff_d, inc_d,
66                                      buff_e, inc_e,
67                                      buff_G, rs_G, cs_G,
68                                      buff_H, rs_H, cs_H,
69                                      buff_U, rs_U, cs_U,
70                                      buff_V, rs_V, cs_V,
71                                      b_alg );
72 
73         break;
74     }
75 
76     case FLA_DOUBLE:
77     {
78         double*   buff_d = FLA_DOUBLE_PTR( d );
79         double*   buff_e = FLA_DOUBLE_PTR( e );
80         dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
81         dcomplex* buff_H = FLA_DOUBLE_COMPLEX_PTR( H );
82         double*   buff_U = FLA_DOUBLE_PTR( U );
83         double*   buff_V = FLA_DOUBLE_PTR( V );
84 
85         r_val = FLA_Bsvd_v_opd_var1( min( m_U, m_V ),
86                                      m_U,
87                                      m_V,
88                                      n_GH,
89                                      n_iter_max,
90                                      buff_d, inc_d,
91                                      buff_e, inc_e,
92                                      buff_G, rs_G, cs_G,
93                                      buff_H, rs_H, cs_H,
94                                      buff_U, rs_U, cs_U,
95                                      buff_V, rs_V, cs_V,
96                                      b_alg );
97 
98         break;
99     }
100 
101     case FLA_COMPLEX:
102     {
103         float*    buff_d = FLA_FLOAT_PTR( d );
104         float*    buff_e = FLA_FLOAT_PTR( e );
105         scomplex* buff_G = FLA_COMPLEX_PTR( G );
106         scomplex* buff_H = FLA_COMPLEX_PTR( H );
107         scomplex* buff_U = FLA_COMPLEX_PTR( U );
108         scomplex* buff_V = FLA_COMPLEX_PTR( V );
109 
110         r_val = FLA_Bsvd_v_opc_var1( min( m_U, m_V ),
111                                      m_U,
112                                      m_V,
113                                      n_GH,
114                                      n_iter_max,
115                                      buff_d, inc_d,
116                                      buff_e, inc_e,
117                                      buff_G, rs_G, cs_G,
118                                      buff_H, rs_H, cs_H,
119                                      buff_U, rs_U, cs_U,
120                                      buff_V, rs_V, cs_V,
121                                      b_alg );
122 
123         break;
124     }
125 
126     case FLA_DOUBLE_COMPLEX:
127     {
128         double*   buff_d = FLA_DOUBLE_PTR( d );
129         double*   buff_e = FLA_DOUBLE_PTR( e );
130         dcomplex* buff_G = FLA_DOUBLE_COMPLEX_PTR( G );
131         dcomplex* buff_H = FLA_DOUBLE_COMPLEX_PTR( H );
132         dcomplex* buff_U = FLA_DOUBLE_COMPLEX_PTR( U );
133         dcomplex* buff_V = FLA_DOUBLE_COMPLEX_PTR( V );
134 
135         r_val = FLA_Bsvd_v_opz_var1( min( m_U, m_V ),
136                                      m_U,
137                                      m_V,
138                                      n_GH,
139                                      n_iter_max,
140                                      buff_d, inc_d,
141                                      buff_e, inc_e,
142                                      buff_G, rs_G, cs_G,
143                                      buff_H, rs_H, cs_H,
144                                      buff_U, rs_U, cs_U,
145                                      buff_V, rs_V, cs_V,
146                                      b_alg );
147 
148         break;
149     }
150     }
151 
152     return r_val;
153 }
154 
155 
156 
FLA_Bsvd_v_ops_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_H,int rs_H,int cs_H,float * buff_U,int rs_U,int cs_U,float * buff_V,int rs_V,int cs_V,int b_alg)157 FLA_Error FLA_Bsvd_v_ops_var1( int       min_m_n,
158                                int       m_U,
159                                int       m_V,
160                                int       n_GH,
161                                int       n_iter_max,
162                                float*    buff_d, int inc_d,
163                                float*    buff_e, int inc_e,
164                                scomplex* buff_G, int rs_G, int cs_G,
165                                scomplex* buff_H, int rs_H, int cs_H,
166                                float*    buff_U, int rs_U, int cs_U,
167                                float*    buff_V, int rs_V, int cs_V,
168                                int       b_alg )
169 {
170     scomplex  one        = bl1_c1();
171     float     rzero      = bl1_s0();
172 
173     int       maxitr     = 6;
174 
175     float     eps;
176     float     tolmul;
177     float     tol;
178     float     thresh;
179 
180     scomplex* G;
181     scomplex* H;
182     float*    d1;
183     float*    e1;
184     int       r_val;
185     int       done;
186     int       m_GH_sweep_max;
187     int       ij_begin;
188     int       ijTL, ijBR;
189     int       m_A11;
190     int       n_iter_perf;
191     int       n_UV_apply;
192     int       total_deflations;
193     int       n_deflations;
194     int       n_iter_prev;
195     int       n_iter_perf_sweep_max;
196 
197     // Compute some convergence constants.
198     eps    = FLA_Mach_params_ops( FLA_MACH_EPS );
199     tolmul = max( 10.0F, min( 100.0F, powf( eps, -0.125F ) ) );
200     FLA_Bsvd_compute_tol_thresh_ops( min_m_n,
201                                      tolmul,
202                                      maxitr,
203                                      buff_d, inc_d,
204                                      buff_e, inc_e,
205                                      &tol,
206                                      &thresh );
207 
208     // Initialize our completion flag.
209     done = FALSE;
210 
211     // Initialize a counter that holds the maximum number of rows of G
212     // and H that we would need to initialize for the next sweep.
213     m_GH_sweep_max = min_m_n - 1;
214 
215     // Initialize a counter for the total number of iterations performed.
216     n_iter_prev = 0;
217 
218     // Iterate until the matrix has completely deflated.
219     for ( total_deflations = 0; done != TRUE; )
220     {
221 
222         // Initialize G and H to contain only identity rotations.
223         bl1_csetm( m_GH_sweep_max,
224                    n_GH,
225                    &one,
226                    buff_G, rs_G, cs_G );
227         bl1_csetm( m_GH_sweep_max,
228                    n_GH,
229                    &one,
230                    buff_H, rs_H, cs_H );
231 
232         // Keep track of the maximum number of iterations performed in the
233         // current sweep. This is used when applying the sweep's Givens
234         // rotations.
235         n_iter_perf_sweep_max = 0;
236 
237         // Perform a sweep: Move through the matrix and perform a bidiagonal
238         // SVD on each non-zero submatrix that is encountered. During the
239         // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
240         for ( ij_begin = 0; ij_begin < min_m_n;  )
241         {
242 
243             // Search for the first submatrix along the diagonal that is
244             // bounded by zeroes (or endpoints of the matrix). If no
245             // submatrix is found (ie: if the entire superdiagonal is zero
246             // then FLA_FAILURE is returned. This function also inspects
247             // superdiagonal elements for proximity to zero. If a given
248             // element is close enough to zero, then it is deemed
249             // converged and manually set to zero.
250             r_val = FLA_Bsvd_find_submatrix_ops( min_m_n,
251                                                  ij_begin,
252                                                  buff_d, inc_d,
253                                                  buff_e, inc_e,
254                                                  &ijTL,
255                                                  &ijBR );
256 
257             // Verify that a submatrix was found. If one was not found,
258             // then we are done with the current sweep. Furthermore, if
259             // a submatrix was not found AND we began our search at the
260             // beginning of the matrix (ie: ij_begin == 0), then the
261             // matrix has completely deflated and so we are done with
262             // Francis step iteration.
263             if ( r_val == FLA_FAILURE )
264             {
265                 if ( ij_begin == 0 )
266                 {
267                     done = TRUE;
268                 }
269 
270                 // Break out of the current sweep so we can apply the last
271                 // remaining Givens rotations.
272                 break;
273             }
274 
275             // If we got this far, then:
276             //   (a) ijTL refers to the index of the first non-zero
277             //       superdiagonal along the diagonal, and
278             //   (b) ijBR refers to either:
279             //       - the first zero element that occurs after ijTL, or
280             //       - the the last diagonal element.
281             // Note that ijTL and ijBR also correspond to the first and
282             // last diagonal elements of the submatrix of interest. Thus,
283             // we may compute the dimension of this submatrix as:
284             m_A11 = ijBR - ijTL + 1;
285 
286             // Adjust ij_begin, which gets us ready for the next submatrix
287             // search in the current sweep.
288             ij_begin = ijBR + 1;
289 
290             // Index to the submatrices upon which we will operate.
291             d1 = buff_d + ijTL * inc_d;
292             e1 = buff_e + ijTL * inc_e;
293             G  = buff_G + ijTL * rs_G;
294             H  = buff_H + ijTL * rs_H;
295 
296             // Search for a batch of singular values, recursing on deflated
297             // subproblems whenever a split occurs. Iteration continues as
298             // long as
299             //   (a) there is still matrix left to operate on, and
300             //   (b) the number of iterations performed in this batch is
301             //       less than n_G.
302             // If/when either of the two above conditions fails to hold,
303             // the function returns.
304             n_deflations = FLA_Bsvd_iteracc_v_ops_var1( m_A11,
305                                                         n_GH,
306                                                         ijTL,
307                                                         tol,
308                                                         thresh,
309                                                         d1, inc_d,
310                                                         e1, inc_e,
311                                                         G,  rs_G, cs_G,
312                                                         H,  rs_H, cs_H,
313                                                         &n_iter_perf );
314 
315             // Record the number of deflations that were observed.
316             total_deflations += n_deflations;
317 
318             // Update the maximum number of iterations performed in the
319             // current sweep.
320             n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
321 
322             // Store the most recent value of ijBR in m_G_sweep_max.
323             // When the sweep is done, this value will contain the minimum
324             // number of rows of G and H we can apply and safely include all
325             // non-identity rotations that were computed during the
326             // singular value searches.
327             m_GH_sweep_max = ijBR;
328 
329             // Make sure we haven't exceeded our maximum iteration count.
330             if ( n_iter_prev >= n_iter_max * min_m_n )
331             {
332                 FLA_Abort();
333                 //return FLA_FAILURE;
334             }
335         }
336 
337         // The sweep is complete. Now we must apply the Givens rotations
338         // that were accumulated during the sweep.
339 
340         // Recall that the number of columns of U and V to which we apply
341         // rotations is one more than the number of rotations.
342         n_UV_apply = m_GH_sweep_max + 1;
343 
344         // Apply the Givens rotations. Note that we only apply k sets of
345         // rotations, where k = n_iter_perf_sweep_max. Also note that we only
346         // apply to n_UV_apply columns of U and V since this is the most we
347         // need to touch given the most recent value stored to m_GH_sweep_max.
348         //FLA_Apply_G_rf_bls_var5( n_iter_perf_sweep_max,
349         FLA_Apply_G_rf_bls_var3( n_iter_perf_sweep_max,
350                                  //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
351                                  //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
352                                  m_U,
353                                  n_UV_apply,
354                                  buff_G, rs_G, cs_G,
355                                  buff_U, rs_U, cs_U,
356                                  b_alg );
357         //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
358         FLA_Apply_G_rf_bls_var3( n_iter_perf_sweep_max,
359                                  //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
360                                  //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
361                                  m_V,
362                                  n_UV_apply,
363                                  buff_H, rs_H, cs_H,
364                                  buff_V, rs_V, cs_V,
365                                  b_alg );
366 
367         // Increment the total number of iterations previously performed.
368         n_iter_prev += n_iter_perf_sweep_max;
369 
370     }
371 
372     // Make all the singular values positive.
373     {
374         int    i;
375         float  minus_one = bl1_sm1();
376 
377         for ( i = 0; i < min_m_n; ++i )
378         {
379             if ( buff_d[ (i  )*inc_d ] < rzero )
380             {
381                 buff_d[ (i  )*inc_d ] = -buff_d[ (i  )*inc_d ];
382 
383                 // Scale the right singular vectors.
384                 bl1_sscalv( BLIS1_NO_CONJUGATE,
385                             m_V,
386                             &minus_one,
387                             buff_V + (i  )*cs_V, rs_V );
388             }
389         }
390     }
391 
392     return n_iter_prev;
393 }
394 
395 
396 
FLA_Bsvd_v_opd_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_H,int rs_H,int cs_H,double * buff_U,int rs_U,int cs_U,double * buff_V,int rs_V,int cs_V,int b_alg)397 FLA_Error FLA_Bsvd_v_opd_var1( int       min_m_n,
398                                int       m_U,
399                                int       m_V,
400                                int       n_GH,
401                                int       n_iter_max,
402                                double*   buff_d, int inc_d,
403                                double*   buff_e, int inc_e,
404                                dcomplex* buff_G, int rs_G, int cs_G,
405                                dcomplex* buff_H, int rs_H, int cs_H,
406                                double*   buff_U, int rs_U, int cs_U,
407                                double*   buff_V, int rs_V, int cs_V,
408                                int       b_alg )
409 {
410     dcomplex  one        = bl1_z1();
411     double    rzero      = bl1_d0();
412 
413     int       maxitr     = 6;
414 
415     double    eps;
416     double    tolmul;
417     double    tol;
418     double    thresh;
419 
420     dcomplex* G;
421     dcomplex* H;
422     double*   d1;
423     double*   e1;
424     int       r_val;
425     int       done;
426     int       m_GH_sweep_max;
427     int       ij_begin;
428     int       ijTL, ijBR;
429     int       m_A11;
430     int       n_iter_perf;
431     int       n_UV_apply;
432     int       total_deflations;
433     int       n_deflations;
434     int       n_iter_prev;
435     int       n_iter_perf_sweep_max;
436 
437     // Compute some convergence constants.
438     eps    = FLA_Mach_params_opd( FLA_MACH_EPS );
439     tolmul = max( 10.0, min( 100.0, pow( eps, -0.125 ) ) );
440     FLA_Bsvd_compute_tol_thresh_opd( min_m_n,
441                                      tolmul,
442                                      maxitr,
443                                      buff_d, inc_d,
444                                      buff_e, inc_e,
445                                      &tol,
446                                      &thresh );
447 #ifdef PRINTF
448     printf( "FLA_Bsvd_v_opd_var1: tolmul = %12.6e\n", tolmul );
449     printf( "FLA_Bsvd_v_opd_var1: tol    = %12.6e\n", tol );
450     printf( "FLA_Bsvd_v_opd_var1: thresh = %12.6e\n", thresh );
451 #endif
452 
453     // Initialize our completion flag.
454     done = FALSE;
455 
456     // Initialize a counter that holds the maximum number of rows of G
457     // and H that we would need to initialize for the next sweep.
458     m_GH_sweep_max = min_m_n - 1;
459 
460     // Initialize a counter for the total number of iterations performed.
461     n_iter_prev = 0;
462 
463     // Iterate until the matrix has completely deflated.
464     for ( total_deflations = 0; done != TRUE; )
465     {
466 
467         // Initialize G and H to contain only identity rotations.
468         bl1_zsetm( m_GH_sweep_max,
469                    n_GH,
470                    &one,
471                    buff_G, rs_G, cs_G );
472         bl1_zsetm( m_GH_sweep_max,
473                    n_GH,
474                    &one,
475                    buff_H, rs_H, cs_H );
476 
477         // Keep track of the maximum number of iterations performed in the
478         // current sweep. This is used when applying the sweep's Givens
479         // rotations.
480         n_iter_perf_sweep_max = 0;
481 
482         // Perform a sweep: Move through the matrix and perform a bidiagonal
483         // SVD on each non-zero submatrix that is encountered. During the
484         // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
485         for ( ij_begin = 0; ij_begin < min_m_n;  )
486         {
487 
488 #ifdef PRINTF
489             if ( ij_begin == 0 )
490                 printf( "FLA_Bsvd_v_opd_var1: beginning new sweep (ij_begin = %d)\n", ij_begin );
491 #endif
492 
493             // Search for the first submatrix along the diagonal that is
494             // bounded by zeroes (or endpoints of the matrix). If no
495             // submatrix is found (ie: if the entire superdiagonal is zero
496             // then FLA_FAILURE is returned. This function also inspects
497             // superdiagonal elements for proximity to zero. If a given
498             // element is close enough to zero, then it is deemed
499             // converged and manually set to zero.
500             r_val = FLA_Bsvd_find_submatrix_opd( min_m_n,
501                                                  ij_begin,
502                                                  buff_d, inc_d,
503                                                  buff_e, inc_e,
504                                                  &ijTL,
505                                                  &ijBR );
506 
507             // Verify that a submatrix was found. If one was not found,
508             // then we are done with the current sweep. Furthermore, if
509             // a submatrix was not found AND we began our search at the
510             // beginning of the matrix (ie: ij_begin == 0), then the
511             // matrix has completely deflated and so we are done with
512             // Francis step iteration.
513             if ( r_val == FLA_FAILURE )
514             {
515                 if ( ij_begin == 0 )
516                 {
517 #ifdef PRINTF
518                     printf( "FLA_Bsvd_v_opd_var1: superdiagonal is completely zero.\n" );
519                     printf( "FLA_Bsvd_v_opd_var1: Francis iteration is done!\n" );
520 #endif
521                     done = TRUE;
522                 }
523 
524                 // Break out of the current sweep so we can apply the last
525                 // remaining Givens rotations.
526                 break;
527             }
528 
529             // If we got this far, then:
530             //   (a) ijTL refers to the index of the first non-zero
531             //       superdiagonal along the diagonal, and
532             //   (b) ijBR refers to either:
533             //       - the first zero element that occurs after ijTL, or
534             //       - the the last diagonal element.
535             // Note that ijTL and ijBR also correspond to the first and
536             // last diagonal elements of the submatrix of interest. Thus,
537             // we may compute the dimension of this submatrix as:
538             m_A11 = ijBR - ijTL + 1;
539 
540 #ifdef PRINTF
541             printf( "FLA_Bsvd_v_opd_var1: ij_begin = %d\n", ij_begin );
542             printf( "FLA_Bsvd_v_opd_var1: ijTL     = %d\n", ijTL );
543             printf( "FLA_Bsvd_v_opd_var1: ijBR     = %d\n", ijBR );
544             printf( "FLA_Bsvd_v_opd_var1: m_A11    = %d\n", m_A11 );
545 #endif
546 
547             // Adjust ij_begin, which gets us ready for the next submatrix
548             // search in the current sweep.
549             ij_begin = ijBR + 1;
550 
551             // Index to the submatrices upon which we will operate.
552             d1 = buff_d + ijTL * inc_d;
553             e1 = buff_e + ijTL * inc_e;
554             G  = buff_G + ijTL * rs_G;
555             H  = buff_H + ijTL * rs_H;
556 
557             // Search for a batch of singular values, recursing on deflated
558             // subproblems whenever a split occurs. Iteration continues as
559             // long as
560             //   (a) there is still matrix left to operate on, and
561             //   (b) the number of iterations performed in this batch is
562             //       less than n_G.
563             // If/when either of the two above conditions fails to hold,
564             // the function returns.
565             n_deflations = FLA_Bsvd_iteracc_v_opd_var1( m_A11,
566                            n_GH,
567                            ijTL,
568                            tol,
569                            thresh,
570                            d1, inc_d,
571                            e1, inc_e,
572                            G,  rs_G, cs_G,
573                            H,  rs_H, cs_H,
574                            &n_iter_perf );
575 
576             // Record the number of deflations that were observed.
577             total_deflations += n_deflations;
578 
579             // Update the maximum number of iterations performed in the
580             // current sweep.
581             n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
582 
583 #ifdef PRINTF
584             printf( "FLA_Bsvd_v_opd_var1: deflations observed       = %d\n", n_deflations );
585             printf( "FLA_Bsvd_v_opd_var1: total deflations observed = %d\n", total_deflations );
586             printf( "FLA_Bsvd_v_opd_var1: num iterations performed  = %d\n", n_iter_perf );
587 #endif
588 
589             // Store the most recent value of ijBR in m_G_sweep_max.
590             // When the sweep is done, this value will contain the minimum
591             // number of rows of G and H we can apply and safely include all
592             // non-identity rotations that were computed during the
593             // singular value searches.
594             m_GH_sweep_max = ijBR;
595 
596             // Make sure we haven't exceeded our maximum iteration count.
597             if ( n_iter_prev >= n_iter_max * min_m_n )
598             {
599 #ifdef PRINTF
600                 printf( "FLA_Bsvd_v_opd_var1: reached maximum total number of iterations: %d\n", n_iter_prev );
601 #endif
602                 FLA_Abort();
603                 //return FLA_FAILURE;
604             }
605         }
606 
607         // The sweep is complete. Now we must apply the Givens rotations
608         // that were accumulated during the sweep.
609 
610         // Recall that the number of columns of U and V to which we apply
611         // rotations is one more than the number of rotations.
612         n_UV_apply = m_GH_sweep_max + 1;
613 
614 #ifdef PRINTF
615         printf( "FLA_Bsvd_v_opd_var1: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max );
616         printf( "FLA_Bsvd_v_opd_var1: m_U = %d\n", m_U );
617         printf( "FLA_Bsvd_v_opd_var1: m_V = %d\n", m_V );
618         printf( "FLA_Bsvd_v_opd_var1: napp= %d\n", n_UV_apply );
619 #endif
620         // Apply the Givens rotations. Note that we only apply k sets of
621         // rotations, where k = n_iter_perf_sweep_max. Also note that we only
622         // apply to n_UV_apply columns of U and V since this is the most we
623         // need to touch given the most recent value stored to m_GH_sweep_max.
624         //FLA_Apply_G_rf_bld_var5( n_iter_perf_sweep_max,
625         FLA_Apply_G_rf_bld_var3( n_iter_perf_sweep_max,
626                                  //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
627                                  //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
628                                  m_U,
629                                  n_UV_apply,
630                                  buff_G, rs_G, cs_G,
631                                  buff_U, rs_U, cs_U,
632                                  b_alg );
633         //FLA_Apply_G_rf_bld_var5( n_iter_perf_sweep_max,
634         FLA_Apply_G_rf_bld_var3( n_iter_perf_sweep_max,
635                                  //FLA_Apply_G_rf_bld_var9( n_iter_perf_sweep_max,
636                                  //FLA_Apply_G_rf_bld_var6( n_iter_perf_sweep_max,
637                                  m_V,
638                                  n_UV_apply,
639                                  buff_H, rs_H, cs_H,
640                                  buff_V, rs_V, cs_V,
641                                  b_alg );
642 
643         // Increment the total number of iterations previously performed.
644         n_iter_prev += n_iter_perf_sweep_max;
645 
646 #ifdef PRINTF
647         printf( "FLA_Bsvd_v_opd_var1: total number of iterations performed: %d\n", n_iter_prev );
648 #endif
649     }
650 
651     // Make all the singular values positive.
652     {
653         int    i;
654         double minus_one = bl1_dm1();
655 
656         for ( i = 0; i < min_m_n; ++i )
657         {
658             if ( buff_d[ (i  )*inc_d ] < rzero )
659             {
660                 buff_d[ (i  )*inc_d ] = -buff_d[ (i  )*inc_d ];
661 
662                 // Scale the right singular vectors.
663                 bl1_dscalv( BLIS1_NO_CONJUGATE,
664                             m_V,
665                             &minus_one,
666                             buff_V + (i  )*cs_V, rs_V );
667             }
668         }
669     }
670 
671     return n_iter_prev;
672 }
673 
FLA_Bsvd_v_opc_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,float * buff_d,int inc_d,float * buff_e,int inc_e,scomplex * buff_G,int rs_G,int cs_G,scomplex * buff_H,int rs_H,int cs_H,scomplex * buff_U,int rs_U,int cs_U,scomplex * buff_V,int rs_V,int cs_V,int b_alg)674 FLA_Error FLA_Bsvd_v_opc_var1( int       min_m_n,
675                                int       m_U,
676                                int       m_V,
677                                int       n_GH,
678                                int       n_iter_max,
679                                float*    buff_d, int inc_d,
680                                float*    buff_e, int inc_e,
681                                scomplex* buff_G, int rs_G, int cs_G,
682                                scomplex* buff_H, int rs_H, int cs_H,
683                                scomplex* buff_U, int rs_U, int cs_U,
684                                scomplex* buff_V, int rs_V, int cs_V,
685                                int       b_alg )
686 {
687     scomplex  one        = bl1_c1();
688     float     rzero      = bl1_s0();
689 
690     int       maxitr     = 6;
691 
692     float     eps;
693     float     tolmul;
694     float     tol;
695     float     thresh;
696 
697     scomplex* G;
698     scomplex* H;
699     float*    d1;
700     float*    e1;
701     int       r_val;
702     int       done;
703     int       m_GH_sweep_max;
704     int       ij_begin;
705     int       ijTL, ijBR;
706     int       m_A11;
707     int       n_iter_perf;
708     int       n_UV_apply;
709     int       total_deflations;
710     int       n_deflations;
711     int       n_iter_prev;
712     int       n_iter_perf_sweep_max;
713 
714     // Compute some convergence constants.
715     eps    = FLA_Mach_params_ops( FLA_MACH_EPS );
716     tolmul = max( 10.0F, min( 100.0F, powf( eps, -0.125F ) ) );
717     FLA_Bsvd_compute_tol_thresh_ops( min_m_n,
718                                      tolmul,
719                                      maxitr,
720                                      buff_d, inc_d,
721                                      buff_e, inc_e,
722                                      &tol,
723                                      &thresh );
724 
725     // Initialize our completion flag.
726     done = FALSE;
727 
728     // Initialize a counter that holds the maximum number of rows of G
729     // and H that we would need to initialize for the next sweep.
730     m_GH_sweep_max = min_m_n - 1;
731 
732     // Initialize a counter for the total number of iterations performed.
733     n_iter_prev = 0;
734 
735     // Iterate until the matrix has completely deflated.
736     for ( total_deflations = 0; done != TRUE; )
737     {
738 
739         // Initialize G and H to contain only identity rotations.
740         bl1_csetm( m_GH_sweep_max,
741                    n_GH,
742                    &one,
743                    buff_G, rs_G, cs_G );
744         bl1_csetm( m_GH_sweep_max,
745                    n_GH,
746                    &one,
747                    buff_H, rs_H, cs_H );
748 
749         // Keep track of the maximum number of iterations performed in the
750         // current sweep. This is used when applying the sweep's Givens
751         // rotations.
752         n_iter_perf_sweep_max = 0;
753 
754         // Perform a sweep: Move through the matrix and perform a bidiagonal
755         // SVD on each non-zero submatrix that is encountered. During the
756         // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
757         for ( ij_begin = 0; ij_begin < min_m_n;  )
758         {
759             // Search for the first submatrix along the diagonal that is
760             // bounded by zeroes (or endpoints of the matrix). If no
761             // submatrix is found (ie: if the entire superdiagonal is zero
762             // then FLA_FAILURE is returned. This function also inspects
763             // superdiagonal elements for proximity to zero. If a given
764             // element is close enough to zero, then it is deemed
765             // converged and manually set to zero.
766             r_val = FLA_Bsvd_find_submatrix_ops( min_m_n,
767                                                  ij_begin,
768                                                  buff_d, inc_d,
769                                                  buff_e, inc_e,
770                                                  &ijTL,
771                                                  &ijBR );
772 
773             // Verify that a submatrix was found. If one was not found,
774             // then we are done with the current sweep. Furthermore, if
775             // a submatrix was not found AND we began our search at the
776             // beginning of the matrix (ie: ij_begin == 0), then the
777             // matrix has completely deflated and so we are done with
778             // Francis step iteration.
779             if ( r_val == FLA_FAILURE )
780             {
781                 if ( ij_begin == 0 )
782                 {
783                     done = TRUE;
784                 }
785 
786                 // Break out of the current sweep so we can apply the last
787                 // remaining Givens rotations.
788                 break;
789             }
790 
791             // If we got this far, then:
792             //   (a) ijTL refers to the index of the first non-zero
793             //       superdiagonal along the diagonal, and
794             //   (b) ijBR refers to either:
795             //       - the first zero element that occurs after ijTL, or
796             //       - the the last diagonal element.
797             // Note that ijTL and ijBR also correspond to the first and
798             // last diagonal elements of the submatrix of interest. Thus,
799             // we may compute the dimension of this submatrix as:
800             m_A11 = ijBR - ijTL + 1;
801 
802             // Adjust ij_begin, which gets us ready for the next submatrix
803             // search in the current sweep.
804             ij_begin = ijBR + 1;
805 
806             // Index to the submatrices upon which we will operate.
807             d1 = buff_d + ijTL * inc_d;
808             e1 = buff_e + ijTL * inc_e;
809             G  = buff_G + ijTL * rs_G;
810             H  = buff_H + ijTL * rs_H;
811 
812             // Search for a batch of singular values, recursing on deflated
813             // subproblems whenever a split occurs. Iteration continues as
814             // long as
815             //   (a) there is still matrix left to operate on, and
816             //   (b) the number of iterations performed in this batch is
817             //       less than n_G.
818             // If/when either of the two above conditions fails to hold,
819             // the function returns.
820             n_deflations = FLA_Bsvd_iteracc_v_ops_var1( m_A11,
821                                                         n_GH,
822                                                         ijTL,
823                                                         tol,
824                                                         thresh,
825                                                         d1, inc_d,
826                                                         e1, inc_e,
827                                                         G,  rs_G, cs_G,
828                                                         H,  rs_H, cs_H,
829                                                         &n_iter_perf );
830 
831             // Record the number of deflations that were observed.
832             total_deflations += n_deflations;
833 
834             // Update the maximum number of iterations performed in the
835             // current sweep.
836             n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
837 
838             // Store the most recent value of ijBR in m_G_sweep_max.
839             // When the sweep is done, this value will contain the minimum
840             // number of rows of G and H we can apply and safely include all
841             // non-identity rotations that were computed during the
842             // singular value searches.
843             m_GH_sweep_max = ijBR;
844 
845             // Make sure we haven't exceeded our maximum iteration count.
846             if ( n_iter_prev >= n_iter_max * min_m_n )
847             {
848                 FLA_Abort();
849                 //return FLA_FAILURE;
850             }
851         }
852 
853         // The sweep is complete. Now we must apply the Givens rotations
854         // that were accumulated during the sweep.
855 
856         // Recall that the number of columns of U and V to which we apply
857         // rotations is one more than the number of rotations.
858         n_UV_apply = m_GH_sweep_max + 1;
859 
860         // Apply the Givens rotations. Note that we only apply k sets of
861         // rotations, where k = n_iter_perf_sweep_max. Also note that we only
862         // apply to n_UV_apply columns of U and V since this is the most we
863         // need to touch given the most recent value stored to m_GH_sweep_max.
864         //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
865         FLA_Apply_G_rf_blc_var3( n_iter_perf_sweep_max,
866                                  //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
867                                  //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
868                                  m_U,
869                                  n_UV_apply,
870                                  buff_G, rs_G, cs_G,
871                                  buff_U, rs_U, cs_U,
872                                  b_alg );
873         //FLA_Apply_G_rf_blc_var5( n_iter_perf_sweep_max,
874         FLA_Apply_G_rf_blc_var3( n_iter_perf_sweep_max,
875                                  //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
876                                  //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
877                                  m_V,
878                                  n_UV_apply,
879                                  buff_H, rs_H, cs_H,
880                                  buff_V, rs_V, cs_V,
881                                  b_alg );
882 
883         // Increment the total number of iterations previously performed.
884         n_iter_prev += n_iter_perf_sweep_max;
885     }
886 
887     // Make all the singular values positive.
888     {
889         int    i;
890         float  minus_one = bl1_sm1();
891 
892         for ( i = 0; i < min_m_n; ++i )
893         {
894             if ( buff_d[ (i  )*inc_d ] < rzero )
895             {
896                 buff_d[ (i  )*inc_d ] = -buff_d[ (i  )*inc_d ];
897 
898                 // Scale the right singular vectors.
899                 bl1_csscalv( BLIS1_NO_CONJUGATE,
900                              m_V,
901                              &minus_one,
902                              buff_V + (i  )*cs_V, rs_V );
903             }
904         }
905     }
906 
907     return FLA_SUCCESS;
908 }
909 
910 //#define PRINTF
911 
FLA_Bsvd_v_opz_var1(int min_m_n,int m_U,int m_V,int n_GH,int n_iter_max,double * buff_d,int inc_d,double * buff_e,int inc_e,dcomplex * buff_G,int rs_G,int cs_G,dcomplex * buff_H,int rs_H,int cs_H,dcomplex * buff_U,int rs_U,int cs_U,dcomplex * buff_V,int rs_V,int cs_V,int b_alg)912 FLA_Error FLA_Bsvd_v_opz_var1( int       min_m_n,
913                                int       m_U,
914                                int       m_V,
915                                int       n_GH,
916                                int       n_iter_max,
917                                double*   buff_d, int inc_d,
918                                double*   buff_e, int inc_e,
919                                dcomplex* buff_G, int rs_G, int cs_G,
920                                dcomplex* buff_H, int rs_H, int cs_H,
921                                dcomplex* buff_U, int rs_U, int cs_U,
922                                dcomplex* buff_V, int rs_V, int cs_V,
923                                int       b_alg )
924 {
925     dcomplex  one        = bl1_z1();
926     double    rzero      = bl1_d0();
927 
928     int       maxitr     = 6;
929 
930     double    eps;
931     double    tolmul;
932     double    tol;
933     double    thresh;
934 
935     dcomplex* G;
936     dcomplex* H;
937     double*   d1;
938     double*   e1;
939     int       r_val;
940     int       done;
941     int       m_GH_sweep_max;
942     int       ij_begin;
943     int       ijTL, ijBR;
944     int       m_A11;
945     int       n_iter_perf;
946     int       n_UV_apply;
947     int       total_deflations;
948     int       n_deflations;
949     int       n_iter_prev;
950     int       n_iter_perf_sweep_max;
951 
952     // Compute some convergence constants.
953     eps    = FLA_Mach_params_opd( FLA_MACH_EPS );
954     tolmul = max( 10.0, min( 100.0, pow( eps, -0.125 ) ) );
955     FLA_Bsvd_compute_tol_thresh_opd( min_m_n,
956                                      tolmul,
957                                      maxitr,
958                                      buff_d, inc_d,
959                                      buff_e, inc_e,
960                                      &tol,
961                                      &thresh );
962 #ifdef PRINTF
963     printf( "FLA_Bsvd_v_opz_var1: tolmul = %12.6e\n", tolmul );
964     printf( "FLA_Bsvd_v_opz_var1: tol    = %12.6e\n", tol );
965     printf( "FLA_Bsvd_v_opz_var1: thresh = %12.6e\n", thresh );
966 #endif
967 
968     // Initialize our completion flag.
969     done = FALSE;
970 
971     // Initialize a counter that holds the maximum number of rows of G
972     // and H that we would need to initialize for the next sweep.
973     m_GH_sweep_max = min_m_n - 1;
974 
975     // Initialize a counter for the total number of iterations performed.
976     n_iter_prev = 0;
977 
978     // Iterate until the matrix has completely deflated.
979     for ( total_deflations = 0; done != TRUE; )
980     {
981 
982         // Initialize G and H to contain only identity rotations.
983         bl1_zsetm( m_GH_sweep_max,
984                    n_GH,
985                    &one,
986                    buff_G, rs_G, cs_G );
987         bl1_zsetm( m_GH_sweep_max,
988                    n_GH,
989                    &one,
990                    buff_H, rs_H, cs_H );
991 
992         // Keep track of the maximum number of iterations performed in the
993         // current sweep. This is used when applying the sweep's Givens
994         // rotations.
995         n_iter_perf_sweep_max = 0;
996 
997         // Perform a sweep: Move through the matrix and perform a bidiagonal
998         // SVD on each non-zero submatrix that is encountered. During the
999         // first time through, ijTL will be 0 and ijBR will be min_m_n - 1.
1000         for ( ij_begin = 0; ij_begin < min_m_n;  )
1001         {
1002 
1003 #ifdef PRINTF
1004             if ( ij_begin == 0 )
1005                 printf( "FLA_Bsvd_v_opz_var1: beginning new sweep (ij_begin = %d)\n", ij_begin );
1006 #endif
1007 
1008             // Search for the first submatrix along the diagonal that is
1009             // bounded by zeroes (or endpoints of the matrix). If no
1010             // submatrix is found (ie: if the entire superdiagonal is zero
1011             // then FLA_FAILURE is returned. This function also inspects
1012             // superdiagonal elements for proximity to zero. If a given
1013             // element is close enough to zero, then it is deemed
1014             // converged and manually set to zero.
1015             r_val = FLA_Bsvd_find_submatrix_opd( min_m_n,
1016                                                  ij_begin,
1017                                                  buff_d, inc_d,
1018                                                  buff_e, inc_e,
1019                                                  &ijTL,
1020                                                  &ijBR );
1021 
1022             // Verify that a submatrix was found. If one was not found,
1023             // then we are done with the current sweep. Furthermore, if
1024             // a submatrix was not found AND we began our search at the
1025             // beginning of the matrix (ie: ij_begin == 0), then the
1026             // matrix has completely deflated and so we are done with
1027             // Francis step iteration.
1028             if ( r_val == FLA_FAILURE )
1029             {
1030                 if ( ij_begin == 0 )
1031                 {
1032 #ifdef PRINTF
1033                     printf( "FLA_Bsvd_v_opz_var1: superdiagonal is completely zero.\n" );
1034                     printf( "FLA_Bsvd_v_opz_var1: Francis iteration is done!\n" );
1035 #endif
1036                     done = TRUE;
1037                 }
1038 
1039                 // Break out of the current sweep so we can apply the last
1040                 // remaining Givens rotations.
1041                 break;
1042             }
1043 
1044             // If we got this far, then:
1045             //   (a) ijTL refers to the index of the first non-zero
1046             //       superdiagonal along the diagonal, and
1047             //   (b) ijBR refers to either:
1048             //       - the first zero element that occurs after ijTL, or
1049             //       - the the last diagonal element.
1050             // Note that ijTL and ijBR also correspond to the first and
1051             // last diagonal elements of the submatrix of interest. Thus,
1052             // we may compute the dimension of this submatrix as:
1053             m_A11 = ijBR - ijTL + 1;
1054 
1055 #ifdef PRINTF
1056             printf( "FLA_Bsvd_v_opz_var1: ij_begin = %d\n", ij_begin );
1057             printf( "FLA_Bsvd_v_opz_var1: ijTL     = %d\n", ijTL );
1058             printf( "FLA_Bsvd_v_opz_var1: ijBR     = %d\n", ijBR );
1059             printf( "FLA_Bsvd_v_opz_var1: m_A11    = %d\n", m_A11 );
1060 #endif
1061 
1062             // Adjust ij_begin, which gets us ready for the next submatrix
1063             // search in the current sweep.
1064             ij_begin = ijBR + 1;
1065 
1066             // Index to the submatrices upon which we will operate.
1067             d1 = buff_d + ijTL * inc_d;
1068             e1 = buff_e + ijTL * inc_e;
1069             G  = buff_G + ijTL * rs_G;
1070             H  = buff_H + ijTL * rs_H;
1071 
1072             // Search for a batch of singular values, recursing on deflated
1073             // subproblems whenever a split occurs. Iteration continues as
1074             // long as
1075             //   (a) there is still matrix left to operate on, and
1076             //   (b) the number of iterations performed in this batch is
1077             //       less than n_G.
1078             // If/when either of the two above conditions fails to hold,
1079             // the function returns.
1080             n_deflations = FLA_Bsvd_iteracc_v_opd_var1( m_A11,
1081                            n_GH,
1082                            ijTL,
1083                            tol,
1084                            thresh,
1085                            d1, inc_d,
1086                            e1, inc_e,
1087                            G,  rs_G, cs_G,
1088                            H,  rs_H, cs_H,
1089                            &n_iter_perf );
1090 
1091             // Record the number of deflations that were observed.
1092             total_deflations += n_deflations;
1093 
1094             // Update the maximum number of iterations performed in the
1095             // current sweep.
1096             n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf );
1097 
1098 #ifdef PRINTF
1099             printf( "FLA_Bsvd_v_opz_var1: deflations observed       = %d\n", n_deflations );
1100             printf( "FLA_Bsvd_v_opz_var1: total deflations observed = %d\n", total_deflations );
1101             printf( "FLA_Bsvd_v_opz_var1: num iterations performed  = %d\n", n_iter_perf );
1102 #endif
1103 
1104             // Store the most recent value of ijBR in m_G_sweep_max.
1105             // When the sweep is done, this value will contain the minimum
1106             // number of rows of G and H we can apply and safely include all
1107             // non-identity rotations that were computed during the
1108             // singular value searches.
1109             m_GH_sweep_max = ijBR;
1110 
1111             // Make sure we haven't exceeded our maximum iteration count.
1112             if ( n_iter_prev >= n_iter_max * min_m_n )
1113             {
1114 #ifdef PRINTF
1115                 printf( "FLA_Bsvd_v_opz_var1: reached maximum total number of iterations: %d\n", n_iter_prev );
1116 #endif
1117                 FLA_Abort();
1118                 //return FLA_FAILURE;
1119             }
1120         }
1121 
1122         // The sweep is complete. Now we must apply the Givens rotations
1123         // that were accumulated during the sweep.
1124 
1125         // Recall that the number of columns of U and V to which we apply
1126         // rotations is one more than the number of rotations.
1127         n_UV_apply = m_GH_sweep_max + 1;
1128 
1129 #ifdef PRINTF
1130         printf( "FLA_Bsvd_v_opz_var1: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max );
1131         printf( "FLA_Bsvd_v_opz_var1: m_U = %d\n", m_U );
1132         printf( "FLA_Bsvd_v_opz_var1: m_V = %d\n", m_V );
1133         printf( "FLA_Bsvd_v_opz_var1: napp= %d\n", n_UV_apply );
1134 #endif
1135         // Apply the Givens rotations. Note that we only apply k sets of
1136         // rotations, where k = n_iter_perf_sweep_max. Also note that we only
1137         // apply to n_UV_apply columns of U and V since this is the most we
1138         // need to touch given the most recent value stored to m_GH_sweep_max.
1139         //FLA_Apply_G_rf_blz_var5( n_iter_perf_sweep_max,
1140         FLA_Apply_G_rf_blz_var3( n_iter_perf_sweep_max,
1141                                  //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
1142                                  //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
1143                                  m_U,
1144                                  n_UV_apply,
1145                                  buff_G, rs_G, cs_G,
1146                                  buff_U, rs_U, cs_U,
1147                                  b_alg );
1148         //FLA_Apply_G_rf_blz_var5( n_iter_perf_sweep_max,
1149         FLA_Apply_G_rf_blz_var3( n_iter_perf_sweep_max,
1150                                  //FLA_Apply_G_rf_blz_var9( n_iter_perf_sweep_max,
1151                                  //FLA_Apply_G_rf_blz_var6( n_iter_perf_sweep_max,
1152                                  m_V,
1153                                  n_UV_apply,
1154                                  buff_H, rs_H, cs_H,
1155                                  buff_V, rs_V, cs_V,
1156                                  b_alg );
1157 
1158         // Increment the total number of iterations previously performed.
1159         n_iter_prev += n_iter_perf_sweep_max;
1160 
1161 #ifdef PRINTF
1162         printf( "FLA_Bsvd_v_opz_var1: total number of iterations performed: %d\n", n_iter_prev );
1163 #endif
1164     }
1165 
1166     // Make all the singular values positive.
1167     {
1168         int    i;
1169         double minus_one = bl1_dm1();
1170 
1171         for ( i = 0; i < min_m_n; ++i )
1172         {
1173             if ( buff_d[ (i  )*inc_d ] < rzero )
1174             {
1175                 buff_d[ (i  )*inc_d ] = -buff_d[ (i  )*inc_d ];
1176 
1177                 // Scale the right singular vectors.
1178                 bl1_zdscalv( BLIS1_NO_CONJUGATE,
1179                              m_V,
1180                              &minus_one,
1181                              buff_V + (i  )*cs_V, rs_V );
1182             }
1183         }
1184     }
1185 
1186     return n_iter_prev;
1187 }
1188 
1189