1 /*
2  * $Id: dgesvd.c,v 1.1 2005-09-18 22:04:48 dhmunro Exp $
3  * LAPACK routine to return the SVD of a matrix (m>=n branch only).
4  */
5 /* Copyright (c) 2005, The Regents of the University of California.
6  * All rights reserved.
7  * This file is part of yorick (http://yorick.sourceforge.net).
8  * Read the accompanying LICENSE file for details.
9  */
10 
11 #include "dg.h"
12 
13 /* Compilers have a hard time with this ridiculously large driver routine.
14    Therefore, I split it into a separate m>=n and m<n branch.  The m<n
15    branch is in the file dgesv2.c.  Here is its prototype: */
16 extern void dgesv2( char jobu, char jobvt, long m, long n,
17                    double a[], long lda, double s[], double u[], long ldu,
18                    double vt[], long ldvt,
19                    double work[], long lwork, long *info );
20 
21 
22 
23 /*---blas routines---*/
24 /* dgemm */
25 
26 
27 
28 /*---converted nutty string switches to single characters (lower case)---*/
29 #define lsame(x,y) ((x)==(y))
30 
31 
32 /*-----Fortran intrinsics converted-----*/
33 extern double sqrt(double);
34 #define min(x,y) ((x)<(y)?(x):(y))
35 #define max(x,y) ((x)>(y)?(x):(y))
36 /*-----end of Fortran intrinsics-----*/
37 
38 
39 
dgesvd(char jobu,char jobvt,long m,long n,double a[],long lda,double s[],double u[],long ldu,double vt[],long ldvt,double work[],long lwork,long * info)40 void dgesvd( char jobu, char jobvt, long m, long n,
41             double a[], long lda, double s[], double u[], long ldu,
42             double vt[], long ldvt,
43             double work[], long lwork, long *info )
44 {
45   /**
46    *  -- LAPACK driver routine (version 1.1) --
47    *     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
48    *     Courant Institute, Argonne National Lab, and Rice University
49    *     March 31, 1993
50    *
51    *     .. Scalar Arguments ..*/
52   /**     ..
53    *     .. Array Arguments ..*/
54 #undef work_1
55 #define work_1(a1) work[a1-1]
56 #undef vt_2
57 #define vt_2(a1,a2) vt[a1-1+ldvt*(a2-1)]
58 #undef u_2
59 #define u_2(a1,a2) u[a1-1+ldu*(a2-1)]
60 #undef s_1
61 #define s_1(a1) s[a1-1]
62 #undef a_2
63 #define a_2(a1,a2) a[a1-1+lda*(a2-1)]
64   /**     ..
65    *
66    *  Purpose
67    *  =======
68    *
69    *  DGESVD computes the singular value decomposition (SVD) of a real
70    *  M-by-N matrix A, optionally computing the left and/or right singular
71    *  vectors. The SVD is written
72    *
73    *       A = U * SIGMA * transpose(V)
74    *
75    *  where SIGMA is an M-by-N matrix which is zero except for its
76    *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
77    *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
78    *  are the singular values of A; they are real and non-negative, and
79    *  are returned in descending order.  The first min(m,n) columns of
80    *  U and V are the left and right singular vectors of A.
81    *
82    *  Note that the routine returns V**T, not V.
83    *
84    *  Arguments
85    *  =========
86    *
87    *  JOBU    (input) CHARACTER*1
88    *          Specifies options for computing all or part of the matrix U:
89    *          = 'A':  all M columns of U are returned in array U:
90    *          = 'S':  the first min(m,n) columns of U (the left singular
91    *                  vectors) are returned in the array U;
92    *          = 'O':  the first min(m,n) columns of U (the left singular
93    *                  vectors) are overwritten on the array A;
94    *          = 'N':  no columns of U (no left singular vectors) are
95    *                  computed.
96    *
97    *  JOBVT   (input) CHARACTER*1
98    *          Specifies options for computing all or part of the matrix
99    *          V**T:
100    *          = 'A':  all N rows of V**T are returned in the array VT;
101    *          = 'S':  the first min(m,n) rows of V**T (the right singular
102    *                  vectors) are returned in the array VT;
103    *          = 'O':  the first min(m,n) rows of V**T (the right singular
104    *                  vectors) are overwritten on the array A;
105    *          = 'N':  no rows of V**T (no right singular vectors) are
106    *                  computed.
107    *
108    *          JOBVT and JOBU cannot both be 'O'.
109    *
110    *  M       (input) INTEGER
111    *          The number of rows of the input matrix A.  M >= 0.
112    *
113    *  N       (input) INTEGER
114    *          The number of columns of the input matrix A.  N >= 0.
115    *
116    *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
117    *          On entry, the M-by-N matrix A.
118    *          On exit,
119    *          if JOBU = 'O',  A is overwritten with the first min(m,n)
120    *                          columns of U (the left singular vectors,
121    *                          stored columnwise);
122    *          if JOBVT = 'O', A is overwritten with the first min(m,n)
123    *                          rows of V**T (the right singular vectors,
124    *                          stored rowwise);
125    *          if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A
126    *                          are destroyed.
127    *
128    *  LDA     (input) INTEGER
129    *          The leading dimension of the array A.  LDA >= max(1,M).
130    *
131    *  S       (output) DOUBLE PRECISION array, dimension (min(M,N))
132    *          The singular values of A, sorted so that S(i) >= S(i+1).
133    *
134    *  U       (output) DOUBLE PRECISION array, dimension (LDU,UCOL)
135    *          (LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'.
136    *          If JOBU = 'A', U contains the M-by-M orthogonal matrix U;
137    *          if JOBU = 'S', U contains the first min(m,n) columns of U
138    *          (the left singular vectors, stored columnwise);
139    *          if JOBU = 'N' or 'O', U is not referenced.
140    *
141    *  LDU     (input) INTEGER
142    *          The leading dimension of the array U.  LDU >= 1; if
143    *          JOBU = 'S' or 'A', LDU >= M.
144    *
145    *  VT      (output) DOUBLE PRECISION array, dimension (LDVT,N)
146    *          If JOBVT = 'A', VT contains the N-by-N orthogonal matrix
147    *          V**T;
148    *          if JOBVT = 'S', VT contains the first min(m,n) rows of
149    *          V**T (the right singular vectors, stored rowwise);
150    *          if JOBVT = 'N' or 'O', VT is not referenced.
151    *
152    *  LDVT    (input) INTEGER
153    *          The leading dimension of the array VT.  LDVT >= 1; if
154    *          JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N).
155    *
156    *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (LWORK)
157    *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
158    *          if INFO > 0, WORK(2:MIN(M,N)) contains the unconverged
159    *          superdiagonal elements of an upper bidiagonal matrix B
160    *          whose diagonal is in S (not necessarily sorted). B
161    *          satisfies A = U * B * VT, so it has the same singular values
162    *          as A, and singular vectors related by U and VT.
163    *
164    *  LWORK   (input) INTEGER
165    *          The dimension of the array WORK. LWORK >= 1.
166    *          LWORK >= MAX(3*MIN(M,N)+MAX(M,N),5*MIN(M,N)-4).
167    *          For good performance, LWORK should generally be larger.
168    *
169    *  INFO    (output) INTEGER
170    *          = 0:  successful exit.
171    *          < 0:  if INFO = -i, the i-th argument had an illegal value.
172    *          > 0:  if DBDSQR did not converge, INFO specifies how many
173    *                superdiagonals of an intermediate bidiagonal form B
174    *                did not converge to zero. See the description of WORK
175    *                above for details.
176    *
177    *  =====================================================================
178    *
179    *     .. Parameters ..*/
180 #undef zero
181 #define zero 0.0e0
182 #undef one
183 #define one 1.0e0
184   /**     ..
185    *     .. Local Scalars ..*/
186   int            wntua, wntuas, wntun, wntuo, wntus, wntva,
187   wntvas, wntvn, wntvo, wntvs;
188   long   chunk, i, ie=0, ierr, ir, iscl, itau, itaup,
189          itauq, iu, iwork, ldwrkr, ldwrku, maxwrk=0,
190          minmn, minwrk, mnthr, ncu=0, ncvt=0, nru=0,
191          wrkbl=0;
192   double    anrm, bignum, eps, smlnum;
193   char job_u_vt[3];
194   /**     ..
195    *     .. Local Arrays ..*/
196   double    dum[1];
197 #undef dum_1
198 #define dum_1(a1) [a1-1]
199   /**     ..
200    *     .. Intrinsic Functions ..*/
201   /*      intrinsic          max, min, sqrt;*/
202   /**     ..
203    *     .. Executable Statements ..
204    *
205    *     Test the input arguments
206    **/
207   /*-----implicit-declarations-----*/
208   /*-----end-of-declarations-----*/
209   if (m<n) {
210     dgesv2( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
211            work, lwork, info );
212     return;
213   }
214 
215   *info = 0;
216   minmn = min( m, n );
217   job_u_vt[0]= jobu;
218   job_u_vt[1]= jobvt;
219   job_u_vt[2]= '\0';
220   mnthr = ilaenv( 6, "dgesvd", job_u_vt, m, n, 0, 0 );
221   wntua = lsame( jobu, 'a' );
222   wntus = lsame( jobu, 's' );
223   wntuas = wntua || wntus;
224   wntuo = lsame( jobu, 'o' );
225   wntun = lsame( jobu, 'n' );
226   wntva = lsame( jobvt, 'a' );
227   wntvs = lsame( jobvt, 's' );
228   wntvas = wntva || wntvs;
229   wntvo = lsame( jobvt, 'o' );
230   wntvn = lsame( jobvt, 'n' );
231   minwrk = 1;
232 
233   if( !( wntua || wntus || wntuo || wntun ) ) {
234     *info = -1;
235   } else if( !( wntva || wntvs || wntvo || wntvn ) ||
236             ( wntvo && wntuo ) ) {
237     *info = -2;
238   } else if( m<0 ) {
239     *info = -3;
240   } else if( n<0 ) {
241     *info = -4;
242   } else if( lda<max( 1, m ) ) {
243     *info = -6;
244   } else if( ldu<1 || ( wntuas && ldu<m ) ) {
245     *info = -9;
246   } else if( ldvt<1 || ( wntva && ldvt<n ) ||
247             ( wntvs && ldvt<minmn ) ) {
248     *info = -11;
249   }
250   /**
251    *     Compute workspace
252    *      (Note: Comments in the code beginning "Workspace:" describe the
253    *       minimal amount of workspace needed at that point in the code,
254    *       as well as the preferred amount for good performance.
255    *       NB refers to the optimal block size for the immediately
256    *       following subroutine, as returned by ILAENV.)
257    **/
258   if( *info==0 && lwork>=1 && m>0 && n>0 ) {
259     if( m>=mnthr ) {
260       if( wntun ) {
261         /**
262          *                 Path 1 (M much larger than N, JOBU='N')
263          **/
264         maxwrk = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1,
265                               -1 );
266         maxwrk = max( maxwrk, 3*n+2*n*
267                      ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
268         if( wntvo || wntvas )
269           maxwrk = max( maxwrk, 3*n+( n-1 )*
270                        ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
271         maxwrk = max( maxwrk, 5*n-4 );
272         minwrk = max( 4*n, 5*n-4 );
273       } else if( wntuo && wntvn ) {
274         /**
275          *                 Path 2 (M much larger than N, JOBU='O', JOBVT='N')
276          **/
277         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
278         wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
279                                        n, n, -1 ) );
280         wrkbl = max( wrkbl, 3*n+2*n*
281                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
282         wrkbl = max( wrkbl, 3*n+n*
283                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
284         wrkbl = max( wrkbl, 5*n-4 );
285         maxwrk = max( n*n+wrkbl, n*n+m*n+n );
286         minwrk = max( 3*n+m, 5*n-4 );
287         minwrk = min( minwrk, maxwrk );
288       } else if( wntuo && wntvas ) {
289         /**
290          *                 Path 3 (M much larger than N, JOBU='O', JOBVT='S' or
291          *                 'A')
292          **/
293         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
294         wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
295                                        n, n, -1 ) );
296         wrkbl = max( wrkbl, 3*n+2*n*
297                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
298         wrkbl = max( wrkbl, 3*n+n*
299                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
300         wrkbl = max( wrkbl, 3*n+( n-1 )*
301                     ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
302         wrkbl = max( wrkbl, 5*n-4 );
303         maxwrk = max( n*n+wrkbl, n*n+m*n+n );
304         minwrk = max( 3*n+m, 5*n-4 );
305         minwrk = min( minwrk, maxwrk );
306       } else if( wntus && wntvn ) {
307         /**
308          *                 Path 4 (M much larger than N, JOBU='S', JOBVT='N')
309          **/
310         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
311         wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
312                                        n, n, -1 ) );
313         wrkbl = max( wrkbl, 3*n+2*n*
314                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
315         wrkbl = max( wrkbl, 3*n+n*
316                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
317         wrkbl = max( wrkbl, 5*n-4 );
318         maxwrk = n*n + wrkbl;
319         minwrk = max( 3*n+m, 5*n-4 );
320         minwrk = min( minwrk, maxwrk );
321       } else if( wntus && wntvo ) {
322         /**
323          *                 Path 5 (M much larger than N, JOBU='S', JOBVT='O')
324          **/
325         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
326         wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
327                                        n, n, -1 ) );
328         wrkbl = max( wrkbl, 3*n+2*n*
329                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
330         wrkbl = max( wrkbl, 3*n+n*
331                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
332         wrkbl = max( wrkbl, 3*n+( n-1 )*
333                     ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
334         wrkbl = max( wrkbl, 5*n-4 );
335         maxwrk = 2*n*n + wrkbl;
336         minwrk = max( 3*n+m, 5*n-4 );
337         minwrk = min( minwrk, maxwrk );
338       } else if( wntus && wntvas ) {
339         /**
340          *                 Path 6 (M much larger than N, JOBU='S', JOBVT='S' or
341          *                 'A')
342          **/
343         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
344         wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
345                                        n, n, -1 ) );
346         wrkbl = max( wrkbl, 3*n+2*n*
347                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
348         wrkbl = max( wrkbl, 3*n+n*
349                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
350         wrkbl = max( wrkbl, 3*n+( n-1 )*
351                     ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
352         wrkbl = max( wrkbl, 5*n-4 );
353         maxwrk = n*n + wrkbl;
354         minwrk = max( 3*n+m, 5*n-4 );
355         minwrk = min( minwrk, maxwrk );
356       } else if( wntua && wntvn ) {
357         /**
358          *                 Path 7 (M much larger than N, JOBU='A', JOBVT='N')
359          **/
360         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
361         wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
362                                        m, n, -1 ) );
363         wrkbl = max( wrkbl, 3*n+2*n*
364                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
365         wrkbl = max( wrkbl, 3*n+n*
366                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
367         wrkbl = max( wrkbl, 5*n-4 );
368         maxwrk = n*n + wrkbl;
369         minwrk = max( 3*n+m, 5*n-4 );
370         minwrk = min( minwrk, maxwrk );
371       } else if( wntua && wntvo ) {
372         /**
373          *                 Path 8 (M much larger than N, JOBU='A', JOBVT='O')
374          **/
375         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
376         wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
377                                        m, n, -1 ) );
378         wrkbl = max( wrkbl, 3*n+2*n*
379                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
380         wrkbl = max( wrkbl, 3*n+n*
381                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
382         wrkbl = max( wrkbl, 3*n+( n-1 )*
383                     ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
384         wrkbl = max( wrkbl, 5*n-4 );
385         maxwrk = 2*n*n + wrkbl;
386         minwrk = max( 3*n+m, 5*n-4 );
387         minwrk = min( minwrk, maxwrk );
388       } else if( wntua && wntvas ) {
389         /**
390          *                 Path 9 (M much larger than N, JOBU='A', JOBVT='S' or
391          *                 'A')
392          **/
393         wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
394         wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
395                                        m, n, -1 ) );
396         wrkbl = max( wrkbl, 3*n+2*n*
397                     ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
398         wrkbl = max( wrkbl, 3*n+n*
399                     ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
400         wrkbl = max( wrkbl, 3*n+( n-1 )*
401                     ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
402         wrkbl = max( wrkbl, 5*n-4 );
403         maxwrk = n*n + wrkbl;
404         minwrk = max( 3*n+m, 5*n-4 );
405         minwrk = min( minwrk, maxwrk );
406       }
407     } else {
408       /**
409        *              Path 10 (M at least N, but not much larger)
410        **/
411       maxwrk = 3*n + ( m+n )*ilaenv( 1, "dgebrd", " ", m, n,
412                                     -1, -1 );
413       if( wntus || wntuo )
414         maxwrk = max( maxwrk, 3*n+n*
415                      ilaenv( 1, "dorgbr", "q", m, n, n, -1 ) );
416       if( wntua )
417         maxwrk = max( maxwrk, 3*n+m*
418                      ilaenv( 1, "dorgbr", "q", m, m, n, -1 ) );
419       if( !wntvn )
420         maxwrk = max( maxwrk, 3*n+( n-1 )*
421                      ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
422       maxwrk = max( maxwrk, 5*n-4 );
423       minwrk = max( 3*n+m, 5*n-4 );
424     }
425     work_1( 1 ) = maxwrk;
426   }
427 
428   if( lwork<minwrk ) {
429     *info = -13;
430   }
431   if( *info!=0 ) {
432     xerbla( "dgesvd", -*info );
433     return;
434   }
435   /**
436    *     Quick return if possible
437    **/
438   if( m==0 || n==0 ) {
439     if( lwork>=1 )
440       work_1( 1 ) = one;
441     return;
442   }
443   /**
444    *     Get machine constants
445    **/
446   eps = dlamch( 'p' );
447   smlnum = sqrt( dlamch( 's' ) ) / eps;
448   bignum = one / smlnum;
449   /**
450    *     Scale A if max entry outside range [SMLNUM,BIGNUM]
451    **/
452   anrm = dlange( 'm', m, n, a, lda, dum );
453   iscl = 0;
454   if( anrm>zero && anrm<smlnum ) {
455     iscl = 1;
456     dlascl( 'g', 0, 0, anrm, smlnum, m, n, a, lda, &ierr );
457   } else if( anrm>bignum ) {
458     iscl = 1;
459     dlascl( 'g', 0, 0, anrm, bignum, m, n, a, lda, &ierr );
460   }
461 
462   /**
463    *        A has at least as many rows as columns. If A has sufficiently
464    *        more rows than columns, first reduce using the QR
465    *        decomposition (if sufficient workspace available)
466    **/
467   if( m>=mnthr ) {
468 
469     if( wntun ) {
470         /**
471          *              Path 1 (M much larger than N, JOBU='N')
472          *              No left singular vectors to be computed
473          **/
474         itau = 1;
475         iwork = itau + n;
476         /**
477          *              Compute A=Q*R
478          *              (Workspace: need 2*N, prefer N+N*NB)
479          **/
480         dgeqrf( m, n, a, lda, &work_1( itau ), &work_1( iwork ),
481                lwork-iwork+1, &ierr );
482         /**
483          *              Zero out below R
484          **/
485         dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ), lda );
486         ie = 1;
487         itauq = ie + n;
488         itaup = itauq + n;
489         iwork = itaup + n;
490         /**
491          *              Bidiagonalize R in A
492          *              (Workspace: need 4*N, prefer 3*N+2*N*NB)
493          **/
494         dgebrd( n, n, a, lda, s, &work_1( ie ), &work_1( itauq ),
495                &work_1( itaup ), &work_1( iwork ), lwork-iwork+1,
496                &ierr );
497         ncvt = 0;
498         if( wntvo || wntvas ) {
499           /**
500            *                 If right singular vectors desired, generate P'.
501            *                 (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
502            **/
503           dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
504                  &work_1( iwork ), lwork-iwork+1, &ierr );
505           ncvt = n;
506         }
507         iwork = ie + n;
508         /**
509          *              Perform bidiagonal QR iteration, computing right
510          *              singular vectors of A in A if desired
511          *              (Workspace: need 5*N-4)
512          **/
513         dbdsqr( 'u', n, ncvt, 0, 0, s, &work_1( ie ), a, lda,
514                dum, 1, dum, 1, &work_1( iwork ), info );
515         /**
516          *              If right singular vectors desired in VT, copy them there
517          **/
518         if( wntvas )
519           dlacpy( 'f', n, n, a, lda, vt, ldvt );
520 
521     } else if( wntuo && wntvn ) {
522         /**
523          *              Path 2 (M much larger than N, JOBU='O', JOBVT='N')
524          *              N left singular vectors to be overwritten on A and
525          *              no right singular vectors to be computed
526          **/
527         if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
528           /**
529            *                 Sufficient workspace for a fast algorithm
530            **/
531           ir = 1;
532           if( lwork>=max( wrkbl, lda*n+n )+lda*n ) {
533             /**
534              *                    WORK(IU) is LDA by N, WORK(IR) is LDA by N
535              **/
536             ldwrku = lda;
537             ldwrkr = lda;
538           } else if( lwork>=max( wrkbl, lda*n+n )+n*n ) {
539             /**
540              *                    WORK(IU) is LDA by N, WORK(IR) is N by N
541              **/
542             ldwrku = lda;
543             ldwrkr = n;
544           } else {
545             /**
546              *                    WORK(IU) is LDWRKU by N, WORK(IR) is N by N
547              **/
548             ldwrku = ( lwork-n*n-n ) / n;
549             ldwrkr = n;
550           }
551           itau = ir + ldwrkr*n;
552           iwork = itau + n;
553           /**
554            *                 Compute A=Q*R
555            *                 (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
556            **/
557           dgeqrf( m, n, a, lda, &work_1( itau ),
558                  &work_1( iwork ), lwork-iwork+1, &ierr );
559           /**
560            *                 Copy R to WORK(IR) and zero out below it
561            **/
562           dlacpy( 'u', n, n, a, lda, &work_1( ir ), ldwrkr );
563           dlaset( 'l', n-1, n-1, zero, zero, &work_1( ir+1 ),
564                  ldwrkr );
565           /**
566            *                 Generate Q in A
567            *                 (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
568            **/
569           dorgqr( m, n, n, a, lda, &work_1( itau ),
570                  &work_1( iwork ), lwork-iwork+1, &ierr );
571           ie = itau;
572           itauq = ie + n;
573           itaup = itauq + n;
574           iwork = itaup + n;
575           /**
576            *                 Bidiagonalize R in WORK(IR)
577            *                 (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
578            **/
579           dgebrd( n, n, &work_1( ir ), ldwrkr, s, &work_1( ie ),
580                  &work_1( itauq ), &work_1( itaup ),
581                  &work_1( iwork ), lwork-iwork+1, &ierr );
582           /**
583            *                 Generate left vectors bidiagonalizing R
584            *                 (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
585            **/
586           dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
587                  &work_1( itauq ), &work_1( iwork ),
588                  lwork-iwork+1, &ierr );
589           iwork = ie + n;
590           /**
591            *                 Perform bidiagonal QR iteration, computing left
592            *                 singular vectors of R in WORK(IR)
593            *                 (Workspace: need N*N+5*N-4)
594            **/
595           dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum, 1,
596                  &work_1( ir ), ldwrkr, dum, 1,
597                  &work_1( iwork ), info );
598           iu = ie + n;
599           /**
600            *                 Multiply Q in A by left singular vectors of R in
601            *                 WORK(IR), storing result in WORK(IU) and copying to A
602            *                 (Workspace: need N*N+2*N, prefer N*N+M*N+N)
603            **/
604           for (i=1 ; ldwrku>0?i<=m:i>=m ; i+=ldwrku) {
605             chunk = min( m-i+1, ldwrku );
606             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, chunk,
607                         n, n, one, &a_2( i, 1 ), lda, &work_1( ir ), ldwrkr,
608                         zero, &work_1( iu ), ldwrku );
609             dlacpy( 'f', chunk, n, &work_1( iu ), ldwrku,
610                    &a_2( i, 1 ), lda );
611           }
612 
613         } else {
614           /**
615            *                 Insufficient workspace for a fast algorithm
616            **/
617           ie = 1;
618           itauq = ie + n;
619           itaup = itauq + n;
620           iwork = itaup + n;
621           /**
622            *                 Bidiagonalize A
623            *                 (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
624            **/
625           dgebrd( m, n, a, lda, s, &work_1( ie ),
626                  &work_1( itauq ), &work_1( itaup ),
627                  &work_1( iwork ), lwork-iwork+1, &ierr );
628           /**
629            *                 Generate left vectors bidiagonalizing A
630            *                 (Workspace: need 4*N, prefer 3*N+N*NB)
631            **/
632           dorgbr( 'q', m, n, n, a, lda, &work_1( itauq ),
633                  &work_1( iwork ), lwork-iwork+1, &ierr );
634           iwork = ie + n;
635           /**
636            *                 Perform bidiagonal QR iteration, computing left
637            *                 singular vectors of A in A
638            *                 (Workspace: need 5*N-4)
639            **/
640           dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum, 1,
641                  a, lda, dum, 1, &work_1( iwork ), info );
642 
643         }
644 
645     } else if( wntuo && wntvas ) {
646         /**
647          *              Path 3 (M much larger than N, JOBU='O', JOBVT='S' or 'A')
648          *              N left singular vectors to be overwritten on A and
649          *              N right singular vectors to be computed in VT
650          **/
651         if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
652           /**
653            *                 Sufficient workspace for a fast algorithm
654            **/
655           ir = 1;
656           if( lwork>=max( wrkbl, lda*n+n )+lda*n ) {
657             /**
658              *                    WORK(IU) is LDA by N and WORK(IR) is LDA by N
659              **/
660             ldwrku = lda;
661             ldwrkr = lda;
662           } else if( lwork>=max( wrkbl, lda*n+n )+n*n ) {
663             /**
664              *                    WORK(IU) is LDA by N and WORK(IR) is N by N
665              **/
666             ldwrku = lda;
667             ldwrkr = n;
668           } else {
669             /**
670              *                    WORK(IU) is LDWRKU by N and WORK(IR) is N by N
671              **/
672             ldwrku = ( lwork-n*n-n ) / n;
673             ldwrkr = n;
674           }
675           itau = ir + ldwrkr*n;
676           iwork = itau + n;
677           /**
678            *                 Compute A=Q*R
679            *                 (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
680            **/
681           dgeqrf( m, n, a, lda, &work_1( itau ),
682                  &work_1( iwork ), lwork-iwork+1, &ierr );
683           /**
684            *                 Copy R to VT, zeroing out below it
685            **/
686           dlacpy( 'u', n, n, a, lda, vt, ldvt );
687           dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
688                  ldvt );
689           /**
690            *                 Generate Q in A
691            *                 (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
692            **/
693           dorgqr( m, n, n, a, lda, &work_1( itau ),
694                  &work_1( iwork ), lwork-iwork+1, &ierr );
695           ie = itau;
696           itauq = ie + n;
697           itaup = itauq + n;
698           iwork = itaup + n;
699           /**
700            *                 Bidiagonalize R in VT, copying result to WORK(IR)
701            *                 (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
702            **/
703           dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
704                  &work_1( itauq ), &work_1( itaup ),
705                  &work_1( iwork ), lwork-iwork+1, &ierr );
706           dlacpy( 'l', n, n, vt, ldvt, &work_1( ir ), ldwrkr );
707           /**
708            *                 Generate left vectors bidiagonalizing R in WORK(IR)
709            *                 (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
710            **/
711           dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
712                  &work_1( itauq ), &work_1( iwork ),
713                  lwork-iwork+1, &ierr );
714           /**
715            *                 Generate right vectors bidiagonalizing R in VT
716            *                 (Workspace: need N*N+4*N-1, prefer N*N+3*N+(N-1)*NB)
717            **/
718           dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
719                  &work_1( iwork ), lwork-iwork+1, &ierr );
720           iwork = ie + n;
721           /**
722            *                 Perform bidiagonal QR iteration, computing left
723            *                 singular vectors of R in WORK(IR) and computing right
724            *                 singular vectors of R in VT
725            *                 (Workspace: need N*N+5*N-4)
726            **/
727           dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt, ldvt,
728                  &work_1( ir ), ldwrkr, dum, 1,
729                  &work_1( iwork ), info );
730           iu = ie + n;
731           /**
732            *                 Multiply Q in A by left singular vectors of R in
733            *                 WORK(IR), storing result in WORK(IU) and copying to A
734            *                 (Workspace: need N*N+2*N, prefer N*N+M*N+N)
735            **/
736           for (i=1 ; ldwrku>0?i<=m:i>=m ; i+=ldwrku) {
737             chunk = min( m-i+1, ldwrku );
738             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, chunk,
739                         n, n, one, &a_2( i, 1 ), lda, &work_1( ir ), ldwrkr,
740                         zero, &work_1( iu ), ldwrku );
741             dlacpy( 'f', chunk, n, &work_1( iu ), ldwrku,
742                    &a_2( i, 1 ), lda );
743           }
744 
745         } else {
746           /**
747            *                 Insufficient workspace for a fast algorithm
748            **/
749           itau = 1;
750           iwork = itau + n;
751           /**
752            *                 Compute A=Q*R
753            *                 (Workspace: need 2*N, prefer N+N*NB)
754            **/
755           dgeqrf( m, n, a, lda, &work_1( itau ),
756                  &work_1( iwork ), lwork-iwork+1, &ierr );
757           /**
758            *                 Copy R to VT, zeroing out below it
759            **/
760           dlacpy( 'u', n, n, a, lda, vt, ldvt );
761           dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
762                  ldvt );
763           /**
764            *                 Generate Q in A
765            *                 (Workspace: need 2*N, prefer N+N*NB)
766            **/
767           dorgqr( m, n, n, a, lda, &work_1( itau ),
768                  &work_1( iwork ), lwork-iwork+1, &ierr );
769           ie = itau;
770           itauq = ie + n;
771           itaup = itauq + n;
772           iwork = itaup + n;
773           /**
774            *                 Bidiagonalize R in VT
775            *                 (Workspace: need 4*N, prefer 3*N+2*N*NB)
776            **/
777           dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
778                  &work_1( itauq ), &work_1( itaup ),
779                  &work_1( iwork ), lwork-iwork+1, &ierr );
780           /**
781            *                 Multiply Q in A by left vectors bidiagonalizing R
782            *                 (Workspace: need 3*N+M, prefer 3*N+M*NB)
783            **/
784           dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
785                  &work_1( itauq ), a, lda, &work_1( iwork ),
786                  lwork-iwork+1, &ierr );
787           /**
788            *                 Generate right vectors bidiagonalizing R in VT
789            *                 (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
790            **/
791           dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
792                  &work_1( iwork ), lwork-iwork+1, &ierr );
793           iwork = ie + n;
794           /**
795            *                 Perform bidiagonal QR iteration, computing left
796            *                 singular vectors of A in A and computing right
797            *                 singular vectors of A in VT
798            *                 (Workspace: need 5*N-4)
799            **/
800           dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt, ldvt,
801                  a, lda, dum, 1, &work_1( iwork ), info );
802 
803         }
804 
805     } else if( wntus ) {
806 
807         if( wntvn ) {
808           /**
809            *                 Path 4 (M much larger than N, JOBU='S', JOBVT='N')
810            *                 N left singular vectors to be computed in U and
811            *                 no right singular vectors to be computed
812            **/
813           if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
814             /**
815              *                    Sufficient workspace for a fast algorithm
816              **/
817             ir = 1;
818             if( lwork>=wrkbl+lda*n ) {
819               /**
820                *                       WORK(IR) is LDA by N
821                **/
822               ldwrkr = lda;
823             } else {
824               /**
825                *                       WORK(IR) is N by N
826                **/
827               ldwrkr = n;
828             }
829             itau = ir + ldwrkr*n;
830             iwork = itau + n;
831             /**
832              *                    Compute A=Q*R
833              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
834              **/
835             dgeqrf( m, n, a, lda, &work_1( itau ),
836                    &work_1( iwork ), lwork-iwork+1, &ierr );
837             /**
838              *                    Copy R to WORK(IR), zeroing out below it
839              **/
840             dlacpy( 'u', n, n, a, lda, &work_1( ir ),
841                    ldwrkr );
842             dlaset( 'l', n-1, n-1, zero, zero,
843                    &work_1( ir+1 ), ldwrkr );
844             /**
845              *                    Generate Q in A
846              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
847              **/
848             dorgqr( m, n, n, a, lda, &work_1( itau ),
849                    &work_1( iwork ), lwork-iwork+1, &ierr );
850             ie = itau;
851             itauq = ie + n;
852             itaup = itauq + n;
853             iwork = itaup + n;
854             /**
855              *                    Bidiagonalize R in WORK(IR)
856              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
857              **/
858             dgebrd( n, n, &work_1( ir ), ldwrkr, s,
859                    &work_1( ie ), &work_1( itauq ),
860                    &work_1( itaup ), &work_1( iwork ),
861                    lwork-iwork+1, &ierr );
862             /**
863              *                    Generate left vectors bidiagonalizing R in WORK(IR)
864              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
865              **/
866             dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
867                    &work_1( itauq ), &work_1( iwork ),
868                    lwork-iwork+1, &ierr );
869             iwork = ie + n;
870             /**
871              *                    Perform bidiagonal QR iteration, computing left
872              *                    singular vectors of R in WORK(IR)
873              *                    (Workspace: need N*N+5*N-4)
874              **/
875             dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum,
876                    1, &work_1( ir ), ldwrkr, dum, 1,
877                    &work_1( iwork ), info );
878             /**
879              *                    Multiply Q in A by left singular vectors of R in
880              *                    WORK(IR), storing result in U
881              *                    (Workspace: need N*N)
882              **/
883             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
884                         one, a, lda, &work_1( ir ), ldwrkr, zero, u, ldu );
885 
886           } else {
887             /**
888              *                    Insufficient workspace for a fast algorithm
889              **/
890             itau = 1;
891             iwork = itau + n;
892             /**
893              *                    Compute A=Q*R, copying result to U
894              *                    (Workspace: need 2*N, prefer N+N*NB)
895              **/
896             dgeqrf( m, n, a, lda, &work_1( itau ),
897                    &work_1( iwork ), lwork-iwork+1, &ierr );
898             dlacpy( 'l', m, n, a, lda, u, ldu );
899             /**
900              *                    Generate Q in U
901              *                    (Workspace: need 2*N, prefer N+N*NB)
902              **/
903             dorgqr( m, n, n, u, ldu, &work_1( itau ),
904                    &work_1( iwork ), lwork-iwork+1, &ierr );
905             ie = itau;
906             itauq = ie + n;
907             itaup = itauq + n;
908             iwork = itaup + n;
909             /**
910              *                    Zero out below R in A
911              **/
912             dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
913                    lda );
914             /**
915              *                    Bidiagonalize R in A
916              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
917              **/
918             dgebrd( n, n, a, lda, s, &work_1( ie ),
919                    &work_1( itauq ), &work_1( itaup ),
920                    &work_1( iwork ), lwork-iwork+1, &ierr );
921             /**
922              *                    Multiply Q in U by left vectors bidiagonalizing R
923              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
924              **/
925             dormbr( 'q', 'r', 'n', m, n, n, a, lda,
926                    &work_1( itauq ), u, ldu, &work_1( iwork ),
927                    lwork-iwork+1, &ierr );
928             iwork = ie + n;
929             /**
930              *                    Perform bidiagonal QR iteration, computing left
931              *                    singular vectors of A in U
932              *                    (Workspace: need 5*N-4)
933              **/
934             dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum,
935                    1, u, ldu, dum, 1, &work_1( iwork ),
936                    info );
937 
938           }
939 
940         } else if( wntvo ) {
941           /**
942            *                 Path 5 (M much larger than N, JOBU='S', JOBVT='O')
943            *                 N left singular vectors to be computed in U and
944            *                 N right singular vectors to be overwritten on A
945            **/
946           if( lwork>=2*n*n+max( 4*n, 5*n-4 ) ) {
947             /**
948              *                    Sufficient workspace for a fast algorithm
949              **/
950             iu = 1;
951             if( lwork>=wrkbl+2*lda*n ) {
952               /**
953                *                       WORK(IU) is LDA by N and WORK(IR) is LDA by N
954                **/
955               ldwrku = lda;
956               ir = iu + ldwrku*n;
957               ldwrkr = lda;
958             } else if( lwork>=wrkbl+( lda+n )*n ) {
959               /**
960                *                       WORK(IU) is LDA by N and WORK(IR) is N by N
961                **/
962               ldwrku = lda;
963               ir = iu + ldwrku*n;
964               ldwrkr = n;
965             } else {
966               /**
967                *                       WORK(IU) is N by N and WORK(IR) is N by N
968                **/
969               ldwrku = n;
970               ir = iu + ldwrku*n;
971               ldwrkr = n;
972             }
973             itau = ir + ldwrkr*n;
974             iwork = itau + n;
975             /**
976              *                    Compute A=Q*R
977              *                    (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
978              **/
979             dgeqrf( m, n, a, lda, &work_1( itau ),
980                    &work_1( iwork ), lwork-iwork+1, &ierr );
981             /**
982              *                    Copy R to WORK(IU), zeroing out below it
983              **/
984             dlacpy( 'u', n, n, a, lda, &work_1( iu ),
985                    ldwrku );
986             dlaset( 'l', n-1, n-1, zero, zero,
987                    &work_1( iu+1 ), ldwrku );
988             /**
989              *                    Generate Q in A
990              *                    (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
991              **/
992             dorgqr( m, n, n, a, lda, &work_1( itau ),
993                    &work_1( iwork ), lwork-iwork+1, &ierr );
994             ie = itau;
995             itauq = ie + n;
996             itaup = itauq + n;
997             iwork = itaup + n;
998             /**
999              *                    Bidiagonalize R in WORK(IU), copying result to
1000              *                    WORK(IR)
1001              *                    (Workspace: need 2*N*N+4*N,
1002              *                                prefer 2*N*N+3*N+2*N*NB)
1003              **/
1004             dgebrd( n, n, &work_1( iu ), ldwrku, s,
1005                    &work_1( ie ), &work_1( itauq ),
1006                    &work_1( itaup ), &work_1( iwork ),
1007                    lwork-iwork+1, &ierr );
1008             dlacpy( 'u', n, n, &work_1( iu ), ldwrku,
1009                    &work_1( ir ), ldwrkr );
1010             /**
1011              *                    Generate left bidiagonalizing vectors in WORK(IU)
1012              *                    (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB)
1013              **/
1014             dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1015                    &work_1( itauq ), &work_1( iwork ),
1016                    lwork-iwork+1, &ierr );
1017             /**
1018              *                    Generate right bidiagonalizing vectors in WORK(IR)
1019              *                    (Workspace: need 2*N*N+4*N-1,
1020              *                                prefer 2*N*N+3*N+(N-1)*NB)
1021              **/
1022             dorgbr( 'p', n, n, n, &work_1( ir ), ldwrkr,
1023                    &work_1( itaup ), &work_1( iwork ),
1024                    lwork-iwork+1, &ierr );
1025             iwork = ie + n;
1026             /**
1027              *                    Perform bidiagonal QR iteration, computing left
1028              *                    singular vectors of R in WORK(IU) and computing
1029              *                    right singular vectors of R in WORK(IR)
1030              *                    (Workspace: need 2*N*N+5*N-4)
1031              **/
1032             dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ),
1033                    &work_1( ir ), ldwrkr, &work_1( iu ),
1034                    ldwrku, dum, 1, &work_1( iwork ), info );
1035             /**
1036              *                    Multiply Q in A by left singular vectors of R in
1037              *                    WORK(IU), storing result in U
1038              *                    (Workspace: need N*N)
1039              **/
1040             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1041                         one, a, lda, &work_1( iu ), ldwrku, zero, u, ldu );
1042             /**
1043              *                    Copy right singular vectors of R to A
1044              *                    (Workspace: need N*N)
1045              **/
1046             dlacpy( 'f', n, n, &work_1( ir ), ldwrkr, a,
1047                    lda );
1048 
1049           } else {
1050             /**
1051              *                    Insufficient workspace for a fast algorithm
1052              **/
1053             itau = 1;
1054             iwork = itau + n;
1055             /**
1056              *                    Compute A=Q*R, copying result to U
1057              *                    (Workspace: need 2*N, prefer N+N*NB)
1058              **/
1059             dgeqrf( m, n, a, lda, &work_1( itau ),
1060                    &work_1( iwork ), lwork-iwork+1, &ierr );
1061             dlacpy( 'l', m, n, a, lda, u, ldu );
1062             /**
1063              *                    Generate Q in U
1064              *                    (Workspace: need 2*N, prefer N+N*NB)
1065              **/
1066             dorgqr( m, n, n, u, ldu, &work_1( itau ),
1067                    &work_1( iwork ), lwork-iwork+1, &ierr );
1068             ie = itau;
1069             itauq = ie + n;
1070             itaup = itauq + n;
1071             iwork = itaup + n;
1072             /**
1073              *                    Zero out below R in A
1074              **/
1075             dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1076                    lda );
1077             /**
1078              *                    Bidiagonalize R in A
1079              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
1080              **/
1081             dgebrd( n, n, a, lda, s, &work_1( ie ),
1082                    &work_1( itauq ), &work_1( itaup ),
1083                    &work_1( iwork ), lwork-iwork+1, &ierr );
1084             /**
1085              *                    Multiply Q in U by left vectors bidiagonalizing R
1086              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
1087              **/
1088             dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1089                    &work_1( itauq ), u, ldu, &work_1( iwork ),
1090                    lwork-iwork+1, &ierr );
1091             /**
1092              *                    Generate right vectors bidiagonalizing R in A
1093              *                    (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1094              **/
1095             dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1096                    &work_1( iwork ), lwork-iwork+1, &ierr );
1097             iwork = ie + n;
1098             /**
1099              *                    Perform bidiagonal QR iteration, computing left
1100              *                    singular vectors of A in U and computing right
1101              *                    singular vectors of A in A
1102              *                    (Workspace: need 5*N-4)
1103              **/
1104             dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), a,
1105                    lda, u, ldu, dum, 1, &work_1( iwork ),
1106                    info );
1107 
1108           }
1109 
1110         } else if( wntvas ) {
1111           /**
1112            *                 Path 6 (M much larger than N, JOBU='S', JOBVT='S'
1113            *                         or 'A')
1114            *                 N left singular vectors to be computed in U and
1115            *                 N right singular vectors to be computed in VT
1116            **/
1117           if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
1118             /**
1119              *                    Sufficient workspace for a fast algorithm
1120              **/
1121             iu = 1;
1122             if( lwork>=wrkbl+lda*n ) {
1123               /**
1124                *                       WORK(IU) is LDA by N
1125                **/
1126               ldwrku = lda;
1127             } else {
1128               /**
1129                *                       WORK(IU) is N by N
1130                **/
1131               ldwrku = n;
1132             }
1133             itau = iu + ldwrku*n;
1134             iwork = itau + n;
1135             /**
1136              *                    Compute A=Q*R
1137              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1138              **/
1139             dgeqrf( m, n, a, lda, &work_1( itau ),
1140                    &work_1( iwork ), lwork-iwork+1, &ierr );
1141             /**
1142              *                    Copy R to WORK(IU), zeroing out below it
1143              **/
1144             dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1145                    ldwrku );
1146             dlaset( 'l', n-1, n-1, zero, zero,
1147                    &work_1( iu+1 ), ldwrku );
1148             /**
1149              *                    Generate Q in A
1150              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1151              **/
1152             dorgqr( m, n, n, a, lda, &work_1( itau ),
1153                    &work_1( iwork ), lwork-iwork+1, &ierr );
1154             ie = itau;
1155             itauq = ie + n;
1156             itaup = itauq + n;
1157             iwork = itaup + n;
1158             /**
1159              *                    Bidiagonalize R in WORK(IU), copying result to VT
1160              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1161              **/
1162             dgebrd( n, n, &work_1( iu ), ldwrku, s,
1163                    &work_1( ie ), &work_1( itauq ),
1164                    &work_1( itaup ), &work_1( iwork ),
1165                    lwork-iwork+1, &ierr );
1166             dlacpy( 'u', n, n, &work_1( iu ), ldwrku, vt,
1167                    ldvt );
1168             /**
1169              *                    Generate left bidiagonalizing vectors in WORK(IU)
1170              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1171              **/
1172             dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1173                    &work_1( itauq ), &work_1( iwork ),
1174                    lwork-iwork+1, &ierr );
1175             /**
1176              *                    Generate right bidiagonalizing vectors in VT
1177              *                    (Workspace: need N*N+4*N-1,
1178              *                                prefer N*N+3*N+(N-1)*NB)
1179              **/
1180             dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1181                    &work_1( iwork ), lwork-iwork+1, &ierr );
1182             iwork = ie + n;
1183             /**
1184              *                    Perform bidiagonal QR iteration, computing left
1185              *                    singular vectors of R in WORK(IU) and computing
1186              *                    right singular vectors of R in VT
1187              *                    (Workspace: need N*N+5*N-4)
1188              **/
1189             dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt,
1190                    ldvt, &work_1( iu ), ldwrku, dum, 1,
1191                    &work_1( iwork ), info );
1192             /**
1193              *                    Multiply Q in A by left singular vectors of R in
1194              *                    WORK(IU), storing result in U
1195              *                    (Workspace: need N*N)
1196              **/
1197             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1198                         one, a, lda, &work_1( iu ), ldwrku, zero, u, ldu );
1199 
1200           } else {
1201             /**
1202              *                    Insufficient workspace for a fast algorithm
1203              **/
1204             itau = 1;
1205             iwork = itau + n;
1206             /**
1207              *                    Compute A=Q*R, copying result to U
1208              *                    (Workspace: need 2*N, prefer N+N*NB)
1209              **/
1210             dgeqrf( m, n, a, lda, &work_1( itau ),
1211                    &work_1( iwork ), lwork-iwork+1, &ierr );
1212             dlacpy( 'l', m, n, a, lda, u, ldu );
1213             /**
1214              *                    Generate Q in U
1215              *                    (Workspace: need 2*N, prefer N+N*NB)
1216              **/
1217             dorgqr( m, n, n, u, ldu, &work_1( itau ),
1218                    &work_1( iwork ), lwork-iwork+1, &ierr );
1219             /**
1220              *                    Copy R to VT, zeroing out below it
1221              **/
1222             dlacpy( 'u', n, n, a, lda, vt, ldvt );
1223             dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
1224                    ldvt );
1225             ie = itau;
1226             itauq = ie + n;
1227             itaup = itauq + n;
1228             iwork = itaup + n;
1229             /**
1230              *                    Bidiagonalize R in VT
1231              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
1232              **/
1233             dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
1234                    &work_1( itauq ), &work_1( itaup ),
1235                    &work_1( iwork ), lwork-iwork+1, &ierr );
1236             /**
1237              *                    Multiply Q in U by left bidiagonalizing vectors
1238              *                    in VT
1239              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
1240              **/
1241             dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
1242                    &work_1( itauq ), u, ldu, &work_1( iwork ),
1243                    lwork-iwork+1, &ierr );
1244             /**
1245              *                    Generate right bidiagonalizing vectors in VT
1246              *                    (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1247              **/
1248             dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1249                    &work_1( iwork ), lwork-iwork+1, &ierr );
1250             iwork = ie + n;
1251             /**
1252              *                    Perform bidiagonal QR iteration, computing left
1253              *                    singular vectors of A in U and computing right
1254              *                    singular vectors of A in VT
1255              *                    (Workspace: need 5*N-4)
1256              **/
1257             dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt,
1258                    ldvt, u, ldu, dum, 1, &work_1( iwork ),
1259                    info );
1260 
1261           }
1262 
1263         }
1264 
1265     } else if( wntua ) {
1266 
1267         if( wntvn ) {
1268           /**
1269            *                 Path 7 (M much larger than N, JOBU='A', JOBVT='N')
1270            *                 M left singular vectors to be computed in U and
1271            *                 no right singular vectors to be computed
1272            **/
1273           if( lwork>=n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1274             /**
1275              *                    Sufficient workspace for a fast algorithm
1276              **/
1277             ir = 1;
1278             if( lwork>=wrkbl+lda*n ) {
1279               /**
1280                *                       WORK(IR) is LDA by N
1281                **/
1282               ldwrkr = lda;
1283             } else {
1284               /**
1285                *                       WORK(IR) is N by N
1286                **/
1287               ldwrkr = n;
1288             }
1289             itau = ir + ldwrkr*n;
1290             iwork = itau + n;
1291             /**
1292              *                    Compute A=Q*R, copying result to U
1293              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1294              **/
1295             dgeqrf( m, n, a, lda, &work_1( itau ),
1296                    &work_1( iwork ), lwork-iwork+1, &ierr );
1297             dlacpy( 'l', m, n, a, lda, u, ldu );
1298             /**
1299              *                    Copy R to WORK(IR), zeroing out below it
1300              **/
1301             dlacpy( 'u', n, n, a, lda, &work_1( ir ),
1302                    ldwrkr );
1303             dlaset( 'l', n-1, n-1, zero, zero,
1304                    &work_1( ir+1 ), ldwrkr );
1305             /**
1306              *                    Generate Q in U
1307              *                    (Workspace: need N*N+N+M, prefer N*N+N+M*NB)
1308              **/
1309             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1310                    &work_1( iwork ), lwork-iwork+1, &ierr );
1311             ie = itau;
1312             itauq = ie + n;
1313             itaup = itauq + n;
1314             iwork = itaup + n;
1315             /**
1316              *                    Bidiagonalize R in WORK(IR)
1317              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1318              **/
1319             dgebrd( n, n, &work_1( ir ), ldwrkr, s,
1320                    &work_1( ie ), &work_1( itauq ),
1321                    &work_1( itaup ), &work_1( iwork ),
1322                    lwork-iwork+1, &ierr );
1323             /**
1324              *                    Generate left bidiagonalizing vectors in WORK(IR)
1325              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1326              **/
1327             dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
1328                    &work_1( itauq ), &work_1( iwork ),
1329                    lwork-iwork+1, &ierr );
1330             iwork = ie + n;
1331             /**
1332              *                    Perform bidiagonal QR iteration, computing left
1333              *                    singular vectors of R in WORK(IR)
1334              *                    (Workspace: need N*N+5*N-4)
1335              **/
1336             dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum,
1337                    1, &work_1( ir ), ldwrkr, dum, 1,
1338                    &work_1( iwork ), info );
1339             /**
1340              *                    Multiply Q in U by left singular vectors of R in
1341              *                    WORK(IR), storing result in A
1342              *                    (Workspace: need N*N)
1343              **/
1344             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1345                         one, u, ldu, &work_1( ir ), ldwrkr, zero, a, lda );
1346             /**
1347              *                    Copy left singular vectors of A from A to U
1348              **/
1349             dlacpy( 'f', m, n, a, lda, u, ldu );
1350 
1351           } else {
1352             /**
1353              *                    Insufficient workspace for a fast algorithm
1354              **/
1355             itau = 1;
1356             iwork = itau + n;
1357             /**
1358              *                    Compute A=Q*R, copying result to U
1359              *                    (Workspace: need 2*N, prefer N+N*NB)
1360              **/
1361             dgeqrf( m, n, a, lda, &work_1( itau ),
1362                    &work_1( iwork ), lwork-iwork+1, &ierr );
1363             dlacpy( 'l', m, n, a, lda, u, ldu );
1364             /**
1365              *                    Generate Q in U
1366              *                    (Workspace: need N+M, prefer N+M*NB)
1367              **/
1368             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1369                    &work_1( iwork ), lwork-iwork+1, &ierr );
1370             ie = itau;
1371             itauq = ie + n;
1372             itaup = itauq + n;
1373             iwork = itaup + n;
1374             /**
1375              *                    Zero out below R in A
1376              **/
1377             dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1378                    lda );
1379             /**
1380              *                    Bidiagonalize R in A
1381              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
1382              **/
1383             dgebrd( n, n, a, lda, s, &work_1( ie ),
1384                    &work_1( itauq ), &work_1( itaup ),
1385                    &work_1( iwork ), lwork-iwork+1, &ierr );
1386             /**
1387              *                    Multiply Q in U by left bidiagonalizing vectors
1388              *                    in A
1389              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
1390              **/
1391             dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1392                    &work_1( itauq ), u, ldu, &work_1( iwork ),
1393                    lwork-iwork+1, &ierr );
1394             iwork = ie + n;
1395             /**
1396              *                    Perform bidiagonal QR iteration, computing left
1397              *                    singular vectors of A in U
1398              *                    (Workspace: need 5*N-4)
1399              **/
1400             dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum,
1401                    1, u, ldu, dum, 1, &work_1( iwork ),
1402                    info );
1403 
1404           }
1405 
1406         } else if( wntvo ) {
1407           /**
1408            *                 Path 8 (M much larger than N, JOBU='A', JOBVT='O')
1409            *                 M left singular vectors to be computed in U and
1410            *                 N right singular vectors to be overwritten on A
1411            **/
1412           if( lwork>=2*n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1413             /**
1414              *                    Sufficient workspace for a fast algorithm
1415              **/
1416             iu = 1;
1417             if( lwork>=wrkbl+2*lda*n ) {
1418               /**
1419                *                       WORK(IU) is LDA by N and WORK(IR) is LDA by N
1420                **/
1421               ldwrku = lda;
1422               ir = iu + ldwrku*n;
1423               ldwrkr = lda;
1424             } else if( lwork>=wrkbl+( lda+n )*n ) {
1425               /**
1426                *                       WORK(IU) is LDA by N and WORK(IR) is N by N
1427                **/
1428               ldwrku = lda;
1429               ir = iu + ldwrku*n;
1430               ldwrkr = n;
1431             } else {
1432               /**
1433                *                       WORK(IU) is N by N and WORK(IR) is N by N
1434                **/
1435               ldwrku = n;
1436               ir = iu + ldwrku*n;
1437               ldwrkr = n;
1438             }
1439             itau = ir + ldwrkr*n;
1440             iwork = itau + n;
1441             /**
1442              *                    Compute A=Q*R, copying result to U
1443              *                    (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
1444              **/
1445             dgeqrf( m, n, a, lda, &work_1( itau ),
1446                    &work_1( iwork ), lwork-iwork+1, &ierr );
1447             dlacpy( 'l', m, n, a, lda, u, ldu );
1448             /**
1449              *                    Generate Q in U
1450              *                    (Workspace: need 2*N*N+N+M, prefer 2*N*N+N+M*NB)
1451              **/
1452             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1453                    &work_1( iwork ), lwork-iwork+1, &ierr );
1454             /**
1455              *                    Copy R to WORK(IU), zeroing out below it
1456              **/
1457             dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1458                    ldwrku );
1459             dlaset( 'l', n-1, n-1, zero, zero,
1460                    &work_1( iu+1 ), ldwrku );
1461             ie = itau;
1462             itauq = ie + n;
1463             itaup = itauq + n;
1464             iwork = itaup + n;
1465             /**
1466              *                    Bidiagonalize R in WORK(IU), copying result to
1467              *                    WORK(IR)
1468              *                    (Workspace: need 2*N*N+4*N,
1469              *                                prefer 2*N*N+3*N+2*N*NB)
1470              **/
1471             dgebrd( n, n, &work_1( iu ), ldwrku, s,
1472                    &work_1( ie ), &work_1( itauq ),
1473                    &work_1( itaup ), &work_1( iwork ),
1474                    lwork-iwork+1, &ierr );
1475             dlacpy( 'u', n, n, &work_1( iu ), ldwrku,
1476                    &work_1( ir ), ldwrkr );
1477             /**
1478              *                    Generate left bidiagonalizing vectors in WORK(IU)
1479              *                    (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB)
1480              **/
1481             dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1482                    &work_1( itauq ), &work_1( iwork ),
1483                    lwork-iwork+1, &ierr );
1484             /**
1485              *                    Generate right bidiagonalizing vectors in WORK(IR)
1486              *                    (Workspace: need 2*N*N+4*N-1,
1487              *                                prefer 2*N*N+3*N+(N-1)*NB)
1488              **/
1489             dorgbr( 'p', n, n, n, &work_1( ir ), ldwrkr,
1490                    &work_1( itaup ), &work_1( iwork ),
1491                    lwork-iwork+1, &ierr );
1492             iwork = ie + n;
1493             /**
1494              *                    Perform bidiagonal QR iteration, computing left
1495              *                    singular vectors of R in WORK(IU) and computing
1496              *                    right singular vectors of R in WORK(IR)
1497              *                    (Workspace: need 2*N*N+5*N-4)
1498              **/
1499             dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ),
1500                    &work_1( ir ), ldwrkr, &work_1( iu ),
1501                    ldwrku, dum, 1, &work_1( iwork ), info );
1502             /**
1503              *                    Multiply Q in U by left singular vectors of R in
1504              *                    WORK(IU), storing result in A
1505              *                    (Workspace: need N*N)
1506              **/
1507             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1508                         one, u, ldu, &work_1( iu ), ldwrku, zero, a, lda );
1509             /**
1510              *                    Copy left singular vectors of A from A to U
1511              **/
1512             dlacpy( 'f', m, n, a, lda, u, ldu );
1513             /**
1514              *                    Copy right singular vectors of R from WORK(IR) to A
1515              **/
1516             dlacpy( 'f', n, n, &work_1( ir ), ldwrkr, a,
1517                    lda );
1518 
1519           } else {
1520             /**
1521              *                    Insufficient workspace for a fast algorithm
1522              **/
1523             itau = 1;
1524             iwork = itau + n;
1525             /**
1526              *                    Compute A=Q*R, copying result to U
1527              *                    (Workspace: need 2*N, prefer N+N*NB)
1528              **/
1529             dgeqrf( m, n, a, lda, &work_1( itau ),
1530                    &work_1( iwork ), lwork-iwork+1, &ierr );
1531             dlacpy( 'l', m, n, a, lda, u, ldu );
1532             /**
1533              *                    Generate Q in U
1534              *                    (Workspace: need N+M, prefer N+M*NB)
1535              **/
1536             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1537                    &work_1( iwork ), lwork-iwork+1, &ierr );
1538             ie = itau;
1539             itauq = ie + n;
1540             itaup = itauq + n;
1541             iwork = itaup + n;
1542             /**
1543              *                    Zero out below R in A
1544              **/
1545             dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1546                    lda );
1547             /**
1548              *                    Bidiagonalize R in A
1549              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
1550              **/
1551             dgebrd( n, n, a, lda, s, &work_1( ie ),
1552                    &work_1( itauq ), &work_1( itaup ),
1553                    &work_1( iwork ), lwork-iwork+1, &ierr );
1554             /**
1555              *                    Multiply Q in U by left bidiagonalizing vectors
1556              *                    in A
1557              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
1558              **/
1559             dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1560                    &work_1( itauq ), u, ldu, &work_1( iwork ),
1561                    lwork-iwork+1, &ierr );
1562             /**
1563              *                    Generate right bidiagonalizing vectors in A
1564              *                    (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1565              **/
1566             dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1567                    &work_1( iwork ), lwork-iwork+1, &ierr );
1568             iwork = ie + n;
1569             /**
1570              *                    Perform bidiagonal QR iteration, computing left
1571              *                    singular vectors of A in U and computing right
1572              *                    singular vectors of A in A
1573              *                    (Workspace: need 5*N-4)
1574              **/
1575             dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), a,
1576                    lda, u, ldu, dum, 1, &work_1( iwork ),
1577                    info );
1578 
1579           }
1580 
1581         } else if( wntvas ) {
1582           /**
1583            *                 Path 9 (M much larger than N, JOBU='A', JOBVT='S'
1584            *                         or 'A')
1585            *                 M left singular vectors to be computed in U and
1586            *                 N right singular vectors to be computed in VT
1587            **/
1588           if( lwork>=n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1589             /**
1590              *                    Sufficient workspace for a fast algorithm
1591              **/
1592             iu = 1;
1593             if( lwork>=wrkbl+lda*n ) {
1594               /**
1595                *                       WORK(IU) is LDA by N
1596                **/
1597               ldwrku = lda;
1598             } else {
1599               /**
1600                *                       WORK(IU) is N by N
1601                **/
1602               ldwrku = n;
1603             }
1604             itau = iu + ldwrku*n;
1605             iwork = itau + n;
1606             /**
1607              *                    Compute A=Q*R, copying result to U
1608              *                    (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1609              **/
1610             dgeqrf( m, n, a, lda, &work_1( itau ),
1611                    &work_1( iwork ), lwork-iwork+1, &ierr );
1612             dlacpy( 'l', m, n, a, lda, u, ldu );
1613             /**
1614              *                    Generate Q in U
1615              *                    (Workspace: need N*N+N+M, prefer N*N+N+M*NB)
1616              **/
1617             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1618                    &work_1( iwork ), lwork-iwork+1, &ierr );
1619             /**
1620              *                    Copy R to WORK(IU), zeroing out below it
1621              **/
1622             dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1623                    ldwrku );
1624             dlaset( 'l', n-1, n-1, zero, zero,
1625                    &work_1( iu+1 ), ldwrku );
1626             ie = itau;
1627             itauq = ie + n;
1628             itaup = itauq + n;
1629             iwork = itaup + n;
1630             /**
1631              *                    Bidiagonalize R in WORK(IU), copying result to VT
1632              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1633              **/
1634             dgebrd( n, n, &work_1( iu ), ldwrku, s,
1635                    &work_1( ie ), &work_1( itauq ),
1636                    &work_1( itaup ), &work_1( iwork ),
1637                    lwork-iwork+1, &ierr );
1638             dlacpy( 'u', n, n, &work_1( iu ), ldwrku, vt,
1639                    ldvt );
1640             /**
1641              *                    Generate left bidiagonalizing vectors in WORK(IU)
1642              *                    (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1643              **/
1644             dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1645                    &work_1( itauq ), &work_1( iwork ),
1646                    lwork-iwork+1, &ierr );
1647             /**
1648              *                    Generate right bidiagonalizing vectors in VT
1649              *                    (Workspace: need N*N+4*N-1,
1650              *                                prefer N*N+3*N+(N-1)*NB)
1651              **/
1652             dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1653                    &work_1( iwork ), lwork-iwork+1, &ierr );
1654             iwork = ie + n;
1655             /**
1656              *                    Perform bidiagonal QR iteration, computing left
1657              *                    singular vectors of R in WORK(IU) and computing
1658              *                    right singular vectors of R in VT
1659              *                    (Workspace: need N*N+5*N-4)
1660              **/
1661             dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt,
1662                    ldvt, &work_1( iu ), ldwrku, dum, 1,
1663                    &work_1( iwork ), info );
1664             /**
1665              *                    Multiply Q in U by left singular vectors of R in
1666              *                    WORK(IU), storing result in A
1667              *                    (Workspace: need N*N)
1668              **/
1669             cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1670                         one, u, ldu, &work_1( iu ), ldwrku, zero, a, lda );
1671             /**
1672              *                    Copy left singular vectors of A from A to U
1673              **/
1674             dlacpy( 'f', m, n, a, lda, u, ldu );
1675 
1676           } else {
1677             /**
1678              *                    Insufficient workspace for a fast algorithm
1679              **/
1680             itau = 1;
1681             iwork = itau + n;
1682             /**
1683              *                    Compute A=Q*R, copying result to U
1684              *                    (Workspace: need 2*N, prefer N+N*NB)
1685              **/
1686             dgeqrf( m, n, a, lda, &work_1( itau ),
1687                    &work_1( iwork ), lwork-iwork+1, &ierr );
1688             dlacpy( 'l', m, n, a, lda, u, ldu );
1689             /**
1690              *                    Generate Q in U
1691              *                    (Workspace: need N+M, prefer N+M*NB)
1692              **/
1693             dorgqr( m, m, n, u, ldu, &work_1( itau ),
1694                    &work_1( iwork ), lwork-iwork+1, &ierr );
1695             /**
1696              *                    Copy R from A to VT, zeroing out below it
1697              **/
1698             dlacpy( 'u', n, n, a, lda, vt, ldvt );
1699             dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
1700                    ldvt );
1701             ie = itau;
1702             itauq = ie + n;
1703             itaup = itauq + n;
1704             iwork = itaup + n;
1705             /**
1706              *                    Bidiagonalize R in VT
1707              *                    (Workspace: need 4*N, prefer 3*N+2*N*NB)
1708              **/
1709             dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
1710                    &work_1( itauq ), &work_1( itaup ),
1711                    &work_1( iwork ), lwork-iwork+1, &ierr );
1712             /**
1713              *                    Multiply Q in U by left bidiagonalizing vectors
1714              *                    in VT
1715              *                    (Workspace: need 3*N+M, prefer 3*N+M*NB)
1716              **/
1717             dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
1718                    &work_1( itauq ), u, ldu, &work_1( iwork ),
1719                    lwork-iwork+1, &ierr );
1720             /**
1721              *                    Generate right bidiagonalizing vectors in VT
1722              *                    (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1723              **/
1724             dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1725                    &work_1( iwork ), lwork-iwork+1, &ierr );
1726             iwork = ie + n;
1727             /**
1728              *                    Perform bidiagonal QR iteration, computing left
1729              *                    singular vectors of A in U and computing right
1730              *                    singular vectors of A in VT
1731              *                    (Workspace: need 5*N-4)
1732              **/
1733             dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt,
1734                    ldvt, u, ldu, dum, 1, &work_1( iwork ),
1735                    info );
1736 
1737           }
1738 
1739         }
1740 
1741     }
1742 
1743   } else {
1744     /**
1745      *           M .LT. MNTHR
1746      *
1747      *           Path 10 (M at least N, but not much larger)
1748      *           Reduce to bidiagonal form without QR decomposition
1749      **/
1750     ie = 1;
1751     itauq = ie + n;
1752     itaup = itauq + n;
1753     iwork = itaup + n;
1754     /**
1755      *           Bidiagonalize A
1756      *           (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
1757      **/
1758     dgebrd( m, n, a, lda, s, &work_1( ie ), &work_1( itauq ),
1759              &work_1( itaup ), &work_1( iwork ), lwork-iwork+1,
1760              &ierr );
1761     if( wntuas ) {
1762         /**
1763          *              If left singular vectors desired in U, copy result to U
1764          *              and generate left bidiagonalizing vectors in U
1765          *              (Workspace: need 3*N+NCU, prefer 3*N+NCU*NB)
1766          **/
1767         dlacpy( 'l', m, n, a, lda, u, ldu );
1768         if( wntus )
1769           ncu = n;
1770         if( wntua )
1771           ncu = m;
1772         dorgbr( 'q', m, ncu, n, u, ldu, &work_1( itauq ),
1773                &work_1( iwork ), lwork-iwork+1, &ierr );
1774     }
1775     if( wntvas ) {
1776         /**
1777          *              If right singular vectors desired in VT, copy result to
1778          *              VT and generate right bidiagonalizing vectors in VT
1779          *              (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1780          **/
1781         dlacpy( 'u', n, n, a, lda, vt, ldvt );
1782         dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1783                &work_1( iwork ), lwork-iwork+1, &ierr );
1784     }
1785     if( wntuo ) {
1786         /**
1787          *              If left singular vectors desired in A, generate left
1788          *              bidiagonalizing vectors in A
1789          *              (Workspace: need 4*N, prefer 3*N+N*NB)
1790          **/
1791         dorgbr( 'q', m, n, n, a, lda, &work_1( itauq ),
1792                &work_1( iwork ), lwork-iwork+1, &ierr );
1793     }
1794     if( wntvo ) {
1795         /**
1796          *              If right singular vectors desired in A, generate right
1797          *              bidiagonalizing vectors in A
1798          *              (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1799          **/
1800         dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1801                &work_1( iwork ), lwork-iwork+1, &ierr );
1802     }
1803     iwork = ie + n;
1804     if( wntuas || wntuo )
1805         nru = m;
1806     if( wntun )
1807         nru = 0;
1808     if( wntvas || wntvo )
1809         ncvt = n;
1810     if( wntvn )
1811         ncvt = 0;
1812     if( ( !wntuo ) && ( !wntvo ) ) {
1813         /**
1814          *              Perform bidiagonal QR iteration, if desired, computing
1815          *              left singular vectors in U and computing right singular
1816          *              vectors in VT
1817          *              (Workspace: need 5*N-4)
1818          **/
1819         dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), vt,
1820                ldvt, u, ldu, dum, 1, &work_1( iwork ), info );
1821     } else if( ( !wntuo ) && wntvo ) {
1822         /**
1823          *              Perform bidiagonal QR iteration, if desired, computing
1824          *              left singular vectors in U and computing right singular
1825          *              vectors in A
1826          *              (Workspace: need 5*N-4)
1827          **/
1828         dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), a, lda,
1829                u, ldu, dum, 1, &work_1( iwork ), info );
1830     } else {
1831         /**
1832          *              Perform bidiagonal QR iteration, if desired, computing
1833          *              left singular vectors in A and computing right singular
1834          *              vectors in VT
1835          *              (Workspace: need 5*N-4)
1836          **/
1837         dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), vt,
1838                ldvt, a, lda, dum, 1, &work_1( iwork ), info );
1839     }
1840 
1841   }
1842 
1843   /**
1844    *     If DBDSQR failed to converge, copy unconverged superdiagonals
1845    *     to WORK( 2:MINMN )
1846    **/
1847   if( *info!=0 ) {
1848     if( ie>2 ) {
1849       for (i=1 ; i<=minmn - 1 ; i+=1) {
1850         work_1( i+1 ) = work_1( i+ie-1 );
1851       }
1852     }
1853     if( ie<2 ) {
1854       for (i=minmn - 1 ; i>=1 ; i+=-1) {
1855         work_1( i+1 ) = work_1( i+ie-1 );
1856       }
1857     }
1858   }
1859   /**
1860    *     Undo scaling if necessary
1861    **/
1862   if( iscl==1 ) {
1863     if( anrm>bignum )
1864       dlascl( 'g', 0, 0, bignum, anrm, minmn, 1, s, minmn,
1865              &ierr );
1866     if( *info!=0 && anrm>bignum )
1867       dlascl( 'g', 0, 0, bignum, anrm, minmn-1, 1, &work_1( 2 ),
1868              minmn, &ierr );
1869     if( anrm<smlnum )
1870       dlascl( 'g', 0, 0, smlnum, anrm, minmn, 1, s, minmn,
1871              &ierr );
1872     if( *info!=0 && anrm<smlnum )
1873       dlascl( 'g', 0, 0, smlnum, anrm, minmn-1, 1, &work_1( 2 ),
1874              minmn, &ierr );
1875   }
1876   /**
1877    *     Return optimal workspace in WORK(1)
1878    **/
1879   work_1( 1 ) = maxwrk;
1880 
1881   return;
1882   /**
1883    *     End of DGESVD
1884    **/
1885 }
1886