1 /*
2 * $Id: dgesvd.c,v 1.1 2005-09-18 22:04:48 dhmunro Exp $
3 * LAPACK routine to return the SVD of a matrix (m>=n branch only).
4 */
5 /* Copyright (c) 2005, The Regents of the University of California.
6 * All rights reserved.
7 * This file is part of yorick (http://yorick.sourceforge.net).
8 * Read the accompanying LICENSE file for details.
9 */
10
11 #include "dg.h"
12
13 /* Compilers have a hard time with this ridiculously large driver routine.
14 Therefore, I split it into a separate m>=n and m<n branch. The m<n
15 branch is in the file dgesv2.c. Here is its prototype: */
16 extern void dgesv2( char jobu, char jobvt, long m, long n,
17 double a[], long lda, double s[], double u[], long ldu,
18 double vt[], long ldvt,
19 double work[], long lwork, long *info );
20
21
22
23 /*---blas routines---*/
24 /* dgemm */
25
26
27
28 /*---converted nutty string switches to single characters (lower case)---*/
29 #define lsame(x,y) ((x)==(y))
30
31
32 /*-----Fortran intrinsics converted-----*/
33 extern double sqrt(double);
34 #define min(x,y) ((x)<(y)?(x):(y))
35 #define max(x,y) ((x)>(y)?(x):(y))
36 /*-----end of Fortran intrinsics-----*/
37
38
39
dgesvd(char jobu,char jobvt,long m,long n,double a[],long lda,double s[],double u[],long ldu,double vt[],long ldvt,double work[],long lwork,long * info)40 void dgesvd( char jobu, char jobvt, long m, long n,
41 double a[], long lda, double s[], double u[], long ldu,
42 double vt[], long ldvt,
43 double work[], long lwork, long *info )
44 {
45 /**
46 * -- LAPACK driver routine (version 1.1) --
47 * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
48 * Courant Institute, Argonne National Lab, and Rice University
49 * March 31, 1993
50 *
51 * .. Scalar Arguments ..*/
52 /** ..
53 * .. Array Arguments ..*/
54 #undef work_1
55 #define work_1(a1) work[a1-1]
56 #undef vt_2
57 #define vt_2(a1,a2) vt[a1-1+ldvt*(a2-1)]
58 #undef u_2
59 #define u_2(a1,a2) u[a1-1+ldu*(a2-1)]
60 #undef s_1
61 #define s_1(a1) s[a1-1]
62 #undef a_2
63 #define a_2(a1,a2) a[a1-1+lda*(a2-1)]
64 /** ..
65 *
66 * Purpose
67 * =======
68 *
69 * DGESVD computes the singular value decomposition (SVD) of a real
70 * M-by-N matrix A, optionally computing the left and/or right singular
71 * vectors. The SVD is written
72 *
73 * A = U * SIGMA * transpose(V)
74 *
75 * where SIGMA is an M-by-N matrix which is zero except for its
76 * min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
77 * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA
78 * are the singular values of A; they are real and non-negative, and
79 * are returned in descending order. The first min(m,n) columns of
80 * U and V are the left and right singular vectors of A.
81 *
82 * Note that the routine returns V**T, not V.
83 *
84 * Arguments
85 * =========
86 *
87 * JOBU (input) CHARACTER*1
88 * Specifies options for computing all or part of the matrix U:
89 * = 'A': all M columns of U are returned in array U:
90 * = 'S': the first min(m,n) columns of U (the left singular
91 * vectors) are returned in the array U;
92 * = 'O': the first min(m,n) columns of U (the left singular
93 * vectors) are overwritten on the array A;
94 * = 'N': no columns of U (no left singular vectors) are
95 * computed.
96 *
97 * JOBVT (input) CHARACTER*1
98 * Specifies options for computing all or part of the matrix
99 * V**T:
100 * = 'A': all N rows of V**T are returned in the array VT;
101 * = 'S': the first min(m,n) rows of V**T (the right singular
102 * vectors) are returned in the array VT;
103 * = 'O': the first min(m,n) rows of V**T (the right singular
104 * vectors) are overwritten on the array A;
105 * = 'N': no rows of V**T (no right singular vectors) are
106 * computed.
107 *
108 * JOBVT and JOBU cannot both be 'O'.
109 *
110 * M (input) INTEGER
111 * The number of rows of the input matrix A. M >= 0.
112 *
113 * N (input) INTEGER
114 * The number of columns of the input matrix A. N >= 0.
115 *
116 * A (input/output) DOUBLE PRECISION array, dimension (LDA,N)
117 * On entry, the M-by-N matrix A.
118 * On exit,
119 * if JOBU = 'O', A is overwritten with the first min(m,n)
120 * columns of U (the left singular vectors,
121 * stored columnwise);
122 * if JOBVT = 'O', A is overwritten with the first min(m,n)
123 * rows of V**T (the right singular vectors,
124 * stored rowwise);
125 * if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A
126 * are destroyed.
127 *
128 * LDA (input) INTEGER
129 * The leading dimension of the array A. LDA >= max(1,M).
130 *
131 * S (output) DOUBLE PRECISION array, dimension (min(M,N))
132 * The singular values of A, sorted so that S(i) >= S(i+1).
133 *
134 * U (output) DOUBLE PRECISION array, dimension (LDU,UCOL)
135 * (LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'.
136 * If JOBU = 'A', U contains the M-by-M orthogonal matrix U;
137 * if JOBU = 'S', U contains the first min(m,n) columns of U
138 * (the left singular vectors, stored columnwise);
139 * if JOBU = 'N' or 'O', U is not referenced.
140 *
141 * LDU (input) INTEGER
142 * The leading dimension of the array U. LDU >= 1; if
143 * JOBU = 'S' or 'A', LDU >= M.
144 *
145 * VT (output) DOUBLE PRECISION array, dimension (LDVT,N)
146 * If JOBVT = 'A', VT contains the N-by-N orthogonal matrix
147 * V**T;
148 * if JOBVT = 'S', VT contains the first min(m,n) rows of
149 * V**T (the right singular vectors, stored rowwise);
150 * if JOBVT = 'N' or 'O', VT is not referenced.
151 *
152 * LDVT (input) INTEGER
153 * The leading dimension of the array VT. LDVT >= 1; if
154 * JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N).
155 *
156 * WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK)
157 * On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
158 * if INFO > 0, WORK(2:MIN(M,N)) contains the unconverged
159 * superdiagonal elements of an upper bidiagonal matrix B
160 * whose diagonal is in S (not necessarily sorted). B
161 * satisfies A = U * B * VT, so it has the same singular values
162 * as A, and singular vectors related by U and VT.
163 *
164 * LWORK (input) INTEGER
165 * The dimension of the array WORK. LWORK >= 1.
166 * LWORK >= MAX(3*MIN(M,N)+MAX(M,N),5*MIN(M,N)-4).
167 * For good performance, LWORK should generally be larger.
168 *
169 * INFO (output) INTEGER
170 * = 0: successful exit.
171 * < 0: if INFO = -i, the i-th argument had an illegal value.
172 * > 0: if DBDSQR did not converge, INFO specifies how many
173 * superdiagonals of an intermediate bidiagonal form B
174 * did not converge to zero. See the description of WORK
175 * above for details.
176 *
177 * =====================================================================
178 *
179 * .. Parameters ..*/
180 #undef zero
181 #define zero 0.0e0
182 #undef one
183 #define one 1.0e0
184 /** ..
185 * .. Local Scalars ..*/
186 int wntua, wntuas, wntun, wntuo, wntus, wntva,
187 wntvas, wntvn, wntvo, wntvs;
188 long chunk, i, ie=0, ierr, ir, iscl, itau, itaup,
189 itauq, iu, iwork, ldwrkr, ldwrku, maxwrk=0,
190 minmn, minwrk, mnthr, ncu=0, ncvt=0, nru=0,
191 wrkbl=0;
192 double anrm, bignum, eps, smlnum;
193 char job_u_vt[3];
194 /** ..
195 * .. Local Arrays ..*/
196 double dum[1];
197 #undef dum_1
198 #define dum_1(a1) [a1-1]
199 /** ..
200 * .. Intrinsic Functions ..*/
201 /* intrinsic max, min, sqrt;*/
202 /** ..
203 * .. Executable Statements ..
204 *
205 * Test the input arguments
206 **/
207 /*-----implicit-declarations-----*/
208 /*-----end-of-declarations-----*/
209 if (m<n) {
210 dgesv2( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
211 work, lwork, info );
212 return;
213 }
214
215 *info = 0;
216 minmn = min( m, n );
217 job_u_vt[0]= jobu;
218 job_u_vt[1]= jobvt;
219 job_u_vt[2]= '\0';
220 mnthr = ilaenv( 6, "dgesvd", job_u_vt, m, n, 0, 0 );
221 wntua = lsame( jobu, 'a' );
222 wntus = lsame( jobu, 's' );
223 wntuas = wntua || wntus;
224 wntuo = lsame( jobu, 'o' );
225 wntun = lsame( jobu, 'n' );
226 wntva = lsame( jobvt, 'a' );
227 wntvs = lsame( jobvt, 's' );
228 wntvas = wntva || wntvs;
229 wntvo = lsame( jobvt, 'o' );
230 wntvn = lsame( jobvt, 'n' );
231 minwrk = 1;
232
233 if( !( wntua || wntus || wntuo || wntun ) ) {
234 *info = -1;
235 } else if( !( wntva || wntvs || wntvo || wntvn ) ||
236 ( wntvo && wntuo ) ) {
237 *info = -2;
238 } else if( m<0 ) {
239 *info = -3;
240 } else if( n<0 ) {
241 *info = -4;
242 } else if( lda<max( 1, m ) ) {
243 *info = -6;
244 } else if( ldu<1 || ( wntuas && ldu<m ) ) {
245 *info = -9;
246 } else if( ldvt<1 || ( wntva && ldvt<n ) ||
247 ( wntvs && ldvt<minmn ) ) {
248 *info = -11;
249 }
250 /**
251 * Compute workspace
252 * (Note: Comments in the code beginning "Workspace:" describe the
253 * minimal amount of workspace needed at that point in the code,
254 * as well as the preferred amount for good performance.
255 * NB refers to the optimal block size for the immediately
256 * following subroutine, as returned by ILAENV.)
257 **/
258 if( *info==0 && lwork>=1 && m>0 && n>0 ) {
259 if( m>=mnthr ) {
260 if( wntun ) {
261 /**
262 * Path 1 (M much larger than N, JOBU='N')
263 **/
264 maxwrk = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1,
265 -1 );
266 maxwrk = max( maxwrk, 3*n+2*n*
267 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
268 if( wntvo || wntvas )
269 maxwrk = max( maxwrk, 3*n+( n-1 )*
270 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
271 maxwrk = max( maxwrk, 5*n-4 );
272 minwrk = max( 4*n, 5*n-4 );
273 } else if( wntuo && wntvn ) {
274 /**
275 * Path 2 (M much larger than N, JOBU='O', JOBVT='N')
276 **/
277 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
278 wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
279 n, n, -1 ) );
280 wrkbl = max( wrkbl, 3*n+2*n*
281 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
282 wrkbl = max( wrkbl, 3*n+n*
283 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
284 wrkbl = max( wrkbl, 5*n-4 );
285 maxwrk = max( n*n+wrkbl, n*n+m*n+n );
286 minwrk = max( 3*n+m, 5*n-4 );
287 minwrk = min( minwrk, maxwrk );
288 } else if( wntuo && wntvas ) {
289 /**
290 * Path 3 (M much larger than N, JOBU='O', JOBVT='S' or
291 * 'A')
292 **/
293 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
294 wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
295 n, n, -1 ) );
296 wrkbl = max( wrkbl, 3*n+2*n*
297 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
298 wrkbl = max( wrkbl, 3*n+n*
299 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
300 wrkbl = max( wrkbl, 3*n+( n-1 )*
301 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
302 wrkbl = max( wrkbl, 5*n-4 );
303 maxwrk = max( n*n+wrkbl, n*n+m*n+n );
304 minwrk = max( 3*n+m, 5*n-4 );
305 minwrk = min( minwrk, maxwrk );
306 } else if( wntus && wntvn ) {
307 /**
308 * Path 4 (M much larger than N, JOBU='S', JOBVT='N')
309 **/
310 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
311 wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
312 n, n, -1 ) );
313 wrkbl = max( wrkbl, 3*n+2*n*
314 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
315 wrkbl = max( wrkbl, 3*n+n*
316 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
317 wrkbl = max( wrkbl, 5*n-4 );
318 maxwrk = n*n + wrkbl;
319 minwrk = max( 3*n+m, 5*n-4 );
320 minwrk = min( minwrk, maxwrk );
321 } else if( wntus && wntvo ) {
322 /**
323 * Path 5 (M much larger than N, JOBU='S', JOBVT='O')
324 **/
325 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
326 wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
327 n, n, -1 ) );
328 wrkbl = max( wrkbl, 3*n+2*n*
329 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
330 wrkbl = max( wrkbl, 3*n+n*
331 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
332 wrkbl = max( wrkbl, 3*n+( n-1 )*
333 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
334 wrkbl = max( wrkbl, 5*n-4 );
335 maxwrk = 2*n*n + wrkbl;
336 minwrk = max( 3*n+m, 5*n-4 );
337 minwrk = min( minwrk, maxwrk );
338 } else if( wntus && wntvas ) {
339 /**
340 * Path 6 (M much larger than N, JOBU='S', JOBVT='S' or
341 * 'A')
342 **/
343 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
344 wrkbl = max( wrkbl, n+n*ilaenv( 1, "dorgqr", " ", m,
345 n, n, -1 ) );
346 wrkbl = max( wrkbl, 3*n+2*n*
347 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
348 wrkbl = max( wrkbl, 3*n+n*
349 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
350 wrkbl = max( wrkbl, 3*n+( n-1 )*
351 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
352 wrkbl = max( wrkbl, 5*n-4 );
353 maxwrk = n*n + wrkbl;
354 minwrk = max( 3*n+m, 5*n-4 );
355 minwrk = min( minwrk, maxwrk );
356 } else if( wntua && wntvn ) {
357 /**
358 * Path 7 (M much larger than N, JOBU='A', JOBVT='N')
359 **/
360 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
361 wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
362 m, n, -1 ) );
363 wrkbl = max( wrkbl, 3*n+2*n*
364 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
365 wrkbl = max( wrkbl, 3*n+n*
366 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
367 wrkbl = max( wrkbl, 5*n-4 );
368 maxwrk = n*n + wrkbl;
369 minwrk = max( 3*n+m, 5*n-4 );
370 minwrk = min( minwrk, maxwrk );
371 } else if( wntua && wntvo ) {
372 /**
373 * Path 8 (M much larger than N, JOBU='A', JOBVT='O')
374 **/
375 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
376 wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
377 m, n, -1 ) );
378 wrkbl = max( wrkbl, 3*n+2*n*
379 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
380 wrkbl = max( wrkbl, 3*n+n*
381 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
382 wrkbl = max( wrkbl, 3*n+( n-1 )*
383 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
384 wrkbl = max( wrkbl, 5*n-4 );
385 maxwrk = 2*n*n + wrkbl;
386 minwrk = max( 3*n+m, 5*n-4 );
387 minwrk = min( minwrk, maxwrk );
388 } else if( wntua && wntvas ) {
389 /**
390 * Path 9 (M much larger than N, JOBU='A', JOBVT='S' or
391 * 'A')
392 **/
393 wrkbl = n + n*ilaenv( 1, "dgeqrf", " ", m, n, -1, -1 );
394 wrkbl = max( wrkbl, n+m*ilaenv( 1, "dorgqr", " ", m,
395 m, n, -1 ) );
396 wrkbl = max( wrkbl, 3*n+2*n*
397 ilaenv( 1, "dgebrd", " ", n, n, -1, -1 ) );
398 wrkbl = max( wrkbl, 3*n+n*
399 ilaenv( 1, "dorgbr", "q", n, n, n, -1 ) );
400 wrkbl = max( wrkbl, 3*n+( n-1 )*
401 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
402 wrkbl = max( wrkbl, 5*n-4 );
403 maxwrk = n*n + wrkbl;
404 minwrk = max( 3*n+m, 5*n-4 );
405 minwrk = min( minwrk, maxwrk );
406 }
407 } else {
408 /**
409 * Path 10 (M at least N, but not much larger)
410 **/
411 maxwrk = 3*n + ( m+n )*ilaenv( 1, "dgebrd", " ", m, n,
412 -1, -1 );
413 if( wntus || wntuo )
414 maxwrk = max( maxwrk, 3*n+n*
415 ilaenv( 1, "dorgbr", "q", m, n, n, -1 ) );
416 if( wntua )
417 maxwrk = max( maxwrk, 3*n+m*
418 ilaenv( 1, "dorgbr", "q", m, m, n, -1 ) );
419 if( !wntvn )
420 maxwrk = max( maxwrk, 3*n+( n-1 )*
421 ilaenv( 1, "dorgbr", "p", n, n, n, -1 ) );
422 maxwrk = max( maxwrk, 5*n-4 );
423 minwrk = max( 3*n+m, 5*n-4 );
424 }
425 work_1( 1 ) = maxwrk;
426 }
427
428 if( lwork<minwrk ) {
429 *info = -13;
430 }
431 if( *info!=0 ) {
432 xerbla( "dgesvd", -*info );
433 return;
434 }
435 /**
436 * Quick return if possible
437 **/
438 if( m==0 || n==0 ) {
439 if( lwork>=1 )
440 work_1( 1 ) = one;
441 return;
442 }
443 /**
444 * Get machine constants
445 **/
446 eps = dlamch( 'p' );
447 smlnum = sqrt( dlamch( 's' ) ) / eps;
448 bignum = one / smlnum;
449 /**
450 * Scale A if max entry outside range [SMLNUM,BIGNUM]
451 **/
452 anrm = dlange( 'm', m, n, a, lda, dum );
453 iscl = 0;
454 if( anrm>zero && anrm<smlnum ) {
455 iscl = 1;
456 dlascl( 'g', 0, 0, anrm, smlnum, m, n, a, lda, &ierr );
457 } else if( anrm>bignum ) {
458 iscl = 1;
459 dlascl( 'g', 0, 0, anrm, bignum, m, n, a, lda, &ierr );
460 }
461
462 /**
463 * A has at least as many rows as columns. If A has sufficiently
464 * more rows than columns, first reduce using the QR
465 * decomposition (if sufficient workspace available)
466 **/
467 if( m>=mnthr ) {
468
469 if( wntun ) {
470 /**
471 * Path 1 (M much larger than N, JOBU='N')
472 * No left singular vectors to be computed
473 **/
474 itau = 1;
475 iwork = itau + n;
476 /**
477 * Compute A=Q*R
478 * (Workspace: need 2*N, prefer N+N*NB)
479 **/
480 dgeqrf( m, n, a, lda, &work_1( itau ), &work_1( iwork ),
481 lwork-iwork+1, &ierr );
482 /**
483 * Zero out below R
484 **/
485 dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ), lda );
486 ie = 1;
487 itauq = ie + n;
488 itaup = itauq + n;
489 iwork = itaup + n;
490 /**
491 * Bidiagonalize R in A
492 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
493 **/
494 dgebrd( n, n, a, lda, s, &work_1( ie ), &work_1( itauq ),
495 &work_1( itaup ), &work_1( iwork ), lwork-iwork+1,
496 &ierr );
497 ncvt = 0;
498 if( wntvo || wntvas ) {
499 /**
500 * If right singular vectors desired, generate P'.
501 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
502 **/
503 dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
504 &work_1( iwork ), lwork-iwork+1, &ierr );
505 ncvt = n;
506 }
507 iwork = ie + n;
508 /**
509 * Perform bidiagonal QR iteration, computing right
510 * singular vectors of A in A if desired
511 * (Workspace: need 5*N-4)
512 **/
513 dbdsqr( 'u', n, ncvt, 0, 0, s, &work_1( ie ), a, lda,
514 dum, 1, dum, 1, &work_1( iwork ), info );
515 /**
516 * If right singular vectors desired in VT, copy them there
517 **/
518 if( wntvas )
519 dlacpy( 'f', n, n, a, lda, vt, ldvt );
520
521 } else if( wntuo && wntvn ) {
522 /**
523 * Path 2 (M much larger than N, JOBU='O', JOBVT='N')
524 * N left singular vectors to be overwritten on A and
525 * no right singular vectors to be computed
526 **/
527 if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
528 /**
529 * Sufficient workspace for a fast algorithm
530 **/
531 ir = 1;
532 if( lwork>=max( wrkbl, lda*n+n )+lda*n ) {
533 /**
534 * WORK(IU) is LDA by N, WORK(IR) is LDA by N
535 **/
536 ldwrku = lda;
537 ldwrkr = lda;
538 } else if( lwork>=max( wrkbl, lda*n+n )+n*n ) {
539 /**
540 * WORK(IU) is LDA by N, WORK(IR) is N by N
541 **/
542 ldwrku = lda;
543 ldwrkr = n;
544 } else {
545 /**
546 * WORK(IU) is LDWRKU by N, WORK(IR) is N by N
547 **/
548 ldwrku = ( lwork-n*n-n ) / n;
549 ldwrkr = n;
550 }
551 itau = ir + ldwrkr*n;
552 iwork = itau + n;
553 /**
554 * Compute A=Q*R
555 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
556 **/
557 dgeqrf( m, n, a, lda, &work_1( itau ),
558 &work_1( iwork ), lwork-iwork+1, &ierr );
559 /**
560 * Copy R to WORK(IR) and zero out below it
561 **/
562 dlacpy( 'u', n, n, a, lda, &work_1( ir ), ldwrkr );
563 dlaset( 'l', n-1, n-1, zero, zero, &work_1( ir+1 ),
564 ldwrkr );
565 /**
566 * Generate Q in A
567 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
568 **/
569 dorgqr( m, n, n, a, lda, &work_1( itau ),
570 &work_1( iwork ), lwork-iwork+1, &ierr );
571 ie = itau;
572 itauq = ie + n;
573 itaup = itauq + n;
574 iwork = itaup + n;
575 /**
576 * Bidiagonalize R in WORK(IR)
577 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
578 **/
579 dgebrd( n, n, &work_1( ir ), ldwrkr, s, &work_1( ie ),
580 &work_1( itauq ), &work_1( itaup ),
581 &work_1( iwork ), lwork-iwork+1, &ierr );
582 /**
583 * Generate left vectors bidiagonalizing R
584 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
585 **/
586 dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
587 &work_1( itauq ), &work_1( iwork ),
588 lwork-iwork+1, &ierr );
589 iwork = ie + n;
590 /**
591 * Perform bidiagonal QR iteration, computing left
592 * singular vectors of R in WORK(IR)
593 * (Workspace: need N*N+5*N-4)
594 **/
595 dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum, 1,
596 &work_1( ir ), ldwrkr, dum, 1,
597 &work_1( iwork ), info );
598 iu = ie + n;
599 /**
600 * Multiply Q in A by left singular vectors of R in
601 * WORK(IR), storing result in WORK(IU) and copying to A
602 * (Workspace: need N*N+2*N, prefer N*N+M*N+N)
603 **/
604 for (i=1 ; ldwrku>0?i<=m:i>=m ; i+=ldwrku) {
605 chunk = min( m-i+1, ldwrku );
606 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, chunk,
607 n, n, one, &a_2( i, 1 ), lda, &work_1( ir ), ldwrkr,
608 zero, &work_1( iu ), ldwrku );
609 dlacpy( 'f', chunk, n, &work_1( iu ), ldwrku,
610 &a_2( i, 1 ), lda );
611 }
612
613 } else {
614 /**
615 * Insufficient workspace for a fast algorithm
616 **/
617 ie = 1;
618 itauq = ie + n;
619 itaup = itauq + n;
620 iwork = itaup + n;
621 /**
622 * Bidiagonalize A
623 * (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
624 **/
625 dgebrd( m, n, a, lda, s, &work_1( ie ),
626 &work_1( itauq ), &work_1( itaup ),
627 &work_1( iwork ), lwork-iwork+1, &ierr );
628 /**
629 * Generate left vectors bidiagonalizing A
630 * (Workspace: need 4*N, prefer 3*N+N*NB)
631 **/
632 dorgbr( 'q', m, n, n, a, lda, &work_1( itauq ),
633 &work_1( iwork ), lwork-iwork+1, &ierr );
634 iwork = ie + n;
635 /**
636 * Perform bidiagonal QR iteration, computing left
637 * singular vectors of A in A
638 * (Workspace: need 5*N-4)
639 **/
640 dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum, 1,
641 a, lda, dum, 1, &work_1( iwork ), info );
642
643 }
644
645 } else if( wntuo && wntvas ) {
646 /**
647 * Path 3 (M much larger than N, JOBU='O', JOBVT='S' or 'A')
648 * N left singular vectors to be overwritten on A and
649 * N right singular vectors to be computed in VT
650 **/
651 if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
652 /**
653 * Sufficient workspace for a fast algorithm
654 **/
655 ir = 1;
656 if( lwork>=max( wrkbl, lda*n+n )+lda*n ) {
657 /**
658 * WORK(IU) is LDA by N and WORK(IR) is LDA by N
659 **/
660 ldwrku = lda;
661 ldwrkr = lda;
662 } else if( lwork>=max( wrkbl, lda*n+n )+n*n ) {
663 /**
664 * WORK(IU) is LDA by N and WORK(IR) is N by N
665 **/
666 ldwrku = lda;
667 ldwrkr = n;
668 } else {
669 /**
670 * WORK(IU) is LDWRKU by N and WORK(IR) is N by N
671 **/
672 ldwrku = ( lwork-n*n-n ) / n;
673 ldwrkr = n;
674 }
675 itau = ir + ldwrkr*n;
676 iwork = itau + n;
677 /**
678 * Compute A=Q*R
679 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
680 **/
681 dgeqrf( m, n, a, lda, &work_1( itau ),
682 &work_1( iwork ), lwork-iwork+1, &ierr );
683 /**
684 * Copy R to VT, zeroing out below it
685 **/
686 dlacpy( 'u', n, n, a, lda, vt, ldvt );
687 dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
688 ldvt );
689 /**
690 * Generate Q in A
691 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
692 **/
693 dorgqr( m, n, n, a, lda, &work_1( itau ),
694 &work_1( iwork ), lwork-iwork+1, &ierr );
695 ie = itau;
696 itauq = ie + n;
697 itaup = itauq + n;
698 iwork = itaup + n;
699 /**
700 * Bidiagonalize R in VT, copying result to WORK(IR)
701 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
702 **/
703 dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
704 &work_1( itauq ), &work_1( itaup ),
705 &work_1( iwork ), lwork-iwork+1, &ierr );
706 dlacpy( 'l', n, n, vt, ldvt, &work_1( ir ), ldwrkr );
707 /**
708 * Generate left vectors bidiagonalizing R in WORK(IR)
709 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
710 **/
711 dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
712 &work_1( itauq ), &work_1( iwork ),
713 lwork-iwork+1, &ierr );
714 /**
715 * Generate right vectors bidiagonalizing R in VT
716 * (Workspace: need N*N+4*N-1, prefer N*N+3*N+(N-1)*NB)
717 **/
718 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
719 &work_1( iwork ), lwork-iwork+1, &ierr );
720 iwork = ie + n;
721 /**
722 * Perform bidiagonal QR iteration, computing left
723 * singular vectors of R in WORK(IR) and computing right
724 * singular vectors of R in VT
725 * (Workspace: need N*N+5*N-4)
726 **/
727 dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt, ldvt,
728 &work_1( ir ), ldwrkr, dum, 1,
729 &work_1( iwork ), info );
730 iu = ie + n;
731 /**
732 * Multiply Q in A by left singular vectors of R in
733 * WORK(IR), storing result in WORK(IU) and copying to A
734 * (Workspace: need N*N+2*N, prefer N*N+M*N+N)
735 **/
736 for (i=1 ; ldwrku>0?i<=m:i>=m ; i+=ldwrku) {
737 chunk = min( m-i+1, ldwrku );
738 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, chunk,
739 n, n, one, &a_2( i, 1 ), lda, &work_1( ir ), ldwrkr,
740 zero, &work_1( iu ), ldwrku );
741 dlacpy( 'f', chunk, n, &work_1( iu ), ldwrku,
742 &a_2( i, 1 ), lda );
743 }
744
745 } else {
746 /**
747 * Insufficient workspace for a fast algorithm
748 **/
749 itau = 1;
750 iwork = itau + n;
751 /**
752 * Compute A=Q*R
753 * (Workspace: need 2*N, prefer N+N*NB)
754 **/
755 dgeqrf( m, n, a, lda, &work_1( itau ),
756 &work_1( iwork ), lwork-iwork+1, &ierr );
757 /**
758 * Copy R to VT, zeroing out below it
759 **/
760 dlacpy( 'u', n, n, a, lda, vt, ldvt );
761 dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
762 ldvt );
763 /**
764 * Generate Q in A
765 * (Workspace: need 2*N, prefer N+N*NB)
766 **/
767 dorgqr( m, n, n, a, lda, &work_1( itau ),
768 &work_1( iwork ), lwork-iwork+1, &ierr );
769 ie = itau;
770 itauq = ie + n;
771 itaup = itauq + n;
772 iwork = itaup + n;
773 /**
774 * Bidiagonalize R in VT
775 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
776 **/
777 dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
778 &work_1( itauq ), &work_1( itaup ),
779 &work_1( iwork ), lwork-iwork+1, &ierr );
780 /**
781 * Multiply Q in A by left vectors bidiagonalizing R
782 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
783 **/
784 dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
785 &work_1( itauq ), a, lda, &work_1( iwork ),
786 lwork-iwork+1, &ierr );
787 /**
788 * Generate right vectors bidiagonalizing R in VT
789 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
790 **/
791 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
792 &work_1( iwork ), lwork-iwork+1, &ierr );
793 iwork = ie + n;
794 /**
795 * Perform bidiagonal QR iteration, computing left
796 * singular vectors of A in A and computing right
797 * singular vectors of A in VT
798 * (Workspace: need 5*N-4)
799 **/
800 dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt, ldvt,
801 a, lda, dum, 1, &work_1( iwork ), info );
802
803 }
804
805 } else if( wntus ) {
806
807 if( wntvn ) {
808 /**
809 * Path 4 (M much larger than N, JOBU='S', JOBVT='N')
810 * N left singular vectors to be computed in U and
811 * no right singular vectors to be computed
812 **/
813 if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
814 /**
815 * Sufficient workspace for a fast algorithm
816 **/
817 ir = 1;
818 if( lwork>=wrkbl+lda*n ) {
819 /**
820 * WORK(IR) is LDA by N
821 **/
822 ldwrkr = lda;
823 } else {
824 /**
825 * WORK(IR) is N by N
826 **/
827 ldwrkr = n;
828 }
829 itau = ir + ldwrkr*n;
830 iwork = itau + n;
831 /**
832 * Compute A=Q*R
833 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
834 **/
835 dgeqrf( m, n, a, lda, &work_1( itau ),
836 &work_1( iwork ), lwork-iwork+1, &ierr );
837 /**
838 * Copy R to WORK(IR), zeroing out below it
839 **/
840 dlacpy( 'u', n, n, a, lda, &work_1( ir ),
841 ldwrkr );
842 dlaset( 'l', n-1, n-1, zero, zero,
843 &work_1( ir+1 ), ldwrkr );
844 /**
845 * Generate Q in A
846 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
847 **/
848 dorgqr( m, n, n, a, lda, &work_1( itau ),
849 &work_1( iwork ), lwork-iwork+1, &ierr );
850 ie = itau;
851 itauq = ie + n;
852 itaup = itauq + n;
853 iwork = itaup + n;
854 /**
855 * Bidiagonalize R in WORK(IR)
856 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
857 **/
858 dgebrd( n, n, &work_1( ir ), ldwrkr, s,
859 &work_1( ie ), &work_1( itauq ),
860 &work_1( itaup ), &work_1( iwork ),
861 lwork-iwork+1, &ierr );
862 /**
863 * Generate left vectors bidiagonalizing R in WORK(IR)
864 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
865 **/
866 dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
867 &work_1( itauq ), &work_1( iwork ),
868 lwork-iwork+1, &ierr );
869 iwork = ie + n;
870 /**
871 * Perform bidiagonal QR iteration, computing left
872 * singular vectors of R in WORK(IR)
873 * (Workspace: need N*N+5*N-4)
874 **/
875 dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum,
876 1, &work_1( ir ), ldwrkr, dum, 1,
877 &work_1( iwork ), info );
878 /**
879 * Multiply Q in A by left singular vectors of R in
880 * WORK(IR), storing result in U
881 * (Workspace: need N*N)
882 **/
883 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
884 one, a, lda, &work_1( ir ), ldwrkr, zero, u, ldu );
885
886 } else {
887 /**
888 * Insufficient workspace for a fast algorithm
889 **/
890 itau = 1;
891 iwork = itau + n;
892 /**
893 * Compute A=Q*R, copying result to U
894 * (Workspace: need 2*N, prefer N+N*NB)
895 **/
896 dgeqrf( m, n, a, lda, &work_1( itau ),
897 &work_1( iwork ), lwork-iwork+1, &ierr );
898 dlacpy( 'l', m, n, a, lda, u, ldu );
899 /**
900 * Generate Q in U
901 * (Workspace: need 2*N, prefer N+N*NB)
902 **/
903 dorgqr( m, n, n, u, ldu, &work_1( itau ),
904 &work_1( iwork ), lwork-iwork+1, &ierr );
905 ie = itau;
906 itauq = ie + n;
907 itaup = itauq + n;
908 iwork = itaup + n;
909 /**
910 * Zero out below R in A
911 **/
912 dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
913 lda );
914 /**
915 * Bidiagonalize R in A
916 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
917 **/
918 dgebrd( n, n, a, lda, s, &work_1( ie ),
919 &work_1( itauq ), &work_1( itaup ),
920 &work_1( iwork ), lwork-iwork+1, &ierr );
921 /**
922 * Multiply Q in U by left vectors bidiagonalizing R
923 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
924 **/
925 dormbr( 'q', 'r', 'n', m, n, n, a, lda,
926 &work_1( itauq ), u, ldu, &work_1( iwork ),
927 lwork-iwork+1, &ierr );
928 iwork = ie + n;
929 /**
930 * Perform bidiagonal QR iteration, computing left
931 * singular vectors of A in U
932 * (Workspace: need 5*N-4)
933 **/
934 dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum,
935 1, u, ldu, dum, 1, &work_1( iwork ),
936 info );
937
938 }
939
940 } else if( wntvo ) {
941 /**
942 * Path 5 (M much larger than N, JOBU='S', JOBVT='O')
943 * N left singular vectors to be computed in U and
944 * N right singular vectors to be overwritten on A
945 **/
946 if( lwork>=2*n*n+max( 4*n, 5*n-4 ) ) {
947 /**
948 * Sufficient workspace for a fast algorithm
949 **/
950 iu = 1;
951 if( lwork>=wrkbl+2*lda*n ) {
952 /**
953 * WORK(IU) is LDA by N and WORK(IR) is LDA by N
954 **/
955 ldwrku = lda;
956 ir = iu + ldwrku*n;
957 ldwrkr = lda;
958 } else if( lwork>=wrkbl+( lda+n )*n ) {
959 /**
960 * WORK(IU) is LDA by N and WORK(IR) is N by N
961 **/
962 ldwrku = lda;
963 ir = iu + ldwrku*n;
964 ldwrkr = n;
965 } else {
966 /**
967 * WORK(IU) is N by N and WORK(IR) is N by N
968 **/
969 ldwrku = n;
970 ir = iu + ldwrku*n;
971 ldwrkr = n;
972 }
973 itau = ir + ldwrkr*n;
974 iwork = itau + n;
975 /**
976 * Compute A=Q*R
977 * (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
978 **/
979 dgeqrf( m, n, a, lda, &work_1( itau ),
980 &work_1( iwork ), lwork-iwork+1, &ierr );
981 /**
982 * Copy R to WORK(IU), zeroing out below it
983 **/
984 dlacpy( 'u', n, n, a, lda, &work_1( iu ),
985 ldwrku );
986 dlaset( 'l', n-1, n-1, zero, zero,
987 &work_1( iu+1 ), ldwrku );
988 /**
989 * Generate Q in A
990 * (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
991 **/
992 dorgqr( m, n, n, a, lda, &work_1( itau ),
993 &work_1( iwork ), lwork-iwork+1, &ierr );
994 ie = itau;
995 itauq = ie + n;
996 itaup = itauq + n;
997 iwork = itaup + n;
998 /**
999 * Bidiagonalize R in WORK(IU), copying result to
1000 * WORK(IR)
1001 * (Workspace: need 2*N*N+4*N,
1002 * prefer 2*N*N+3*N+2*N*NB)
1003 **/
1004 dgebrd( n, n, &work_1( iu ), ldwrku, s,
1005 &work_1( ie ), &work_1( itauq ),
1006 &work_1( itaup ), &work_1( iwork ),
1007 lwork-iwork+1, &ierr );
1008 dlacpy( 'u', n, n, &work_1( iu ), ldwrku,
1009 &work_1( ir ), ldwrkr );
1010 /**
1011 * Generate left bidiagonalizing vectors in WORK(IU)
1012 * (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB)
1013 **/
1014 dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1015 &work_1( itauq ), &work_1( iwork ),
1016 lwork-iwork+1, &ierr );
1017 /**
1018 * Generate right bidiagonalizing vectors in WORK(IR)
1019 * (Workspace: need 2*N*N+4*N-1,
1020 * prefer 2*N*N+3*N+(N-1)*NB)
1021 **/
1022 dorgbr( 'p', n, n, n, &work_1( ir ), ldwrkr,
1023 &work_1( itaup ), &work_1( iwork ),
1024 lwork-iwork+1, &ierr );
1025 iwork = ie + n;
1026 /**
1027 * Perform bidiagonal QR iteration, computing left
1028 * singular vectors of R in WORK(IU) and computing
1029 * right singular vectors of R in WORK(IR)
1030 * (Workspace: need 2*N*N+5*N-4)
1031 **/
1032 dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ),
1033 &work_1( ir ), ldwrkr, &work_1( iu ),
1034 ldwrku, dum, 1, &work_1( iwork ), info );
1035 /**
1036 * Multiply Q in A by left singular vectors of R in
1037 * WORK(IU), storing result in U
1038 * (Workspace: need N*N)
1039 **/
1040 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1041 one, a, lda, &work_1( iu ), ldwrku, zero, u, ldu );
1042 /**
1043 * Copy right singular vectors of R to A
1044 * (Workspace: need N*N)
1045 **/
1046 dlacpy( 'f', n, n, &work_1( ir ), ldwrkr, a,
1047 lda );
1048
1049 } else {
1050 /**
1051 * Insufficient workspace for a fast algorithm
1052 **/
1053 itau = 1;
1054 iwork = itau + n;
1055 /**
1056 * Compute A=Q*R, copying result to U
1057 * (Workspace: need 2*N, prefer N+N*NB)
1058 **/
1059 dgeqrf( m, n, a, lda, &work_1( itau ),
1060 &work_1( iwork ), lwork-iwork+1, &ierr );
1061 dlacpy( 'l', m, n, a, lda, u, ldu );
1062 /**
1063 * Generate Q in U
1064 * (Workspace: need 2*N, prefer N+N*NB)
1065 **/
1066 dorgqr( m, n, n, u, ldu, &work_1( itau ),
1067 &work_1( iwork ), lwork-iwork+1, &ierr );
1068 ie = itau;
1069 itauq = ie + n;
1070 itaup = itauq + n;
1071 iwork = itaup + n;
1072 /**
1073 * Zero out below R in A
1074 **/
1075 dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1076 lda );
1077 /**
1078 * Bidiagonalize R in A
1079 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
1080 **/
1081 dgebrd( n, n, a, lda, s, &work_1( ie ),
1082 &work_1( itauq ), &work_1( itaup ),
1083 &work_1( iwork ), lwork-iwork+1, &ierr );
1084 /**
1085 * Multiply Q in U by left vectors bidiagonalizing R
1086 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
1087 **/
1088 dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1089 &work_1( itauq ), u, ldu, &work_1( iwork ),
1090 lwork-iwork+1, &ierr );
1091 /**
1092 * Generate right vectors bidiagonalizing R in A
1093 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1094 **/
1095 dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1096 &work_1( iwork ), lwork-iwork+1, &ierr );
1097 iwork = ie + n;
1098 /**
1099 * Perform bidiagonal QR iteration, computing left
1100 * singular vectors of A in U and computing right
1101 * singular vectors of A in A
1102 * (Workspace: need 5*N-4)
1103 **/
1104 dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), a,
1105 lda, u, ldu, dum, 1, &work_1( iwork ),
1106 info );
1107
1108 }
1109
1110 } else if( wntvas ) {
1111 /**
1112 * Path 6 (M much larger than N, JOBU='S', JOBVT='S'
1113 * or 'A')
1114 * N left singular vectors to be computed in U and
1115 * N right singular vectors to be computed in VT
1116 **/
1117 if( lwork>=n*n+max( 4*n, 5*n-4 ) ) {
1118 /**
1119 * Sufficient workspace for a fast algorithm
1120 **/
1121 iu = 1;
1122 if( lwork>=wrkbl+lda*n ) {
1123 /**
1124 * WORK(IU) is LDA by N
1125 **/
1126 ldwrku = lda;
1127 } else {
1128 /**
1129 * WORK(IU) is N by N
1130 **/
1131 ldwrku = n;
1132 }
1133 itau = iu + ldwrku*n;
1134 iwork = itau + n;
1135 /**
1136 * Compute A=Q*R
1137 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1138 **/
1139 dgeqrf( m, n, a, lda, &work_1( itau ),
1140 &work_1( iwork ), lwork-iwork+1, &ierr );
1141 /**
1142 * Copy R to WORK(IU), zeroing out below it
1143 **/
1144 dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1145 ldwrku );
1146 dlaset( 'l', n-1, n-1, zero, zero,
1147 &work_1( iu+1 ), ldwrku );
1148 /**
1149 * Generate Q in A
1150 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1151 **/
1152 dorgqr( m, n, n, a, lda, &work_1( itau ),
1153 &work_1( iwork ), lwork-iwork+1, &ierr );
1154 ie = itau;
1155 itauq = ie + n;
1156 itaup = itauq + n;
1157 iwork = itaup + n;
1158 /**
1159 * Bidiagonalize R in WORK(IU), copying result to VT
1160 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1161 **/
1162 dgebrd( n, n, &work_1( iu ), ldwrku, s,
1163 &work_1( ie ), &work_1( itauq ),
1164 &work_1( itaup ), &work_1( iwork ),
1165 lwork-iwork+1, &ierr );
1166 dlacpy( 'u', n, n, &work_1( iu ), ldwrku, vt,
1167 ldvt );
1168 /**
1169 * Generate left bidiagonalizing vectors in WORK(IU)
1170 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1171 **/
1172 dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1173 &work_1( itauq ), &work_1( iwork ),
1174 lwork-iwork+1, &ierr );
1175 /**
1176 * Generate right bidiagonalizing vectors in VT
1177 * (Workspace: need N*N+4*N-1,
1178 * prefer N*N+3*N+(N-1)*NB)
1179 **/
1180 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1181 &work_1( iwork ), lwork-iwork+1, &ierr );
1182 iwork = ie + n;
1183 /**
1184 * Perform bidiagonal QR iteration, computing left
1185 * singular vectors of R in WORK(IU) and computing
1186 * right singular vectors of R in VT
1187 * (Workspace: need N*N+5*N-4)
1188 **/
1189 dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt,
1190 ldvt, &work_1( iu ), ldwrku, dum, 1,
1191 &work_1( iwork ), info );
1192 /**
1193 * Multiply Q in A by left singular vectors of R in
1194 * WORK(IU), storing result in U
1195 * (Workspace: need N*N)
1196 **/
1197 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1198 one, a, lda, &work_1( iu ), ldwrku, zero, u, ldu );
1199
1200 } else {
1201 /**
1202 * Insufficient workspace for a fast algorithm
1203 **/
1204 itau = 1;
1205 iwork = itau + n;
1206 /**
1207 * Compute A=Q*R, copying result to U
1208 * (Workspace: need 2*N, prefer N+N*NB)
1209 **/
1210 dgeqrf( m, n, a, lda, &work_1( itau ),
1211 &work_1( iwork ), lwork-iwork+1, &ierr );
1212 dlacpy( 'l', m, n, a, lda, u, ldu );
1213 /**
1214 * Generate Q in U
1215 * (Workspace: need 2*N, prefer N+N*NB)
1216 **/
1217 dorgqr( m, n, n, u, ldu, &work_1( itau ),
1218 &work_1( iwork ), lwork-iwork+1, &ierr );
1219 /**
1220 * Copy R to VT, zeroing out below it
1221 **/
1222 dlacpy( 'u', n, n, a, lda, vt, ldvt );
1223 dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
1224 ldvt );
1225 ie = itau;
1226 itauq = ie + n;
1227 itaup = itauq + n;
1228 iwork = itaup + n;
1229 /**
1230 * Bidiagonalize R in VT
1231 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
1232 **/
1233 dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
1234 &work_1( itauq ), &work_1( itaup ),
1235 &work_1( iwork ), lwork-iwork+1, &ierr );
1236 /**
1237 * Multiply Q in U by left bidiagonalizing vectors
1238 * in VT
1239 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
1240 **/
1241 dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
1242 &work_1( itauq ), u, ldu, &work_1( iwork ),
1243 lwork-iwork+1, &ierr );
1244 /**
1245 * Generate right bidiagonalizing vectors in VT
1246 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1247 **/
1248 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1249 &work_1( iwork ), lwork-iwork+1, &ierr );
1250 iwork = ie + n;
1251 /**
1252 * Perform bidiagonal QR iteration, computing left
1253 * singular vectors of A in U and computing right
1254 * singular vectors of A in VT
1255 * (Workspace: need 5*N-4)
1256 **/
1257 dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt,
1258 ldvt, u, ldu, dum, 1, &work_1( iwork ),
1259 info );
1260
1261 }
1262
1263 }
1264
1265 } else if( wntua ) {
1266
1267 if( wntvn ) {
1268 /**
1269 * Path 7 (M much larger than N, JOBU='A', JOBVT='N')
1270 * M left singular vectors to be computed in U and
1271 * no right singular vectors to be computed
1272 **/
1273 if( lwork>=n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1274 /**
1275 * Sufficient workspace for a fast algorithm
1276 **/
1277 ir = 1;
1278 if( lwork>=wrkbl+lda*n ) {
1279 /**
1280 * WORK(IR) is LDA by N
1281 **/
1282 ldwrkr = lda;
1283 } else {
1284 /**
1285 * WORK(IR) is N by N
1286 **/
1287 ldwrkr = n;
1288 }
1289 itau = ir + ldwrkr*n;
1290 iwork = itau + n;
1291 /**
1292 * Compute A=Q*R, copying result to U
1293 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1294 **/
1295 dgeqrf( m, n, a, lda, &work_1( itau ),
1296 &work_1( iwork ), lwork-iwork+1, &ierr );
1297 dlacpy( 'l', m, n, a, lda, u, ldu );
1298 /**
1299 * Copy R to WORK(IR), zeroing out below it
1300 **/
1301 dlacpy( 'u', n, n, a, lda, &work_1( ir ),
1302 ldwrkr );
1303 dlaset( 'l', n-1, n-1, zero, zero,
1304 &work_1( ir+1 ), ldwrkr );
1305 /**
1306 * Generate Q in U
1307 * (Workspace: need N*N+N+M, prefer N*N+N+M*NB)
1308 **/
1309 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1310 &work_1( iwork ), lwork-iwork+1, &ierr );
1311 ie = itau;
1312 itauq = ie + n;
1313 itaup = itauq + n;
1314 iwork = itaup + n;
1315 /**
1316 * Bidiagonalize R in WORK(IR)
1317 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1318 **/
1319 dgebrd( n, n, &work_1( ir ), ldwrkr, s,
1320 &work_1( ie ), &work_1( itauq ),
1321 &work_1( itaup ), &work_1( iwork ),
1322 lwork-iwork+1, &ierr );
1323 /**
1324 * Generate left bidiagonalizing vectors in WORK(IR)
1325 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1326 **/
1327 dorgbr( 'q', n, n, n, &work_1( ir ), ldwrkr,
1328 &work_1( itauq ), &work_1( iwork ),
1329 lwork-iwork+1, &ierr );
1330 iwork = ie + n;
1331 /**
1332 * Perform bidiagonal QR iteration, computing left
1333 * singular vectors of R in WORK(IR)
1334 * (Workspace: need N*N+5*N-4)
1335 **/
1336 dbdsqr( 'u', n, 0, n, 0, s, &work_1( ie ), dum,
1337 1, &work_1( ir ), ldwrkr, dum, 1,
1338 &work_1( iwork ), info );
1339 /**
1340 * Multiply Q in U by left singular vectors of R in
1341 * WORK(IR), storing result in A
1342 * (Workspace: need N*N)
1343 **/
1344 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1345 one, u, ldu, &work_1( ir ), ldwrkr, zero, a, lda );
1346 /**
1347 * Copy left singular vectors of A from A to U
1348 **/
1349 dlacpy( 'f', m, n, a, lda, u, ldu );
1350
1351 } else {
1352 /**
1353 * Insufficient workspace for a fast algorithm
1354 **/
1355 itau = 1;
1356 iwork = itau + n;
1357 /**
1358 * Compute A=Q*R, copying result to U
1359 * (Workspace: need 2*N, prefer N+N*NB)
1360 **/
1361 dgeqrf( m, n, a, lda, &work_1( itau ),
1362 &work_1( iwork ), lwork-iwork+1, &ierr );
1363 dlacpy( 'l', m, n, a, lda, u, ldu );
1364 /**
1365 * Generate Q in U
1366 * (Workspace: need N+M, prefer N+M*NB)
1367 **/
1368 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1369 &work_1( iwork ), lwork-iwork+1, &ierr );
1370 ie = itau;
1371 itauq = ie + n;
1372 itaup = itauq + n;
1373 iwork = itaup + n;
1374 /**
1375 * Zero out below R in A
1376 **/
1377 dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1378 lda );
1379 /**
1380 * Bidiagonalize R in A
1381 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
1382 **/
1383 dgebrd( n, n, a, lda, s, &work_1( ie ),
1384 &work_1( itauq ), &work_1( itaup ),
1385 &work_1( iwork ), lwork-iwork+1, &ierr );
1386 /**
1387 * Multiply Q in U by left bidiagonalizing vectors
1388 * in A
1389 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
1390 **/
1391 dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1392 &work_1( itauq ), u, ldu, &work_1( iwork ),
1393 lwork-iwork+1, &ierr );
1394 iwork = ie + n;
1395 /**
1396 * Perform bidiagonal QR iteration, computing left
1397 * singular vectors of A in U
1398 * (Workspace: need 5*N-4)
1399 **/
1400 dbdsqr( 'u', n, 0, m, 0, s, &work_1( ie ), dum,
1401 1, u, ldu, dum, 1, &work_1( iwork ),
1402 info );
1403
1404 }
1405
1406 } else if( wntvo ) {
1407 /**
1408 * Path 8 (M much larger than N, JOBU='A', JOBVT='O')
1409 * M left singular vectors to be computed in U and
1410 * N right singular vectors to be overwritten on A
1411 **/
1412 if( lwork>=2*n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1413 /**
1414 * Sufficient workspace for a fast algorithm
1415 **/
1416 iu = 1;
1417 if( lwork>=wrkbl+2*lda*n ) {
1418 /**
1419 * WORK(IU) is LDA by N and WORK(IR) is LDA by N
1420 **/
1421 ldwrku = lda;
1422 ir = iu + ldwrku*n;
1423 ldwrkr = lda;
1424 } else if( lwork>=wrkbl+( lda+n )*n ) {
1425 /**
1426 * WORK(IU) is LDA by N and WORK(IR) is N by N
1427 **/
1428 ldwrku = lda;
1429 ir = iu + ldwrku*n;
1430 ldwrkr = n;
1431 } else {
1432 /**
1433 * WORK(IU) is N by N and WORK(IR) is N by N
1434 **/
1435 ldwrku = n;
1436 ir = iu + ldwrku*n;
1437 ldwrkr = n;
1438 }
1439 itau = ir + ldwrkr*n;
1440 iwork = itau + n;
1441 /**
1442 * Compute A=Q*R, copying result to U
1443 * (Workspace: need 2*N*N+2*N, prefer 2*N*N+N+N*NB)
1444 **/
1445 dgeqrf( m, n, a, lda, &work_1( itau ),
1446 &work_1( iwork ), lwork-iwork+1, &ierr );
1447 dlacpy( 'l', m, n, a, lda, u, ldu );
1448 /**
1449 * Generate Q in U
1450 * (Workspace: need 2*N*N+N+M, prefer 2*N*N+N+M*NB)
1451 **/
1452 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1453 &work_1( iwork ), lwork-iwork+1, &ierr );
1454 /**
1455 * Copy R to WORK(IU), zeroing out below it
1456 **/
1457 dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1458 ldwrku );
1459 dlaset( 'l', n-1, n-1, zero, zero,
1460 &work_1( iu+1 ), ldwrku );
1461 ie = itau;
1462 itauq = ie + n;
1463 itaup = itauq + n;
1464 iwork = itaup + n;
1465 /**
1466 * Bidiagonalize R in WORK(IU), copying result to
1467 * WORK(IR)
1468 * (Workspace: need 2*N*N+4*N,
1469 * prefer 2*N*N+3*N+2*N*NB)
1470 **/
1471 dgebrd( n, n, &work_1( iu ), ldwrku, s,
1472 &work_1( ie ), &work_1( itauq ),
1473 &work_1( itaup ), &work_1( iwork ),
1474 lwork-iwork+1, &ierr );
1475 dlacpy( 'u', n, n, &work_1( iu ), ldwrku,
1476 &work_1( ir ), ldwrkr );
1477 /**
1478 * Generate left bidiagonalizing vectors in WORK(IU)
1479 * (Workspace: need 2*N*N+4*N, prefer 2*N*N+3*N+N*NB)
1480 **/
1481 dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1482 &work_1( itauq ), &work_1( iwork ),
1483 lwork-iwork+1, &ierr );
1484 /**
1485 * Generate right bidiagonalizing vectors in WORK(IR)
1486 * (Workspace: need 2*N*N+4*N-1,
1487 * prefer 2*N*N+3*N+(N-1)*NB)
1488 **/
1489 dorgbr( 'p', n, n, n, &work_1( ir ), ldwrkr,
1490 &work_1( itaup ), &work_1( iwork ),
1491 lwork-iwork+1, &ierr );
1492 iwork = ie + n;
1493 /**
1494 * Perform bidiagonal QR iteration, computing left
1495 * singular vectors of R in WORK(IU) and computing
1496 * right singular vectors of R in WORK(IR)
1497 * (Workspace: need 2*N*N+5*N-4)
1498 **/
1499 dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ),
1500 &work_1( ir ), ldwrkr, &work_1( iu ),
1501 ldwrku, dum, 1, &work_1( iwork ), info );
1502 /**
1503 * Multiply Q in U by left singular vectors of R in
1504 * WORK(IU), storing result in A
1505 * (Workspace: need N*N)
1506 **/
1507 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1508 one, u, ldu, &work_1( iu ), ldwrku, zero, a, lda );
1509 /**
1510 * Copy left singular vectors of A from A to U
1511 **/
1512 dlacpy( 'f', m, n, a, lda, u, ldu );
1513 /**
1514 * Copy right singular vectors of R from WORK(IR) to A
1515 **/
1516 dlacpy( 'f', n, n, &work_1( ir ), ldwrkr, a,
1517 lda );
1518
1519 } else {
1520 /**
1521 * Insufficient workspace for a fast algorithm
1522 **/
1523 itau = 1;
1524 iwork = itau + n;
1525 /**
1526 * Compute A=Q*R, copying result to U
1527 * (Workspace: need 2*N, prefer N+N*NB)
1528 **/
1529 dgeqrf( m, n, a, lda, &work_1( itau ),
1530 &work_1( iwork ), lwork-iwork+1, &ierr );
1531 dlacpy( 'l', m, n, a, lda, u, ldu );
1532 /**
1533 * Generate Q in U
1534 * (Workspace: need N+M, prefer N+M*NB)
1535 **/
1536 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1537 &work_1( iwork ), lwork-iwork+1, &ierr );
1538 ie = itau;
1539 itauq = ie + n;
1540 itaup = itauq + n;
1541 iwork = itaup + n;
1542 /**
1543 * Zero out below R in A
1544 **/
1545 dlaset( 'l', n-1, n-1, zero, zero, &a_2( 2, 1 ),
1546 lda );
1547 /**
1548 * Bidiagonalize R in A
1549 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
1550 **/
1551 dgebrd( n, n, a, lda, s, &work_1( ie ),
1552 &work_1( itauq ), &work_1( itaup ),
1553 &work_1( iwork ), lwork-iwork+1, &ierr );
1554 /**
1555 * Multiply Q in U by left bidiagonalizing vectors
1556 * in A
1557 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
1558 **/
1559 dormbr( 'q', 'r', 'n', m, n, n, a, lda,
1560 &work_1( itauq ), u, ldu, &work_1( iwork ),
1561 lwork-iwork+1, &ierr );
1562 /**
1563 * Generate right bidiagonalizing vectors in A
1564 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1565 **/
1566 dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1567 &work_1( iwork ), lwork-iwork+1, &ierr );
1568 iwork = ie + n;
1569 /**
1570 * Perform bidiagonal QR iteration, computing left
1571 * singular vectors of A in U and computing right
1572 * singular vectors of A in A
1573 * (Workspace: need 5*N-4)
1574 **/
1575 dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), a,
1576 lda, u, ldu, dum, 1, &work_1( iwork ),
1577 info );
1578
1579 }
1580
1581 } else if( wntvas ) {
1582 /**
1583 * Path 9 (M much larger than N, JOBU='A', JOBVT='S'
1584 * or 'A')
1585 * M left singular vectors to be computed in U and
1586 * N right singular vectors to be computed in VT
1587 **/
1588 if( lwork>=n*n+max( n+m, max(4*n, 5*n-4) ) ) {
1589 /**
1590 * Sufficient workspace for a fast algorithm
1591 **/
1592 iu = 1;
1593 if( lwork>=wrkbl+lda*n ) {
1594 /**
1595 * WORK(IU) is LDA by N
1596 **/
1597 ldwrku = lda;
1598 } else {
1599 /**
1600 * WORK(IU) is N by N
1601 **/
1602 ldwrku = n;
1603 }
1604 itau = iu + ldwrku*n;
1605 iwork = itau + n;
1606 /**
1607 * Compute A=Q*R, copying result to U
1608 * (Workspace: need N*N+2*N, prefer N*N+N+N*NB)
1609 **/
1610 dgeqrf( m, n, a, lda, &work_1( itau ),
1611 &work_1( iwork ), lwork-iwork+1, &ierr );
1612 dlacpy( 'l', m, n, a, lda, u, ldu );
1613 /**
1614 * Generate Q in U
1615 * (Workspace: need N*N+N+M, prefer N*N+N+M*NB)
1616 **/
1617 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1618 &work_1( iwork ), lwork-iwork+1, &ierr );
1619 /**
1620 * Copy R to WORK(IU), zeroing out below it
1621 **/
1622 dlacpy( 'u', n, n, a, lda, &work_1( iu ),
1623 ldwrku );
1624 dlaset( 'l', n-1, n-1, zero, zero,
1625 &work_1( iu+1 ), ldwrku );
1626 ie = itau;
1627 itauq = ie + n;
1628 itaup = itauq + n;
1629 iwork = itaup + n;
1630 /**
1631 * Bidiagonalize R in WORK(IU), copying result to VT
1632 * (Workspace: need N*N+4*N, prefer N*N+3*N+2*N*NB)
1633 **/
1634 dgebrd( n, n, &work_1( iu ), ldwrku, s,
1635 &work_1( ie ), &work_1( itauq ),
1636 &work_1( itaup ), &work_1( iwork ),
1637 lwork-iwork+1, &ierr );
1638 dlacpy( 'u', n, n, &work_1( iu ), ldwrku, vt,
1639 ldvt );
1640 /**
1641 * Generate left bidiagonalizing vectors in WORK(IU)
1642 * (Workspace: need N*N+4*N, prefer N*N+3*N+N*NB)
1643 **/
1644 dorgbr( 'q', n, n, n, &work_1( iu ), ldwrku,
1645 &work_1( itauq ), &work_1( iwork ),
1646 lwork-iwork+1, &ierr );
1647 /**
1648 * Generate right bidiagonalizing vectors in VT
1649 * (Workspace: need N*N+4*N-1,
1650 * prefer N*N+3*N+(N-1)*NB)
1651 **/
1652 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1653 &work_1( iwork ), lwork-iwork+1, &ierr );
1654 iwork = ie + n;
1655 /**
1656 * Perform bidiagonal QR iteration, computing left
1657 * singular vectors of R in WORK(IU) and computing
1658 * right singular vectors of R in VT
1659 * (Workspace: need N*N+5*N-4)
1660 **/
1661 dbdsqr( 'u', n, n, n, 0, s, &work_1( ie ), vt,
1662 ldvt, &work_1( iu ), ldwrku, dum, 1,
1663 &work_1( iwork ), info );
1664 /**
1665 * Multiply Q in U by left singular vectors of R in
1666 * WORK(IU), storing result in A
1667 * (Workspace: need N*N)
1668 **/
1669 cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, n,
1670 one, u, ldu, &work_1( iu ), ldwrku, zero, a, lda );
1671 /**
1672 * Copy left singular vectors of A from A to U
1673 **/
1674 dlacpy( 'f', m, n, a, lda, u, ldu );
1675
1676 } else {
1677 /**
1678 * Insufficient workspace for a fast algorithm
1679 **/
1680 itau = 1;
1681 iwork = itau + n;
1682 /**
1683 * Compute A=Q*R, copying result to U
1684 * (Workspace: need 2*N, prefer N+N*NB)
1685 **/
1686 dgeqrf( m, n, a, lda, &work_1( itau ),
1687 &work_1( iwork ), lwork-iwork+1, &ierr );
1688 dlacpy( 'l', m, n, a, lda, u, ldu );
1689 /**
1690 * Generate Q in U
1691 * (Workspace: need N+M, prefer N+M*NB)
1692 **/
1693 dorgqr( m, m, n, u, ldu, &work_1( itau ),
1694 &work_1( iwork ), lwork-iwork+1, &ierr );
1695 /**
1696 * Copy R from A to VT, zeroing out below it
1697 **/
1698 dlacpy( 'u', n, n, a, lda, vt, ldvt );
1699 dlaset( 'l', n-1, n-1, zero, zero, &vt_2( 2, 1 ),
1700 ldvt );
1701 ie = itau;
1702 itauq = ie + n;
1703 itaup = itauq + n;
1704 iwork = itaup + n;
1705 /**
1706 * Bidiagonalize R in VT
1707 * (Workspace: need 4*N, prefer 3*N+2*N*NB)
1708 **/
1709 dgebrd( n, n, vt, ldvt, s, &work_1( ie ),
1710 &work_1( itauq ), &work_1( itaup ),
1711 &work_1( iwork ), lwork-iwork+1, &ierr );
1712 /**
1713 * Multiply Q in U by left bidiagonalizing vectors
1714 * in VT
1715 * (Workspace: need 3*N+M, prefer 3*N+M*NB)
1716 **/
1717 dormbr( 'q', 'r', 'n', m, n, n, vt, ldvt,
1718 &work_1( itauq ), u, ldu, &work_1( iwork ),
1719 lwork-iwork+1, &ierr );
1720 /**
1721 * Generate right bidiagonalizing vectors in VT
1722 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1723 **/
1724 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1725 &work_1( iwork ), lwork-iwork+1, &ierr );
1726 iwork = ie + n;
1727 /**
1728 * Perform bidiagonal QR iteration, computing left
1729 * singular vectors of A in U and computing right
1730 * singular vectors of A in VT
1731 * (Workspace: need 5*N-4)
1732 **/
1733 dbdsqr( 'u', n, n, m, 0, s, &work_1( ie ), vt,
1734 ldvt, u, ldu, dum, 1, &work_1( iwork ),
1735 info );
1736
1737 }
1738
1739 }
1740
1741 }
1742
1743 } else {
1744 /**
1745 * M .LT. MNTHR
1746 *
1747 * Path 10 (M at least N, but not much larger)
1748 * Reduce to bidiagonal form without QR decomposition
1749 **/
1750 ie = 1;
1751 itauq = ie + n;
1752 itaup = itauq + n;
1753 iwork = itaup + n;
1754 /**
1755 * Bidiagonalize A
1756 * (Workspace: need 3*N+M, prefer 3*N+(M+N)*NB)
1757 **/
1758 dgebrd( m, n, a, lda, s, &work_1( ie ), &work_1( itauq ),
1759 &work_1( itaup ), &work_1( iwork ), lwork-iwork+1,
1760 &ierr );
1761 if( wntuas ) {
1762 /**
1763 * If left singular vectors desired in U, copy result to U
1764 * and generate left bidiagonalizing vectors in U
1765 * (Workspace: need 3*N+NCU, prefer 3*N+NCU*NB)
1766 **/
1767 dlacpy( 'l', m, n, a, lda, u, ldu );
1768 if( wntus )
1769 ncu = n;
1770 if( wntua )
1771 ncu = m;
1772 dorgbr( 'q', m, ncu, n, u, ldu, &work_1( itauq ),
1773 &work_1( iwork ), lwork-iwork+1, &ierr );
1774 }
1775 if( wntvas ) {
1776 /**
1777 * If right singular vectors desired in VT, copy result to
1778 * VT and generate right bidiagonalizing vectors in VT
1779 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1780 **/
1781 dlacpy( 'u', n, n, a, lda, vt, ldvt );
1782 dorgbr( 'p', n, n, n, vt, ldvt, &work_1( itaup ),
1783 &work_1( iwork ), lwork-iwork+1, &ierr );
1784 }
1785 if( wntuo ) {
1786 /**
1787 * If left singular vectors desired in A, generate left
1788 * bidiagonalizing vectors in A
1789 * (Workspace: need 4*N, prefer 3*N+N*NB)
1790 **/
1791 dorgbr( 'q', m, n, n, a, lda, &work_1( itauq ),
1792 &work_1( iwork ), lwork-iwork+1, &ierr );
1793 }
1794 if( wntvo ) {
1795 /**
1796 * If right singular vectors desired in A, generate right
1797 * bidiagonalizing vectors in A
1798 * (Workspace: need 4*N-1, prefer 3*N+(N-1)*NB)
1799 **/
1800 dorgbr( 'p', n, n, n, a, lda, &work_1( itaup ),
1801 &work_1( iwork ), lwork-iwork+1, &ierr );
1802 }
1803 iwork = ie + n;
1804 if( wntuas || wntuo )
1805 nru = m;
1806 if( wntun )
1807 nru = 0;
1808 if( wntvas || wntvo )
1809 ncvt = n;
1810 if( wntvn )
1811 ncvt = 0;
1812 if( ( !wntuo ) && ( !wntvo ) ) {
1813 /**
1814 * Perform bidiagonal QR iteration, if desired, computing
1815 * left singular vectors in U and computing right singular
1816 * vectors in VT
1817 * (Workspace: need 5*N-4)
1818 **/
1819 dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), vt,
1820 ldvt, u, ldu, dum, 1, &work_1( iwork ), info );
1821 } else if( ( !wntuo ) && wntvo ) {
1822 /**
1823 * Perform bidiagonal QR iteration, if desired, computing
1824 * left singular vectors in U and computing right singular
1825 * vectors in A
1826 * (Workspace: need 5*N-4)
1827 **/
1828 dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), a, lda,
1829 u, ldu, dum, 1, &work_1( iwork ), info );
1830 } else {
1831 /**
1832 * Perform bidiagonal QR iteration, if desired, computing
1833 * left singular vectors in A and computing right singular
1834 * vectors in VT
1835 * (Workspace: need 5*N-4)
1836 **/
1837 dbdsqr( 'u', n, ncvt, nru, 0, s, &work_1( ie ), vt,
1838 ldvt, a, lda, dum, 1, &work_1( iwork ), info );
1839 }
1840
1841 }
1842
1843 /**
1844 * If DBDSQR failed to converge, copy unconverged superdiagonals
1845 * to WORK( 2:MINMN )
1846 **/
1847 if( *info!=0 ) {
1848 if( ie>2 ) {
1849 for (i=1 ; i<=minmn - 1 ; i+=1) {
1850 work_1( i+1 ) = work_1( i+ie-1 );
1851 }
1852 }
1853 if( ie<2 ) {
1854 for (i=minmn - 1 ; i>=1 ; i+=-1) {
1855 work_1( i+1 ) = work_1( i+ie-1 );
1856 }
1857 }
1858 }
1859 /**
1860 * Undo scaling if necessary
1861 **/
1862 if( iscl==1 ) {
1863 if( anrm>bignum )
1864 dlascl( 'g', 0, 0, bignum, anrm, minmn, 1, s, minmn,
1865 &ierr );
1866 if( *info!=0 && anrm>bignum )
1867 dlascl( 'g', 0, 0, bignum, anrm, minmn-1, 1, &work_1( 2 ),
1868 minmn, &ierr );
1869 if( anrm<smlnum )
1870 dlascl( 'g', 0, 0, smlnum, anrm, minmn, 1, s, minmn,
1871 &ierr );
1872 if( *info!=0 && anrm<smlnum )
1873 dlascl( 'g', 0, 0, smlnum, anrm, minmn-1, 1, &work_1( 2 ),
1874 minmn, &ierr );
1875 }
1876 /**
1877 * Return optimal workspace in WORK(1)
1878 **/
1879 work_1( 1 ) = maxwrk;
1880
1881 return;
1882 /**
1883 * End of DGESVD
1884 **/
1885 }
1886