1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2008 The Regents of the University of California
4 //
5 // This file is part of Qbox
6 //
7 // Qbox is distributed under the terms of the GNU General Public License
8 // as published by the Free Software Foundation, either version 2 of
9 // the License, or (at your option) any later version.
10 // See the file COPYING in the root directory of this distribution
11 // or <http://www.gnu.org/licenses/>.
12 //
13 ////////////////////////////////////////////////////////////////////////////////
14 //
15 // jade.cpp
16 //
17 ////////////////////////////////////////////////////////////////////////////////
18 
19 #include <cmath>
20 #include <cassert>
21 #include <vector>
22 #include <deque>
23 #include <algorithm>
24 #include <limits> // epsilon
25 #include <iostream>
26 #include <iomanip>
27 #ifdef USE_MPI
28 #include <mpi.h>
29 #endif
30 
31 #ifdef SCALAPACK
32 #include "blacs.h"
33 #endif
34 
35 #include "Context.h"
36 #include "Matrix.h"
37 #include "blas.h"
38 #include "Timer.h"
39 using namespace std;
40 
jade(int maxsweep,double tol,vector<DoubleMatrix * > a,DoubleMatrix & u,vector<vector<double>> & adiag)41 int jade(int maxsweep, double tol, vector<DoubleMatrix*> a,
42   DoubleMatrix& u, vector<vector<double> >& adiag)
43 {
44   Timer tm_comm;
45   const bool debug_diag_sum = false;
46 
47   const double eps = numeric_limits<double>::epsilon();
48   if (tol<eps)
49   {
50     if ( u.context().onpe0() )
51     {
52         cout << " warning: jade tol=" << tol << " < eps=" << eps << endl;
53     }
54   }
55   const Context& ctxt = u.context();
56   // The input matrices are *a[k]
57   // the orthogonal transformation is returned in u
58   // on exit, the matrices a[k] are maximally diagonal,
59   // u contains the orthogonal transformation
60   // adiag[k][i] contains the diagonal elements of the a[k]'s
61 
62   for ( int k = 0; k < a.size(); k++ )
63   {
64     assert(a[k]->context() == u.context());
65     assert(a[k]->m()==a[k]->n());
66     assert(a[k]->m()==u.n());
67     assert(a[k]->m()==u.m());
68     assert(a[k]->mb()==u.mb());
69     assert(a[k]->nb()==u.nb());
70   }
71 
72   int mloc = a[0]->mloc();
73   int nloc = a[0]->nloc();
74   //cout << ctxt.mype() << ": nloc: " << nloc << endl;
75 
76   // identify the last active process column
77   // process columns beyond that column do not have any elements of a[k]
78   // compute num_nblocks = total number of column blocks
79   // if num_nblocks >= ctxt.npcol(), all process columns are active
80   // otherwise, the last active process column has index num_nblocks-1
81   const int num_nblocks = u.n() / u.nb() + ( u.n()%u.nb() == 0 ? 0 : 1 );
82   const int last_active_process_col = min(ctxt.npcol()-1, num_nblocks-1);
83 
84   // initialize u with the identity
85   u.identity();
86 
87   // eigenvalue array
88   adiag.resize(a.size());
89   for ( int k = 0; k < a.size(); k++ )
90     adiag[k].resize(a[k]->n());
91 
92   // check if the local number of rows is odd
93   const bool nloc_odd = ( a[0]->nloc()%2 != 0 );
94 
95   // if nloc is odd, auxiliary arrays are created to host an extra column
96   // for both a[k] and u
97   vector<vector<double> > a_aux(a.size());
98   vector<double> u_aux;
99   if ( nloc_odd )
100   {
101     for ( int k = 0; k < a.size(); k++ )
102       a_aux[k].resize(mloc);
103     u_aux.resize(mloc);
104   }
105 
106   // compute local number of pairs nploc
107   const int nploc = (a[0]->nloc()+1)/2;
108   // dimension of top and bot arrays is nploc: local number of pairs
109   deque<int> top(nploc), bot(nploc);
110 
111   // compute total number of pairs np
112   int np = nploc;
113   ctxt.isum('r',1,1,&np,1);
114   //cout << ctxt.mype() << ": np=" << np << endl;
115   // initialize top and bot arrays
116   // the pair i is (top[i],bot[i])
117   // top[i] is the local index of the top column of pair i
118   // bot[i] is the local index of the bottom column of pair i
119   for ( int i = 0; i < nploc; i++ )
120     top[i] = i;
121   for ( int i = 0; i < nploc; i++ )
122     bot[nploc-i-1] = nploc+i;
123   // if top[i] or bot[i] == nloc,the data resides in the array a_aux or u_aux
124 
125   // jglobal: global column index
126   // jglobal[i] is the global column index of the column residing in
127   // the local vector i. If nloc_odd and i==2*nploc-1, jglobal[i] == -1
128   vector<int> jglobal(2*nploc,-1);
129   for ( int jblock = 0; jblock < a[0]->nblocks(); jblock++ )
130     for ( int y = 0; y < a[0]->nbs(jblock); y++ )
131       jglobal[y + jblock*a[0]->nb()] = a[0]->j(jblock,y);
132 
133   // store addresses of columns of a and of u in acol and ucol
134   vector<vector<double*> > acol(a.size());
135   vector<double*> ucol(2*nploc);
136   for ( int k = 0; k < a.size(); k++ )
137   {
138     acol[k].resize(2*nploc);
139     for ( int i = 0; i < a[k]->nloc(); i++ )
140       acol[k][i] = a[k]->valptr(i*a[k]->mloc());
141     // if nloc is odd, store the address of vector 2*nploc-1
142     if ( nloc_odd )
143       acol[k][2*nploc-1] = &a_aux[k][0];
144   }
145   for ( int i = 0; i < u.nloc(); i++ )
146     ucol[i] = u.valptr(i*u.mloc());
147   // if nloc is odd, store the address of vector 2*nploc-1
148   if ( nloc_odd )
149     ucol[2*nploc-1] = &u_aux[0];
150 
151   //for ( int i = 0; i < acol.size(); i++ )
152   //  cout << ctxt.mype() << ": acol[" << i << "]=" << acol[i] << endl;
153   //for ( int i = 0; i < ucol.size(); i++ )
154   //  cout << ctxt.mype() << ": ucol[" << i << "]=" << ucol[i] << endl;
155 
156   // the vectors of the pair (top[i],bot[i]) are located at
157   // addresses acol[top[i]] and acol[bot[i]]
158 
159   bool done = false;
160   int nsweep = 0;
161   // allocate matrix element packed array apq
162   // apq[3*ipair   + k*3*nploc] = apq[k][ipair]
163   // apq[3*ipair+1 + k*3*nploc] = app[k][ipair]
164   // apq[3*ipair+2 + k*3*nploc] = aqq[k][ipair]
165   vector<double> apq(a.size()*3*nploc);
166 
167   double diag_sum = 0.0;
168 #ifdef DEBUG
169   double previous_diag_sum = 0.0;
170 #endif
171   while ( !done )
172   {
173     // sweep: process local pairs and rotate 2*np-1 times
174     nsweep++;
175     double diag_change = 0.0;
176     for ( int irot = 0; irot < 2*np-1; irot++ )
177     {
178       //cout << ctxt.mype() << ": top[i]: ";
179       //for ( int i = 0; i < nploc; i++ )
180       //  cout << setw(3) << top[i];
181       //cout << endl;
182 
183       //cout << ctxt.mype() << ": bot[i]: ";
184       //for ( int i = 0; i < nploc; i++ )
185       //  cout << setw(3) << bot[i];
186       //cout << endl;
187 
188       //cout << ctxt.mype() << ": jglobal[top[i]]: ";
189       //for ( int i = 0; i < nploc; i++ )
190       //  cout << setw(3) << jglobal[top[i]];
191       //cout << endl;
192 
193       //cout << ctxt.mype() << ": jglobal[bot[i]]: ";
194       //for ( int i = 0; i < nploc; i++ )
195       //  cout << setw(3) << jglobal[bot[i]];
196       //cout << endl;
197 
198       // perform Jacobi rotations for all local pairs
199 
200       // compute off-diagonal matrix elements apq for all pairs
201       // skip the pair if one or both of the vectors is a dummy vector
202       // i.e. a vector having jglobal==-1
203 
204 #pragma omp parallel for
205       for ( int k = 0; k < a.size(); k++ )
206       {
207         for ( int ipair = 0; ipair < nploc; ipair++ )
208         {
209           const int iapq = 3*ipair + k*3*nploc;
210           //cout << ctxt.mype() << ": computing apq for global pair "
211           //     << jglobal[top[ipair]] << " " << jglobal[bot[ipair]] << endl;
212           apq[iapq]   = 0.0;
213           apq[iapq+1] = 0.0;
214           apq[iapq+2] = 0.0;
215           if ( jglobal[top[ipair]] >= 0 && jglobal[bot[ipair]] >= 0 )
216           {
217             const double *ap = acol[k][top[ipair]];
218             const double *aq = acol[k][bot[ipair]];
219             const double *up = ucol[top[ipair]];
220             const double *uq = ucol[bot[ipair]];
221             int one = 1;
222             apq[iapq]   = ddot(&mloc,ap,&one,uq,&one);
223             apq[iapq+1] = ddot(&mloc,ap,&one,up,&one);
224             apq[iapq+2] = ddot(&mloc,aq,&one,uq,&one);
225           }
226         }
227       } // for k
228       // apq now contains partial sums of matrix elements
229       tm_comm.start();
230       int len = apq.size();
231       ctxt.dsum('c',len,1,&apq[0],len);
232       tm_comm.stop();
233       // apq now contains the matrix elements
234 
235 #pragma omp parallel for
236       for ( int ipair = 0; ipair < nploc; ipair++ )
237       {
238         if ( jglobal[top[ipair]] >= 0 &&
239              jglobal[bot[ipair]] >= 0 )
240         {
241           // compute rotation sine and cosine
242           // Cardoso-Souloumiac expressions for the rotation angle
243 
244           // compute 2x2 matrix g
245           double g11 = 0.0, g12 = 0.0, g22 = 0.0;
246 
247           for ( int k = 0; k < a.size(); k++ )
248           {
249             const int iapq = 3*ipair + k*3*nploc;
250             const double app = apq[iapq+1];
251             const double aqq = apq[iapq+2];
252 
253             const double h1 = app - aqq;
254             const double h2 = 2.0 * apq[iapq];
255 
256             g11 += h1 * h1;
257             g12 += h1 * h2;
258             g22 += h2 * h2;
259           }
260 
261           // the matrix g is real, symmetric
262           // compute Jacobi rotation diagonalizing g
263 
264           double c = 1.0, s = 0.0, e1 = g11, e2 = g22;
265           if ( g12*g12 > eps * eps * fabs(g11*g22) )
266           {
267             double tau = 0.5 * ( g22 - g11 ) / g12;
268             double t = 1.0 / ( fabs(tau) + sqrt(1.0 + tau*tau));
269             if ( tau < 0.0 ) t *= -1.0;
270             c = 1.0 / sqrt(1.0 + t*t);
271             s = t * c;
272 
273             // eigenvalues
274             e1 -= t * g12;
275             e2 += t * g12;
276           }
277 
278           // components of eigenvector associated with the largest eigenvalue
279           double x,y;
280           if ( e1 > e2 )
281           {
282             x = c;
283             y = -s;
284           }
285           else
286           {
287             x = s;
288             y = c;
289           }
290 
291           // choose eigenvector with x positive to ensure small angle
292           if ( x < 0.0 )
293           {
294             x = -x;
295             y = -y;
296           }
297 
298           // compute Jacobi rotation R(p,q)
299           c = sqrt(0.5*(x+1.0));
300           s = y / sqrt(2.0*(x+1.0));
301 
302           // apply the rotation R(p,q)
303           //
304           //           |  c   s |
305           //  R(p,q) = |        |
306           //           | -s   c |
307 
308           // U := U * R(p,q)^T
309           // A := A * R(p,q)^T
310 
311           // apply rotation to columns of a and u
312 
313           // the drot function computes
314           // c*x + s*y -> x
315           //-s*x + c*y -> y
316           // call drot with args c, -s
317 
318           //cout << " p=" << jglobal[top[ipair]]
319           //     << " q=" << jglobal[bot[ipair]]
320           //     << " g11=" << g11 << " g22=" << g22
321           //     << " g12=" << g12 << endl;
322           //cout << " c=" << c << " s=" << s << endl;
323 
324           int one = 1;
325           for ( int k = 0; k < a.size(); k++ )
326           {
327             double *ap = acol[k][top[ipair]];
328             double *aq = acol[k][bot[ipair]];
329             drot(&mloc,ap,&one,aq,&one,&c,&s);
330           }
331           double *up = ucol[top[ipair]];
332           double *uq = ucol[bot[ipair]];
333           drot(&mloc,up,&one,uq,&one,&c,&s);
334 
335           // new value of off-diag element apq
336           double diag_change_ipair = 0.0;
337           for ( int k = 0; k < a.size(); k++ )
338           {
339             const int iapq = 3*ipair + k*3*nploc;
340             const double app = apq[iapq+1];
341             const double aqq = apq[iapq+2];
342             const double apqnew = (c*c-s*s)*apq[iapq] - c*s*(app-aqq);
343 
344             // accumulate change in sum of squares of diag elements
345             // note negative sign: decrease in offdiag is increase in diag
346             diag_change_ipair -= 2.0 * ( apqnew*apqnew - apq[iapq]*apq[iapq] );
347           }
348 #pragma omp critical
349           diag_change -= diag_change_ipair;
350         }
351       } // for ipair
352 
353       // all local pairs have been processed
354 
355       // rotate top and bot arrays
356       if ( nploc > 0 )
357       {
358         bot.push_back(top.back());
359         top.pop_back();
360         top.push_front(bot.front());
361         bot.pop_front();
362 
363         // make rotation skip element 0 on the first process column
364         // if my process column is zero, swap top[0] and top[1]
365         if ( ctxt.mycol() == 0 )
366         {
367           if ( nploc > 1 )
368           {
369             int tmp = top[0];
370             top[0] = top[1];
371             top[1] = tmp;
372           }
373           else
374           {
375             // if there is only one local pair, exchange top[0] and bot[0]
376             int tmp = top[0];
377             top[0] = bot[0];
378             bot[0] = tmp;
379           }
380         }
381 
382         // exchange columns of a[k] and u
383 
384         int rbufi_left, rbufi_right, sbufi_left, sbufi_right;
385         // send buffers contain k columns of a and one of u
386         int bufsize = (a.size()+1)*a[0]->mloc();
387         vector<double> sbuf_left(bufsize), sbuf_right(bufsize);
388         vector<double> rbuf_left(bufsize), rbuf_right(bufsize);
389 
390         // on each task except mycol==npcol-1
391         // send jglobal[bot[nploc-1]] to the right
392         // if jglobal != -1 send vector bot[nploc-1] to the right
393 
394         // on each task except mycol==npcol-1
395         // recv jglobal from the right
396         // if jglobal != -1 recv a vector from the right into bot[nploc-1]
397         // set value of jglobal[bot[nploc-1]]
398 
399         // on each task except mycol==0
400         // send jglobal[top[0]] to the left
401         // if jglobal != -1 send vector top[0] to the left
402 
403         // on each task except mycol==0
404         // recv jglobal from the left
405         // if jglobal != -1 recv a vector from the left into top[0]
406         // set value of jglobal[top[0]]
407 
408         // exchange jglobal values first
409         tm_comm.start();
410         if ( ctxt.mycol() < last_active_process_col )
411         {
412           sbufi_right = jglobal[bot[nploc-1]];
413           ctxt.isend(1,1,&sbufi_right,1,ctxt.myrow(),ctxt.mycol()+1);
414           ctxt.irecv(1,1,&rbufi_right,1,ctxt.myrow(),ctxt.mycol()+1);
415           jglobal[bot[nploc-1]] = rbufi_right;
416           //cout << ctxt.mype() << ": received jglobal="
417           //     << jglobal[bot[nploc-1]] << " from right" << endl;
418         }
419         if ( ctxt.mycol() != 0 )
420         {
421           sbufi_left = jglobal[top[0]];
422           ctxt.isend(1,1,&sbufi_left,1,ctxt.myrow(),ctxt.mycol()-1);
423           ctxt.irecv(1,1,&rbufi_left,1,ctxt.myrow(),ctxt.mycol()-1);
424           jglobal[top[0]] = rbufi_left;
425           //cout << ctxt.mype() << ": received jglobal="
426           //     << jglobal[top[0]] << " from left" << endl;
427         }
428 
429         // exchange column vectors
430 
431         if ( ctxt.mycol() < last_active_process_col )
432         {
433           for ( int k = 0; k < a.size(); k++ )
434           {
435             memcpy(&sbuf_right[k*mloc],acol[k][bot[nploc-1]],
436                    mloc*sizeof(double));
437           }
438           memcpy(&sbuf_right[a.size()*mloc], ucol[bot[nploc-1]],
439                  mloc*sizeof(double) );
440           ctxt.dsend(bufsize,1,&sbuf_right[0],bufsize,
441                      ctxt.myrow(),ctxt.mycol()+1);
442           ctxt.drecv(bufsize,1,&rbuf_right[0],bufsize,
443                      ctxt.myrow(),ctxt.mycol()+1);
444           for ( int k = 0; k < a.size(); k++ )
445           {
446             memcpy(acol[k][bot[nploc-1]],&rbuf_right[k*mloc],
447                    mloc*sizeof(double));
448           }
449           memcpy(ucol[bot[nploc-1]], &rbuf_right[a.size()*mloc],
450                  mloc*sizeof(double) );
451           //cout << ctxt.mype() << ": received vector jglobal="
452           //     << jglobal[bot[nploc-1]] << " from right" << endl;
453         }
454         if ( ctxt.mycol() != 0 )
455         {
456           for ( int k = 0; k < a.size(); k++ )
457           {
458             memcpy(&sbuf_left[k*mloc],acol[k][top[0]],mloc*sizeof(double));
459           }
460           memcpy(&sbuf_left[a.size()*mloc],ucol[top[0]],mloc*sizeof(double) );
461           ctxt.dsend(bufsize,1,&sbuf_left[0],bufsize,
462                      ctxt.myrow(),ctxt.mycol()-1);
463           ctxt.drecv(bufsize,1,&rbuf_left[0],bufsize,
464                      ctxt.myrow(),ctxt.mycol()-1);
465           for ( int k = 0; k < a.size(); k++ )
466           {
467             memcpy(acol[k][top[0]],&rbuf_left[k*mloc],mloc*sizeof(double) );
468           }
469           memcpy(ucol[top[0]],&rbuf_left[a.size()*mloc],mloc*sizeof(double) );
470           //cout << ctxt.mype() << ": received vector jglobal="
471           //     << jglobal[top[0]] << " from left" << endl;
472         }
473         tm_comm.stop();
474       } // if nploc > 0
475 
476       // end of step
477 
478     } // for irot
479 
480     // sweep is complete
481     tm_comm.start();
482     ctxt.dsum('r',1,1,&diag_change,1);
483     tm_comm.stop();
484 
485     // compute sum of squares of diagonal elements
486 
487     if ( debug_diag_sum )
488     {
489       // compute sum of squares of diagonal elements using current values
490       // (after rotation)
491 #ifdef DEBUG
492       previous_diag_sum = diag_sum;
493 #endif
494       diag_sum = 0.0;
495       for ( int k = 0; k < a.size(); k++ )
496       {
497         for ( int ipair = 0; ipair < nploc; ipair++ )
498         {
499           double tmp[2] = { 0.0, 0.0 };
500           // compute the diagonal elements
501           // skip dummy vectors
502           int one = 1;
503           if ( jglobal[top[ipair]] >= 0 )
504           {
505             const double *ap = acol[k][top[ipair]];
506             const double *up = ucol[top[ipair]];
507             tmp[0] = ddot(&mloc,ap,&one,up,&one);
508           }
509           if ( jglobal[bot[ipair]] >= 0 )
510           {
511             const double *aq = acol[k][bot[ipair]];
512             const double *uq = ucol[bot[ipair]];
513             tmp[1] = ddot(&mloc,aq,&one,uq,&one);
514           }
515           // tmp now contains partial sums of app and aqq
516           ctxt.dsum('c',2,1,tmp,2);
517           // tmp now contains the diagonal elements app and aqq
518           diag_sum += tmp[0]*tmp[0] + tmp[1]*tmp[1];
519         }
520       }
521       ctxt.dsum('r',1,1,&diag_sum,1);
522 #ifdef DEBUG
523       const double diag_sum_increase = diag_sum - previous_diag_sum;
524       if ( ctxt.onpe0() )
525         cout << " jade: nsweep=" << nsweep
526              << " dsum: "
527              << setw(15) << setprecision(10) << diag_sum
528              << " dsum_inc: "
529              << setw(15) << setprecision(10) << diag_sum_increase << endl;
530 #endif
531     }
532 #ifdef DEBUG
533     if ( ctxt.onpe0() )
534       cout << " jade: nsweep=" << nsweep
535            << " dchange: "
536            << setw(15) << setprecision(10) << diag_change << endl;
537 #endif
538     done = ( ( fabs(diag_change) < tol ) || ( nsweep >= maxsweep ) );
539 
540   } // while !done
541 
542 #ifdef DEBUG
543   //print final values of jglobal[top[i]] and jglobal[bot[i]]
544   cout << "final jglobal[top/bot] values:" << endl;
545   cout << ctxt.mype() << ": top: ";
546   for ( int i = 0; i < nploc; i++ )
547     cout << " " << jglobal[top[i]];
548   cout << endl;
549   cout << ctxt.mype() << ": bot: ";
550   for ( int i = 0; i < nploc; i++ )
551     cout << " " << jglobal[bot[i]];
552   cout << endl;
553   //print final values of top and bot
554   cout << "final top/bot values:" << endl;
555   cout << ctxt.mype() << ": top: ";
556   for ( int i = 0; i < nploc; i++ )
557     cout << " " << top[i];
558   cout << endl;
559   cout << ctxt.mype() << ": bot: ";
560   for ( int i = 0; i < nploc; i++ )
561     cout << " " << bot[i];
562   cout << endl;
563 #endif
564 
565   // rotate columns of a and u to restore original order
566   double *tmpmat = new double[nloc*mloc];
567   // rotate columns of a
568   for ( int k = 0; k < a.size(); k++ )
569   {
570     for ( int ipair = 0; ipair < nploc; ipair++ )
571     {
572       // copy columns of a[k] to temporary array tmpmat in original order
573       if ( jglobal[top[ipair]] >= 0 )
574       {
575         memcpy(&tmpmat[ipair*mloc], acol[k][top[ipair]], mloc*sizeof(double));
576       }
577       if ( jglobal[bot[nploc-ipair-1]] >= 0 )
578       {
579         memcpy(&tmpmat[(nploc+ipair)*mloc],acol[k][bot[nploc-ipair-1]],
580                mloc*sizeof(double));
581       }
582     }
583     // copy tmpmat back to a[k]
584     memcpy(acol[k][0],tmpmat,nloc*mloc*sizeof(double));
585   }
586 
587   // rotate columns of u
588   for ( int ipair = 0; ipair < nploc; ipair++ )
589   {
590     // copy columns of u to temporary array tmpmat in original order
591     if ( jglobal[top[ipair]] >= 0 )
592     {
593       memcpy(&tmpmat[ipair*mloc], ucol[top[ipair]], mloc*sizeof(double));
594     }
595     if ( jglobal[bot[nploc-ipair-1]] >= 0 )
596     {
597       memcpy(&tmpmat[(nploc+ipair)*mloc],ucol[bot[nploc-ipair-1]],
598              mloc*sizeof(double));
599     }
600   }
601   // copy tmpmat back to u
602   memcpy(ucol[0],tmpmat,nloc*mloc*sizeof(double));
603   delete [] tmpmat;
604 
605   // compute diagonal values
606   for ( int k = 0; k < a.size(); k++ )
607   {
608     for ( int i = 0; i < a[k]->n(); i++ )
609       adiag[k][i] = 0.0;
610     for ( int jblock = 0; jblock < a[k]->nblocks(); jblock++ )
611       for ( int y = 0; y < a[k]->nbs(jblock); y++ )
612       {
613         // j is the global column index
614         int j = a[k]->j(jblock,y);
615         int jjj = y + jblock*a[k]->nb();
616         const double *ap = a[k]->valptr(jjj*a[k]->mloc());
617         const double *up = u.valptr(jjj*u.mloc());
618         int one = 1;
619         adiag[k][j] = ddot(&mloc,ap,&one,up,&one);
620       }
621     // adiag[k][i] now contains the partial sums of the diagonal elements of a
622     tm_comm.start();
623     ctxt.dsum(a[k]->n(),1,&adiag[k][0],a[k]->n());
624     tm_comm.stop();
625     // adiag[k] contains the diagonal elements of a[k]
626     // u contains the orthogonal transformation minimizing the spread
627   }
628 
629 #ifdef DEBUG
630   if ( ctxt.onpe0() )
631     cout << " jade: comm time: " << tm_comm.real() << endl;
632 #endif
633   return nsweep;
634 }
635