/*
 $Id$
 *======================================================================
 *
 * DISCLAIMER
 *
 * This material was prepared as an account of work sponsored by an
 * agency of the United States Government.  Neither the United States
 * Government nor the United States Department of Energy, nor Battelle,
 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
 * INFRINGE PRIVATELY OWNED RIGHTS.
 *
 * ACKNOWLEDGMENT
 *
 * This software and its documentation were produced with Government
 * support under Contract Number DE-AC06-76RLO-1830 awarded by the United
 * States Department of Energy.  The Government retains a paid-up
 * non-exclusive, irrevocable worldwide license to reproduce, prepare
 * derivative works, perform publicly and display publicly by or for the
 * Government, including the right to distribute to other Government
 * contractors.
 *
 *======================================================================
 *
 *  -- PEIGS  routine (version 2.1) --
 *     Pacific Northwest Laboratory
 *     July 28, 1995
 *
 *======================================================================
 */
/*
  PeIGS internal routine:

  not intended for external use; see the routine pstein

  Integer clustrinv4_(n, d, e, dplus, lplus, eval, schedule, num_clustr, mapZ, mapvecZ, vecZ, imin, nacluster, icsplit, iscratch, scratch)

  Integer *n, *schedule, *num_clustr, *mapZ, *mapvecZ, *imin, *nacluster,
          *icsplit, *iscratch;

  DoublePrecision *d, *e, *dplus, *lplus, *eval, **vecZ, *scratch;


  Computes eigenvectors by inverse iteration with cluster-modified Gram-Schmidt
  and "de-tangling" of clusters.

  returns 0     if everything seems ok.
          k > 0 if the k-th eigenvector (owned by processor mapZ[k-1], k = 1 to neigval)
                failed to converge in inverse iteration.  In this case eigenvectors
                1 to k-1 converged ok, eigenvector k failed to converge, and it is
                possible that some eigenvectors numbered greater than k also failed
                to converge.  In any case, the first k-1 eigenvectors are good,
                eigenvector k is bad, and eigenvectors k+1 to neigval are
                probably bad too.  The meaning of "bad" is not always clear;
                in general it means that the eigenvector is probably
                not accurate to full precision.


  This code "de-tangles" clusters as follows.  Assume the eigenvalues in a cluster
  are owned by more than one processor (as determined by mapZ).  Then the processor
  owning the first eigenvector in the cluster does all required iterations of
  inverse iteration and mgs to compute the eigenvectors in this cluster.  It then
  sends these finished eigenvectors to the second processor in mapZ which owns part
  of this cluster.  At this point the clusters are de-tangled and all processors can
  start working on their own eigenvectors.

  This version of the code assumes that during de-tangling each processor owns the
  first eigenvector in at most one multi-processor cluster and receives the finished
  eigenvectors from at most one other processor.  These assumptions are satisfied by
  the current algorithm in clustrf and are checked by this routine.

  */
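/*
  Illustration only (not part of the PeIGS library): a minimal sketch, using
  hypothetical cluster bounds c1 and cn, of how mapZ alone determines both the
  processor that computes a multi-processor cluster and the processor that
  receives the finished vectors.  The real bookkeeping is done by the
  scheduling loops in the body of clustrinv4_ below.

      owner    = mapZ[c1];      owner runs inverse iteration + mgs for c1..cn
      receiver = -1;
      for (j = c1 + 1; j <= cn; j++)
        if (mapZ[j] != mapZ[c1]) {
          receiver = mapZ[j];   first processor after the owner listed in mapZ
          break;
        }

  If receiver is not -1, the owner ships its finished vectors for this cluster
  to receiver; after that exchange the clusters are de-tangled and every
  processor can proceed independently.
  */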

#include <stdio.h>
#include <math.h>

#include "globalp.c.h"
#include "clustr_inv.h"

/*
#define DEBUG1
#define DEBUG5
*/
#define CLUSTRLEN  4
#define LOOP  3
#define INV_TIME 2
#define ITIME1  1

Integer clustrinv4_(n, d, e, dplus, lplus, eval, schedule, num_clustr, mapZ, mapvecZ, vecZ, imin, nacluster, icsplit, iscratch, scratch)
     Integer *n, *schedule, *num_clustr, *mapZ, *mapvecZ, *imin,
  *nacluster, *icsplit, *iscratch;
     DoublePrecision *d, *e, *eval, **vecZ, *scratch, *dplus, *lplus;
{
  /*
    n = dimension of the tridiagonal matrix
    d = diagonal of the tridiagonal matrix
    e[2:n] = upper diagonal of the tridiagonal matrix
    schedule = an integer array holding the scheduling information
    mapZ = location of the eigenvectors
    mapvecZ = in
    vecZ
    iscratch = integer scratch space
    scratch = DoublePrecision scratch space

    returns 0 if all eigenvectors computed here appear to have converged;
    otherwise see the description of convergence failures in the header
    comment above
    */

  static Integer three = 3, IONE = 1;
  Integer indx, i, j, iseed[4], bb1, bn;
  Integer blksiz, clustr_ptr, cn;
  Integer me, naproc, Zvec;
  Integer *cl_ptr;
  Integer c1, csiz, xc1, xcsiz, xblksiz;
  Integer cl_num;
  Integer itime;
  Integer send_num, send_cl, send_to,
          recv_num, recv_cl, recv_from,
          myindx, itype, nvecs, isize, ival, first, ibad, itmp;

  DoublePrecision stpcrt, onenrm, eps;
  DoublePrecision tmp, *dscrat, *first_buf;

  extern void xerbla_();
  extern Integer idamax_(), mclock_(), succ_(), mxmynd_(), mxnprc_();
  extern Integer mxread_(), mxwrit_();
  extern void mxsync_();
  extern DoublePrecision dasum_(), dlamch_(), dnrm2_(), ddot_(), dlarnd_();
  extern void dlagtf_(), dlarnv_();
  extern void printff_(), mgs_3();
  extern void mgspnl_(), dscal_(), dlagts_();
  extern void dcopy_(), daxpy_();
  extern Integer count_list ();
  extern Integer inv_it();
  extern Integer inv_it4();
  extern void mgs ();
  extern void fil_dbl_lst ();

  /*
    Function Body
    */

  me = mxmynd_();
  naproc = mxnprc_();

  ibad = 0;

  dscrat = scratch;

#ifdef DEBUG5

  if( me == mapZ[0] ){
    fprintf(stderr, " nacluster = %d \n", *nacluster );
    cn = -1;
    for( j = 0; j < *nacluster; j++ ) {
      c1 = cn + 1;
      cn = icsplit[j];
      if( cn > c1 )
        fprintf( stderr, " cluster[%d] = %d to %d  owned by %d to %d \n",
                 j,c1,cn, mapZ[c1], mapZ[cn] );
    }
    for( j = 0; j < *n; j++ )
      fprintf( stderr, " eval[%d] = %g \n", j, eval[j]);
    for( j = 0; j < *n; j++ )
      fprintf( stderr, " d[%d] = %g e[%d] = %g \n", j, d[j], j, e[j] );
  }
  mxsync_();
  exit(-1);
#endif
#ifdef DEBUG1
  fprintf(stderr, " in clustrxx me = %d \n", me );

  if( me == mapZ[0] ){
    cn = -1;
    for( j = 0; j < *nacluster; j++ ) {
      c1 = cn + 1;
      cn = icsplit[j];
      if( cn > c1 )
        fprintf( stderr, " cluster[%d] = %d to %d  owned by %d to %d \n",
                 j,c1,cn, mapZ[c1], mapZ[cn] );
    }
  }
#endif

  /*
    Get machine constants.  This should be set up somewhere so that it is
    called only once.
    */

  /*
    eps = dlamch_("E");
    */

  eps = DLAMCHE;

  /*
    Initialize the seed for the random number generator DLARND.
    how about randomizing the randomness?
    */

  blksiz = 1;

  for (i = 0; i < 4; ++i)
    iseed[i] = 1;

  iseed[3] = 2*me + 1;

  /*
    Compute eigenvectors of matrix blocks.
    */

  /*
    this is the number of clusters that this processor will process
    */


  cl_num = *num_clustr;
  Zvec = 0;

  if ( cl_num == 0 )
    return(ibad);

  /*
   * If I own the start of my last cluster, but not all of it, then do my
   * part of that cluster and send it to the processor which owns the first
   * vector in the cluster that is not mine.
   */
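  /*
   * Layout of schedule[] as read below: 4 Integers per cluster assigned to
   * this processor,
   *
   *   schedule[4*i]   = c1  : index of the first eigenvector in cluster i
   *   schedule[4*i+1] = cn  : index of the last eigenvector in cluster i
   *   schedule[4*i+2] = bb1 : first row of the tridiagonal block holding the cluster
   *   schedule[4*i+3] = bn  : last row of that block
   *
   * For example (hypothetical values only), schedule = { 0, 3, 0, 9, ... }
   * would describe a cluster of eigenvectors 0..3 whose block spans rows 0..9.
   */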

  cl_ptr = schedule;

  send_num = 0;
  recv_num = 0;
  if( naproc > 1 ) {
    for (clustr_ptr= 0;  clustr_ptr < cl_num ; clustr_ptr++) {

      c1 = *(cl_ptr++);
      cn = *(cl_ptr++);
      bb1 = *(cl_ptr++);
      bn = *(cl_ptr++);

      if( cn > c1 ) {
        if( mapZ[c1] == me ) {
          for (j = c1+1; j <= cn; j++ ){
            if ( mapZ[j] != me ) {
              send_num++;
              send_cl = clustr_ptr;
              send_to = mapZ[j];
              break;
            }
          }
        }
        else {

          for (j = c1+1; j <= cn; j++ ){
            if ( mapZ[j] != mapZ[c1] ) {
              if ( mapZ[j] == me ) {
                recv_num++;
                recv_cl   = clustr_ptr;
                recv_from = mapZ[c1];
              }
              break;
            }
          }
        }
      }
    }

    /*
     *
     * The following tests verify that the "de-tangling" assumptions listed
     * in the subroutine header (sending and/or receiving at most one
     * finished set of eigenvectors) are satisfied.
     *
     * The exit(-1) conditions should only occur if the cluster scheduling
     * algorithm is changed from its current form without modifying the
     * de-tangling code appropriately.  Thus, the following exits should
     * never be executed in the tested, distribution version of the code,
     * and we do not try to do a global exit here.
     *
     */

    if( send_num > 1 || recv_num > 1 ) {
      fprintf( stderr, " me = %d Internal Error in PEIGS clustrinv. \n", me );
      fprintf( stderr, " me = %d recv_num and/or send_num > 1. \n", me);
      exit( -1 );
    }
    if( ( send_num > 0 ) && ( send_cl != cl_num - 1) ) {
        fprintf( stderr, " me = %d Internal Error in PEIGS clustrinv. \n", me );
        fprintf( stderr, " me = %d send_cl != cl_num - 1. \n", me);
        exit( -1 );
    }
    if( ( recv_num > 0 ) && ( recv_cl != 0 ) ) {
        fprintf( stderr, " me = %d Internal Error in PEIGS clustrinv. \n", me );
        fprintf( stderr, " me = %d recv_cl != 0. \n", me);
        exit( -1 );
    }
  }

  if( recv_num > 0 || send_num > 0 ) {

    /* Set iscratch to a linked list such that iscratch[j] = k means
     * processor j receives from processor k when the first block of a
     * cluster is sent to the next processor in that cluster.
     * k = -1 means processor j receives from no one.
     * We only really care about the part of the linked list
     * that includes 'me'.
     */
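    /*
     * Hypothetical example: with naproc = 4, one cluster owned by processors
     * {0,1} and another owned by processors {1,2}, the loop below yields
     * iscratch = { -1, 0, 1, -1 }: processor 1 receives its first block from
     * processor 0, processor 2 from processor 1, and processors 0 and 3
     * receive from no one.  The chain walk that follows then gives, e.g.,
     * myindx = 2 on processor 2.
     */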

    for (j= 0;  j < naproc; j++)
      iscratch[j] = -1;

    cn = -1;
    for( i = 0; i < *nacluster; i++ ) {
      c1 = cn + 1;
      cn = icsplit[i];
      if( cn > c1 ) {
        for (j = c1+1; j <= cn; j++ )
          if ( mapZ[j] != mapZ[c1] ) {
            iscratch[ mapZ[j] ] = mapZ[c1];
            break;
          }
      }
    }

    myindx = 0;
    j      = me;
    for (i= 0;  i < naproc; i++) {
      if( iscratch[j] == -1 )
        break;

      myindx++;
      j = iscratch[j];
    }

    if( iscratch[j] != -1 ) {
      fprintf( stderr, " me = %d Internal Error in PEIGS clustrinv. \n", me );
      fprintf( stderr, " me = %d Swapping of initial cluster data \n", me);
      fprintf( stderr, " me = %d does not have a well defined start. \n", me);
      fprintf( stderr, " me = %d clustrinv,mgs cannot handle this. \n", me);
      exit( -1 );
    }

    if( send_num == 0 ) {
      itype = 999999;

      c1     = schedule[4*recv_cl];
      csiz   = schedule[4*recv_cl+1] - c1 + 1;

      blksiz = schedule[4*recv_cl+3] - schedule[4*recv_cl+2] + 1;

      nvecs  = count_list( recv_from, &mapZ[c1], &csiz);

      isize  = sizeof( DoublePrecision ) * blksiz * nvecs;

      first_buf = dscrat;
      dscrat += nvecs * blksiz;

#ifdef DEBUG1
  fprintf(stderr, " me = %d Just before mxread isize = %d nvecs = %d \n", me, isize, nvecs );
#endif

      ival = mxread_( first_buf, &isize, &recv_from, &itype );

#ifdef DEBUG1
  fprintf(stderr, " me = %d Just after mxread \n", me );
  for( j = 0; j < nvecs*blksiz; j++)
       fprintf(stderr, " me = %d first_buf[%d] = %g \n",
                     me, j, first_buf[j]);
#endif

    }
  }

  cl_ptr = schedule;
  for (clustr_ptr= 0;  clustr_ptr < cl_num ; clustr_ptr++) {

    if( clustr_ptr == 0 && send_num > 0 ) {

        c1  = schedule[ 4*send_cl ];
        cn  = schedule[ 4*send_cl + 1 ];
        bb1 = schedule[ 4*send_cl + 2 ];
        bn  = schedule[ 4*send_cl + 3 ];

        if( clustr_ptr == send_cl )
          cl_ptr += 4;
    }
    else {

      c1 = *(cl_ptr++);
      cn = *(cl_ptr++);
      bb1 = *(cl_ptr++);
      bn = *(cl_ptr++);

      if( clustr_ptr == send_cl && send_num > 0 )
        cl_ptr += 4;

    }


    if ( c1 < *imin ) {
      Zvec = 0;
    }
    else {
      Zvec = c1 - *imin;
    }


    blksiz = bn - bb1 + 1;
    csiz = cn - c1 + 1;

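    /* onenrm below is the 1-norm (maximum absolute row sum) of this
       tridiagonal block; it is passed to inv_it4 together with eps and
       stpcrt, mirroring the ONENRM quantity used by LAPACK's dstein. */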
    onenrm = fabs(d[bb1]) + fabs(e[bb1 + 1]);
    tmp = fabs(d[bn])+ fabs(e[bn]);
    onenrm = max(onenrm,tmp);

    for (i = bb1 + 1; i < bn; ++i){
      tmp = fabs(d[i])+fabs(e[i])+fabs(e[i + 1]);
      onenrm = max(onenrm, tmp );
    }

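    /* stpcrt is the inverse-iteration stopping criterion sqrt(0.1 / blksiz),
       the same choice LAPACK's dstein makes for a block of this size. */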
    stpcrt = sqrt((DoublePrecision ) 1.0e-1 / (DoublePrecision ) blksiz);

    for (i = 0; i < 4; ++i)
      iseed[i] = 1;
    iseed[3] = 1;

    indx = 0;
    for (j = c1; j <= cn; j++ ){
      if ( mapZ[j] == me ) {
        i = indx + Zvec;

        /*
          mapvecZ[ i ] = j;
          */

        /*
          Initialize vector to zero.
          */

        fil_dbl_lst (*n, vecZ[i], 0.0e0);
        dlarnv_(&three, &iseed[0], &blksiz, &vecZ[i][bb1]);
        indx++;
      }
    }

#ifdef DEBUG1
    fprintf(stderr, " just before inv_it and mgs of clustrxx me = %d c1 = %d cn = %d \n", me, c1, cn );
#endif

    first = 0;
    if( clustr_ptr == 0 && send_num > 0 )
      first = 1;

    itime = 1;
    for ( j = 0; j < INV_TIME; j++ ) {
      itmp = inv_it4( n, &c1, &cn, &bb1, &bn, &Zvec, &mapZ[c1], mapvecZ, vecZ,
                      dplus, lplus, eval, &eps, &stpcrt, &onenrm, iscratch, dscrat);

      if( itmp >  0 )
        if( ibad == 0 || itmp < ibad )
          ibad = itmp;

      if ( c1 != cn ) {
        for ( i = 0; i < itime ; i++ ) {
          mgs_3( &csiz, vecZ, &mapZ[c1], &bb1, &bn, &Zvec, &first, first_buf, iscratch, dscrat);
        }
        itime = 1;
      }
    }

#ifdef DEBUG1
  fprintf(stderr, " clustrxx3 me = %d before send/rec \n", me );
#endif

    /*
     * Swap beginning portions of clusters which are distributed
     * across more than one processor.
     */

    if( clustr_ptr == 0 && send_num > 0 ) {

      itype = 999999;

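      /*
       * Ordering of the exchange along the receive chain: a processor at an
       * even position in the chain (myindx % 2 == 0) posts its receive before
       * the mxwrit_ below, one at an odd position receives only after sending.
       * Alternating the order this way pairs each send with a receive on the
       * neighboring processor, which is presumably what keeps the exchange
       * from deadlocking when mxwrit_/mxread_ block.
       */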
      if( recv_num > 0  &&  (( myindx % 2 ) == 0 ) ) {
        xc1     = schedule[4*recv_cl];
        xcsiz   = schedule[4*recv_cl+1] - xc1 + 1;

        xblksiz = schedule[4*recv_cl+3] - schedule[4*recv_cl+2] + 1;

        nvecs  = count_list( recv_from, &mapZ[xc1], &xcsiz);
        isize = sizeof( DoublePrecision ) * xblksiz * nvecs;

        first_buf = dscrat;
        dscrat += nvecs * xblksiz;

#ifdef DEBUG1
  fprintf(stderr, " me = %d Just before mxread 2 isize = %d nvecs = %d \n", me, isize, nvecs );
#endif

        ival = mxread_( first_buf, &isize, &recv_from, &itype );
      }

      nvecs = 0;
      for (j = c1; j <= cn; j++ ){
        if ( mapZ[j] == me ) {
          dcopy_(&blksiz, &vecZ[nvecs+Zvec][bb1], &IONE,
                 dscrat+nvecs*blksiz, &IONE );
          nvecs++;
        }
      }

      isize = sizeof( DoublePrecision ) * blksiz * nvecs;
#ifdef DEBUG1
  fprintf(stderr, " me = %d Just before mxwrit isize = %d nvecs = %d \n", me, isize, nvecs );
  for( j = 0; j < nvecs*blksiz; j++)
       fprintf(stderr, " me = %d sending dscrat[%d] = %g \n",
                     me, j, dscrat[j]);
#endif

      ival = mxwrit_( dscrat, &isize, &send_to, &itype );

      if( recv_num > 0  &&  (( myindx % 2 ) != 0 ) ) {
        xc1     = schedule[4*recv_cl];
        xcsiz   = schedule[4*recv_cl+1] - xc1 + 1;

        xblksiz = schedule[4*recv_cl+3] - schedule[4*recv_cl+2] + 1;

        nvecs  = count_list( recv_from, &mapZ[xc1], &xcsiz);
        isize = sizeof( DoublePrecision ) * xblksiz * nvecs;

        first_buf = dscrat;
        dscrat += nvecs * xblksiz;

#ifdef DEBUG1
  fprintf(stderr, " me = %d Just before mxread 3 isize = %d nvecs = %d \n", me, isize, nvecs );
#endif

        ival = mxread_( first_buf, &isize, &recv_from, &itype );
      }
    }
#ifdef DEBUG1
  fprintf(stderr, " clustrxx3 me = %d after send/rec \n", me );
#endif

  }

#ifdef DEBUG1
  fprintf(stderr, " me = %d Exiting clustrinv_ \n", me );
#endif

  return(ibad);
}