1 /*
2  $Id$
3  *======================================================================
4  *
5  * DISCLAIMER
6  *
7  * This material was prepared as an account of work sponsored by an
8  * agency of the United States Government.  Neither the United States
9  * Government nor the United States Department of Energy, nor Battelle,
10  * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11  * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
12  * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
13  * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
14  * INFRINGE PRIVATELY OWNED RIGHTS.
15  *
16  * ACKNOWLEDGMENT
17  *
18  * This software and its documentation were produced with Government
19  * support under Contract Number DE-AC06-76RLO-1830 awarded by the United
20  * States Department of Energy.  The Government retains a paid-up
21  * non-exclusive, irrevocable worldwide license to reproduce, prepare
22  * derivative works, perform publicly and display publicly by or for the
23  * Government, including the right to distribute to other Government
24  * contractors.
25  *
26  *======================================================================
27  *
28  *  -- PEIGS  routine (version 2.2) --
29  *     Pacific Northwest Laboratory
30  *     July 28, 1995
31  *
32  *======================================================================
33  */
34 #include <stdio.h>
35 #include <memory.h>
36 #include <math.h>
37 
38 #include "blas_lapack.h"
39 #include "globalp.c.h"
40 
41 #define PANELSIZE 5
42 
43 #define max(a,b) ((a) > (b) ? (a) : (b))
44 #define min(a,b) ((a) < (b) ? (a) : (b))
45 #define ffabs(a) ((a) > (0.) ? (a) : (-a))
46 #define sgn(a) ((a) >= (0.) ? (1.e0) : (-1.e0))
47 
48 /*
49    Internal PeIGS routine
50 
51    modified gram-schmidt, 1-D systolic wrap panelized version
52    */
53 
mgs_3(n,colF,mapF,b1,bn,nvecsZ,first,first_buf,iscratch,scratch)54 void mgs_3( n, colF, mapF, b1, bn, nvecsZ, first, first_buf, iscratch, scratch)
55      Integer *n, mapF[], *b1, *bn, *nvecsZ, *first, iscratch[];
56      DoublePrecision **colF, first_buf[], scratch[];
57 {
58 
59   /*
60      n = number of vector to be orthogonalized
61      b1 = beginning of vector subscript
62      bn = end of vector subscript (e.g. colF[i][b1:bn] )
63      colF = double pointer to the matrix
64      mapF = array describing the processors holding columns of F
65      first = 1, then mgs only my local block
66                 otherwise do full mgs.
67      iscratch = integer scratch space
68      scratch = double precision scratch space
69      */
70 
71   static Integer IONE = 1, MONE = (DoublePrecision) -1.0e0;
72   Integer jndx, vec_len;
73   Integer i, k, me, isize, indx;
74   Integer nvecs_in, nvecs, iii;
75   Integer bb, nproc;
76   Integer rsize, mvecs, kk, iter;
77 
78   Integer *nleft;
79   Integer *iscrat, *proclist, naproc;
80   Integer me_indx, max_vecs, max_panel;
81 
82   DoublePrecision *buffer, *in_buffer;
83   DoublePrecision t, *dptr, *dptr1;
84 
85   /* blas calls */
86 
87   extern void dcopy_(), daxpy_(), dscal_();
88   extern DoublePrecision ddot_();
89   extern DoublePrecision gdot_();
90   extern DoublePrecision dnrm2_();
91 
92   extern void bbcast00();
93   extern Integer count_list();
94   extern Integer mxmynd_();
95   extern Integer reduce_list4();
96   extern Integer indxL();
97 
98 
99   me = mxmynd_();
100   naproc = mxnprc_();
101 
102 #ifdef DEBUG1
103    fprintf(stderr, " me = %d in mgs \n", me );
104 #endif
105 #ifdef DEBUG2
106    fprintf(stderr, " \n" );
107    i = *nvecsZ-1;
108    for( iii = 0; iii < *n; iii++)
109      if( mapF[iii] == me ) {
110        i++;
111        for( j = *b1; j <= *bn; j++)
112 	 fprintf(stderr, " mgs1b me = %d vecZ[%d][%d] = %g \n",
113 		 me, iii, j, colF[i][j]);
114      }
115 
116    fprintf(stderr, " \n" );
117    for( iii = 0; iii < *n; iii++)
118      fprintf(stderr, " mgs5 me = %d mapZ[%d] = %d \n", me, iii, mapF[iii]);
119 #endif
120 
121    nvecs = count_list( me, mapF, n );
122 
123    if ( nvecs == 0 )
124      return;
125 
126    vec_len = *bn - *b1 + 1;
127    bb      = *b1;
128 
129    iscrat = iscratch;
130 
131    proclist = iscrat;
132    nproc = reduce_list4( *n, mapF, proclist, &iscrat[naproc] );
133    iscrat += nproc;
134 
135    nleft = iscrat;
136    iscrat += nproc;
137 
138 #ifdef DEBUG1
139    fprintf(stderr, " me = %d mgs nprocs = %d nvecs = %d \n", me , nproc, nvecs);
140 #endif
141 
142    if( *first == 1  || nproc == 1 ) {
143 
144      /* mgs local block and return */
145 
146      k = *nvecsZ;
147      for ( jndx = k; jndx < k + nvecs; jndx++ ){
148        dptr = &colF[jndx][bb];
149        t = dnrm2_( &vec_len, dptr, &IONE );
150        t = 1.0e0/t;
151        dscal_( &vec_len, &t, dptr, &IONE);
152        for ( indx = jndx + 1; indx < k + nvecs; indx++ ){
153 	 dptr1 = &colF[indx][bb];
154 	 t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE );
155 	 t *= MONE;
156 	 daxpy_( &vec_len, &t, dptr, &IONE, dptr1, &IONE );
157        }
158      }
159      return;
160    }
161 
162    k = *nvecsZ;
163 
164    buffer = (DoublePrecision *) scratch;
165    in_buffer = buffer;
166 
167    /*
168     *  MGS my part of cluster against the first part of the cluster,
169     *  which is stored in first_buf.
170     */
171 
172    nvecs_in = count_list( proclist[0], mapF, n );
173 
174    if( me == proclist[1] ) {
175      isize = vec_len * nvecs_in;
176      dcopy_( &isize, first_buf, &IONE, in_buffer, &IONE);
177    }
178 
179    if( nproc > 2 ) {
180      rsize = nvecs_in * vec_len * sizeof(DoublePrecision);
181 
182      if ( rsize != 0 )
183        bbcast00( (char *) in_buffer, rsize, 999,  proclist[1],
184 		 nproc-1, &proclist[1] );
185    }
186 
187    dptr = in_buffer;
188    for ( iii = k; iii < k + nvecs; iii++ ){
189      dptr = &colF[iii][bb];
190      dptr1 = in_buffer;
191      for ( jndx = 0; jndx < nvecs_in; jndx++ ){
192        t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE);
193        t *= MONE;
194        daxpy_( &vec_len, &t, dptr1, &IONE, dptr, &IONE );
195        dptr1 += vec_len;
196      }
197    }
198 
199    /*
200     * MGS the rest of the cluster.
201     */
202 
203    me_indx = indxL( me, nproc, proclist);
204 
205    /*
206     * Determine maximum number of panels owned by
207     * any processor in this cluster
208     */
209 
210    max_vecs = 0;
211    for ( i = 1; i < nproc; i++ ) {
212      kk       = count_list( proclist[i], mapF, n );
213      nleft[i] = kk;
214      max_vecs = max( max_vecs, kk );
215    }
216 
217    max_panel = max_vecs / PANELSIZE;
218    if( max_panel * PANELSIZE != max_vecs )
219      max_panel++;
220 
221    kk = k;
222    for ( iter = 0; iter < max_panel; iter++ ){
223      for( i = 1; i < nproc; i++ ) {
224        mvecs = min( nleft[i], PANELSIZE );
225        if ( mvecs == 0 )
226 	 continue;
227        nleft[i] -= mvecs;
228        if ( me_indx == i ) {
229         for ( jndx = kk; jndx < kk + mvecs; jndx++ ){
230  	  dptr = &colF[jndx][bb];
231  	  t = dnrm2_( &vec_len, dptr, &IONE );
232 	if ( t != 1.0e0 ) {
233 	  t = (DoublePrecision) 1.0e0/t;
234 	  dscal_( &vec_len, &t, dptr, &IONE);
235 	}
236 	  for ( indx = jndx + 1; indx < kk + mvecs; indx++ ){
237 	    dptr1 = &colF[indx][bb];
238 	    t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE );
239 	   if ( fabs(t) > DLAMCHE ) {
240 	    t *= MONE;
241 	    daxpy_( &vec_len, &t, dptr, &IONE, dptr1, &IONE );
242 	}
243 	  }
244 	}
245 
246         /* load up buffer */
247 
248    	dptr = in_buffer;
249 	for ( indx = kk; indx < kk + mvecs; indx++ ){
250 	  dcopy_( &vec_len, &colF[indx][bb], &IONE, dptr, &IONE);
251 	  dptr += vec_len;
252         }
253 
254         kk += mvecs;
255        }
256 
257       rsize = mvecs * vec_len * sizeof(DoublePrecision);
258 
259       bbcast00( (char *) in_buffer, rsize, 888,  proclist[i],
260                 nproc-1, &proclist[1] );
261 
262       /*
263        * the buffer contains incoming orthonormal vectors
264        */
265 
266       dptr = in_buffer;
267       for ( iii = kk; iii < k + nvecs; iii++ ){
268         dptr = &colF[iii][bb];
269         dptr1 = in_buffer;
270         for ( jndx = 0; jndx < mvecs; jndx++ ){
271           t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE);
272 	if ( fabs(t) > DLAMCHE ) {
273 	  t *= MONE;
274 	  daxpy_( &vec_len, &t, dptr1, &IONE, dptr, &IONE );
275 	}
276           dptr1 += vec_len;
277 	}
278       }
279      }
280    }
281    return;
282 }
283 
284 
285