1 /*
2 $Id$
3 *======================================================================
4 *
5 * DISCLAIMER
6 *
7 * This material was prepared as an account of work sponsored by an
8 * agency of the United States Government. Neither the United States
9 * Government nor the United States Department of Energy, nor Battelle,
10 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
12 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
13 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
14 * INFRINGE PRIVATELY OWNED RIGHTS.
15 *
16 * ACKNOWLEDGMENT
17 *
18 * This software and its documentation were produced with Government
19 * support under Contract Number DE-AC06-76RLO-1830 awarded by the United
20 * States Department of Energy. The Government retains a paid-up
21 * non-exclusive, irrevocable worldwide license to reproduce, prepare
22 * derivative works, perform publicly and display publicly by or for the
23 * Government, including the right to distribute to other Government
24 * contractors.
25 *
26 *======================================================================
27 *
28 * -- PEIGS routine (version 2.2) --
29 * Pacific Northwest Laboratory
30 * July 28, 1995
31 *
32 *======================================================================
33 */
34 #include <stdio.h>
35 #include <memory.h>
36 #include <math.h>
37
38 #include "blas_lapack.h"
39 #include "globalp.c.h"
40
41 #define PANELSIZE 5
42
43 #define max(a,b) ((a) > (b) ? (a) : (b))
44 #define min(a,b) ((a) < (b) ? (a) : (b))
45 #define ffabs(a) ((a) > (0.) ? (a) : (-a))
46 #define sgn(a) ((a) >= (0.) ? (1.e0) : (-1.e0))
47
48 /*
49 Internal PeIGS routine
50
51 modified gram-schmidt, 1-D systolic wrap panelized version
52 */
53
mgs_3(n,colF,mapF,b1,bn,nvecsZ,first,first_buf,iscratch,scratch)54 void mgs_3( n, colF, mapF, b1, bn, nvecsZ, first, first_buf, iscratch, scratch)
55 Integer *n, mapF[], *b1, *bn, *nvecsZ, *first, iscratch[];
56 DoublePrecision **colF, first_buf[], scratch[];
57 {
58
59 /*
60 n = number of vector to be orthogonalized
61 b1 = beginning of vector subscript
62 bn = end of vector subscript (e.g. colF[i][b1:bn] )
63 colF = double pointer to the matrix
64 mapF = array describing the processors holding columns of F
65 first = 1, then mgs only my local block
66 otherwise do full mgs.
67 iscratch = integer scratch space
68 scratch = double precision scratch space
69 */
70
71 static Integer IONE = 1, MONE = (DoublePrecision) -1.0e0;
72 Integer jndx, vec_len;
73 Integer i, k, me, isize, indx;
74 Integer nvecs_in, nvecs, iii;
75 Integer bb, nproc;
76 Integer rsize, mvecs, kk, iter;
77
78 Integer *nleft;
79 Integer *iscrat, *proclist, naproc;
80 Integer me_indx, max_vecs, max_panel;
81
82 DoublePrecision *buffer, *in_buffer;
83 DoublePrecision t, *dptr, *dptr1;
84
85 /* blas calls */
86
87 extern void dcopy_(), daxpy_(), dscal_();
88 extern DoublePrecision ddot_();
89 extern DoublePrecision gdot_();
90 extern DoublePrecision dnrm2_();
91
92 extern void bbcast00();
93 extern Integer count_list();
94 extern Integer mxmynd_();
95 extern Integer reduce_list4();
96 extern Integer indxL();
97
98
99 me = mxmynd_();
100 naproc = mxnprc_();
101
102 #ifdef DEBUG1
103 fprintf(stderr, " me = %d in mgs \n", me );
104 #endif
105 #ifdef DEBUG2
106 fprintf(stderr, " \n" );
107 i = *nvecsZ-1;
108 for( iii = 0; iii < *n; iii++)
109 if( mapF[iii] == me ) {
110 i++;
111 for( j = *b1; j <= *bn; j++)
112 fprintf(stderr, " mgs1b me = %d vecZ[%d][%d] = %g \n",
113 me, iii, j, colF[i][j]);
114 }
115
116 fprintf(stderr, " \n" );
117 for( iii = 0; iii < *n; iii++)
118 fprintf(stderr, " mgs5 me = %d mapZ[%d] = %d \n", me, iii, mapF[iii]);
119 #endif
120
121 nvecs = count_list( me, mapF, n );
122
123 if ( nvecs == 0 )
124 return;
125
126 vec_len = *bn - *b1 + 1;
127 bb = *b1;
128
129 iscrat = iscratch;
130
131 proclist = iscrat;
132 nproc = reduce_list4( *n, mapF, proclist, &iscrat[naproc] );
133 iscrat += nproc;
134
135 nleft = iscrat;
136 iscrat += nproc;
137
138 #ifdef DEBUG1
139 fprintf(stderr, " me = %d mgs nprocs = %d nvecs = %d \n", me , nproc, nvecs);
140 #endif
141
142 if( *first == 1 || nproc == 1 ) {
143
144 /* mgs local block and return */
145
146 k = *nvecsZ;
147 for ( jndx = k; jndx < k + nvecs; jndx++ ){
148 dptr = &colF[jndx][bb];
149 t = dnrm2_( &vec_len, dptr, &IONE );
150 t = 1.0e0/t;
151 dscal_( &vec_len, &t, dptr, &IONE);
152 for ( indx = jndx + 1; indx < k + nvecs; indx++ ){
153 dptr1 = &colF[indx][bb];
154 t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE );
155 t *= MONE;
156 daxpy_( &vec_len, &t, dptr, &IONE, dptr1, &IONE );
157 }
158 }
159 return;
160 }
161
162 k = *nvecsZ;
163
164 buffer = (DoublePrecision *) scratch;
165 in_buffer = buffer;
166
167 /*
168 * MGS my part of cluster against the first part of the cluster,
169 * which is stored in first_buf.
170 */
171
172 nvecs_in = count_list( proclist[0], mapF, n );
173
174 if( me == proclist[1] ) {
175 isize = vec_len * nvecs_in;
176 dcopy_( &isize, first_buf, &IONE, in_buffer, &IONE);
177 }
178
179 if( nproc > 2 ) {
180 rsize = nvecs_in * vec_len * sizeof(DoublePrecision);
181
182 if ( rsize != 0 )
183 bbcast00( (char *) in_buffer, rsize, 999, proclist[1],
184 nproc-1, &proclist[1] );
185 }
186
187 dptr = in_buffer;
188 for ( iii = k; iii < k + nvecs; iii++ ){
189 dptr = &colF[iii][bb];
190 dptr1 = in_buffer;
191 for ( jndx = 0; jndx < nvecs_in; jndx++ ){
192 t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE);
193 t *= MONE;
194 daxpy_( &vec_len, &t, dptr1, &IONE, dptr, &IONE );
195 dptr1 += vec_len;
196 }
197 }
198
199 /*
200 * MGS the rest of the cluster.
201 */
202
203 me_indx = indxL( me, nproc, proclist);
204
205 /*
206 * Determine maximum number of panels owned by
207 * any processor in this cluster
208 */
209
210 max_vecs = 0;
211 for ( i = 1; i < nproc; i++ ) {
212 kk = count_list( proclist[i], mapF, n );
213 nleft[i] = kk;
214 max_vecs = max( max_vecs, kk );
215 }
216
217 max_panel = max_vecs / PANELSIZE;
218 if( max_panel * PANELSIZE != max_vecs )
219 max_panel++;
220
221 kk = k;
222 for ( iter = 0; iter < max_panel; iter++ ){
223 for( i = 1; i < nproc; i++ ) {
224 mvecs = min( nleft[i], PANELSIZE );
225 if ( mvecs == 0 )
226 continue;
227 nleft[i] -= mvecs;
228 if ( me_indx == i ) {
229 for ( jndx = kk; jndx < kk + mvecs; jndx++ ){
230 dptr = &colF[jndx][bb];
231 t = dnrm2_( &vec_len, dptr, &IONE );
232 if ( t != 1.0e0 ) {
233 t = (DoublePrecision) 1.0e0/t;
234 dscal_( &vec_len, &t, dptr, &IONE);
235 }
236 for ( indx = jndx + 1; indx < kk + mvecs; indx++ ){
237 dptr1 = &colF[indx][bb];
238 t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE );
239 if ( fabs(t) > DLAMCHE ) {
240 t *= MONE;
241 daxpy_( &vec_len, &t, dptr, &IONE, dptr1, &IONE );
242 }
243 }
244 }
245
246 /* load up buffer */
247
248 dptr = in_buffer;
249 for ( indx = kk; indx < kk + mvecs; indx++ ){
250 dcopy_( &vec_len, &colF[indx][bb], &IONE, dptr, &IONE);
251 dptr += vec_len;
252 }
253
254 kk += mvecs;
255 }
256
257 rsize = mvecs * vec_len * sizeof(DoublePrecision);
258
259 bbcast00( (char *) in_buffer, rsize, 888, proclist[i],
260 nproc-1, &proclist[1] );
261
262 /*
263 * the buffer contains incoming orthonormal vectors
264 */
265
266 dptr = in_buffer;
267 for ( iii = kk; iii < k + nvecs; iii++ ){
268 dptr = &colF[iii][bb];
269 dptr1 = in_buffer;
270 for ( jndx = 0; jndx < mvecs; jndx++ ){
271 t = ddot_( &vec_len, dptr, &IONE, dptr1, &IONE);
272 if ( fabs(t) > DLAMCHE ) {
273 t *= MONE;
274 daxpy_( &vec_len, &t, dptr1, &IONE, dptr, &IONE );
275 }
276 dptr1 += vec_len;
277 }
278 }
279 }
280 }
281 return;
282 }
283
284
285