1 /*
2 $Id$
3 *======================================================================
4 *
5 * DISCLAIMER
6 *
7 * This material was prepared as an account of work sponsored by an
8 * agency of the United States Government. Neither the United States
9 * Government nor the United States Department of Energy, nor Battelle,
10 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
12 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
13 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
14 * INFRINGE PRIVATELY OWNED RIGHTS.
15 *
16 * ACKNOWLEDGMENT
17 *
18 * This software and its documentation were produced with Government
19 * support under Contract Number DE-AC06-76RLO-1830 awarded by the United
20 * States Department of Energy. The Government retains a paid-up
21 * non-exclusive, irrevocable worldwide license to reproduce, prepare
22 * derivative works, perform publicly and display publicly by or for the
23 * Government, including the right to distribute to other Government
24 * contractors.
25 *
26 *======================================================================
27 *
28 * -- PEIGS routine (version 2.1) --
29 * Pacific Northwest Laboratory
30 * July 28, 1995
31 *
32 *======================================================================
33 */
34 #include <stdio.h>
35
36 #include "globalp.c.h"
37
38 #define max(a,b) ((a) > (b) ? (a) : (b))
39
mxm25(n1,n2,rowQ,mapQ,m,colW,mapW,colZ,iwork,work)40 void mxm25 ( n1, n2, rowQ, mapQ, m, colW, mapW, colZ, iwork, work)
41 Integer *n1, *n2, mapQ[], *m, mapW[], iwork[];
42 DoublePrecision **rowQ, **colW, **colZ, work[];
43
44 /**************************************************************
45 *
46 * Subroutine mxm25
47 *
48 This subroutine does the matrix multiplication Z <- Q*W
49
50 where matrix Q is a n1 x n2 general matrix in packed storage format,
51 distributed by rows,
52
53 matrix W is a general n2 x m matrix in packed storage format,
54 distributed by columns.
55
56 and matrix Z is the n1 x m product Q*W, distributed the same as W.
57
58 In this version we assume that Q and W share the same processors
59 ( perhaps different data distributions )
60
61 It is ok to have colZ = colW. However, in this
62 case each colZ[i]=colW[i] must point to a vector of
63 length = MAX{ n1, n2 }.
64
65 ARGUMENTS
66 ---------
67 In the following:
68
69 INTEGER = "pointer to Integer"
70 DOUBLE PRECISION = "pointer to DoublePrecision"
71
72 me = this processor's id (= mxmynd_())
73 nprocs = number of allocated processors ( = mxnprc_())
74 nvecsW = number of entries in mapW equal to me
75 (= count_list( me, mapW, m ))
76 nvecsQ = number of entries in mapQ equal to me
77 (= count_list( me, mapQ, n1 ))
78 nvecsQ_max = maximum number of entries in mapQ equal to i,
79 i = 0 to nprocs-1
80 (= max over i of count_list( i, mapQ, n1 ))
81 sDP = sizeof( DoublePrecision )
82
83 n1 ..... (input) INTEGER
84 The number of rows in matrix Q
85
86 n2 ..... (input) INTEGER
87 The number of columns in matrix Q
88 and the number of rows in matrix W
89
90 rowQ ... (input) array of pointers to DoublePrecision,
91 length (nvecsQ)
92 The part of matrix Q owned by this processer stored
93 in packed format, i.e., rowQ[i] points to the start
94 of the i-th row of Q
95 owned by this processor, i = 0 to nvecsQ-1.
96
97 mapQ ... (input) INTEGER array, length (n1)
98 The i-th row of Q is owned by processor
99 mapQ[i], i = 0 to n1-1.
100
101 m ...... (input) INTEGER
102 The number of columns in W.
103
104 colW ... (input) array of pointers to DoublePrecision,
105 length (nvecsW)
106
107 The part of matrix W owned by this processer stored
108 in packed format by columns, i.e., colW[i] points
109 to the first element of the i-th column of W
110 owned by this processor, i = 0 to nvecsW-1.
111
112 mapW ... (input) INTEGER array, length (m)
113 The i-th column of W is
114 owned by processor mapW[i], i = 0 to m-1.
115
116 colZ ... (output) array of pointers to DoublePrecision,
117 length (nvecsW)
118
119 The result matrix Z = Q * W stored identical to W.
120
121 It is ok to have colZ = colW. However, in this
122 case each colZ[i]=colW[i] must point to a vector of
123 length = MAX{ n1, n2 }.
124
125 iwork .. (workspace) INTEGER array, length ( 3 * n1 + 2 * m )
126
127 work ... (workspace) DOUBLE PRECISION array,
128 length ( maximum ( n1 + m, (nvecsW + 2 * nvecsQ_max) * n2 )
129 */
130 {
131 static Integer ONE = 1;
132 Integer jj, i, nvecsQ, nvecsW, l1, l2;
133 Integer *mapvecQ, *mapvecW;
134 Integer isize, indx, jndx, me;
135 Integer *proclist, nprocs, *mapvec_in, last_proc, next_proc;
136 Integer *iscrat, me_indx;
137 Integer maxsz, rsize, osize;
138 Integer *ijunk, idummy;
139
140 DoublePrecision *ptr;
141 DoublePrecision *buffer, *in_buffer, *out_buffer;
142 DoublePrecision *dd_ptr, *dd_ptr2;
143
144 extern Integer count_list();
145 extern Integer fil_mapvec_();
146
147 extern Integer mxwrit_(), mxread_();
148 extern Integer menode_(), mxmynd_();
149
150 extern Integer fil_mapvec_();
151 extern Integer count_list();
152 extern Integer indxL ();
153 extern Integer mxwrit_(), mxread_();
154 extern Integer mxmynd_();
155
156 /*
157 blas call
158 */
159
160 extern DoublePrecision ddot_();
161 extern Integer reduce_list2();
162 extern Integer peigs_cmod_();
163
164 extern void mxsync_();
165 extern void gshellsort_();
166 extern void fil_dbl_lst();
167 extern void daxpy_();
168 extern void dcopy_();
169
170 l1 = *n1;
171 l2 = *n2;
172
173 me = mxmynd_();
174 iscrat = iwork;
175 mapvecQ = iscrat;
176 nvecsQ = fil_mapvec_( &me, &l1, mapQ, mapvecQ );
177
178 iscrat += nvecsQ;
179 mapvecW = iscrat;
180 nvecsW = fil_mapvec_( &me, m, mapW, mapvecW );
181
182 if ( nvecsW + nvecsQ == 0 )
183 return;
184
185 iscrat += nvecsW;
186
187 proclist = iscrat;
188
189
190 ijunk = (Integer *) work;
191
192 for ( indx = 0; indx < l1; indx++ )
193 ijunk[indx] = mapQ[indx];
194
195 jndx = l1;
196 for ( indx = 0; indx < *m; indx++ ) {
197 ijunk[jndx] = mapW[indx];
198 jndx++;
199 }
200
201 /*
202 the list of processors for this computation
203 */
204
205 indx = l1 + *m ;
206
207 nprocs = reduce_list2( indx, &ijunk[0], proclist);
208 gshellsort_( &nprocs, proclist);
209
210 /*
211 the number of distinct processors
212 */
213
214 iscrat += nprocs;
215 mapvec_in = iscrat;
216 indx = indxL ( me, nprocs, proclist);
217 me_indx = indx;
218
219 /*
220 last_proc = ((indx - 1) % nprocs);
221 */
222
223 idummy = indx - 1 ;
224 idummy += nprocs;
225 last_proc = peigs_cmod_( &idummy, &nprocs);
226 last_proc = proclist[last_proc];
227
228 /*
229 next_proc = (indx + 1) % nprocs;
230 */
231
232 idummy = indx + 1;
233 idummy += nprocs;
234 next_proc = peigs_cmod_( &idummy, &nprocs);
235 next_proc = proclist[next_proc];
236
237 maxsz = 0;
238 for ( indx = 0; indx < nprocs; indx++ ) {
239 isize = count_list( proclist[indx], mapQ,& l1); /* the number Q row vectors that I own */
240 maxsz = max( isize, maxsz );
241 }
242
243
244 /*
245
246 load what I own of W (columns) into a buffer;
247 for multiplication purpose
248 just in case the W pointers and the Z pointers are the same
249
250 */
251
252 buffer = (DoublePrecision *) work;
253
254 indx = 0;
255 for ( jndx = 0; jndx < nvecsW; jndx++ ){
256 ptr = colW[jndx];
257 dcopy_( &l2, ptr, &ONE, buffer + indx, &ONE);
258 indx += l2;
259 }
260 buffer += indx;
261
262 /*
263 zero out the matrix Z
264 */
265
266 for ( jndx = 0; jndx < nvecsW; jndx++ ){
267 ptr = colZ[jndx];
268 fil_dbl_lst ( l1, ptr, 0.0e0);
269 }
270
271 /*
272 copy rows of Q into output buffer for shifting
273 shift matrix Q around and multiply
274 */
275
276 indx = 0;
277 for ( jndx = 0; jndx < nvecsQ; jndx++ ){
278 dcopy_( &l2, rowQ[jndx], &ONE, buffer + indx, &ONE);
279 indx+= l2;
280 }
281
282 for ( indx = 0; indx < nvecsQ; indx++ )
283 mapvec_in[indx] = mapvecQ[indx];
284
285 /*
286 just computing matrix multiplication within our
287 own vectors
288 */
289
290 for ( indx = 0; indx < nvecsW; indx++ ){
291 ptr = colZ[indx];
292 for (jndx = 0; jndx < nvecsQ; jndx++ ){
293 jj = mapvec_in[jndx];
294 ptr[jj] = ddot_( &l2, &buffer[jndx*l2], &ONE, &work[indx*l2], &ONE);
295 }
296 }
297
298 /*
299 this is the block that gets send around; first multiply what I have
300 */
301
302 in_buffer = buffer;
303 out_buffer = in_buffer + maxsz*l2;
304
305 isize = nvecsQ*l2;
306 osize = isize;
307
308 for ( i = 0; i < nprocs-1 ; i++ ) {
309 if( osize > 0 )
310 dcopy_(&osize, in_buffer, &ONE, out_buffer, &ONE);
311
312 idummy = 2;
313 if ( peigs_cmod_( &me_indx, &idummy) == 0 ) {
314
315 rsize = osize*sizeof(DoublePrecision);
316 if ( rsize != 0 )
317 rsize = mxwrit_( out_buffer, &rsize, &next_proc, &i );
318
319 /*
320 indx = (( me_indx - i -1) % nprocs);
321 */
322
323 idummy = me_indx - i -1;
324 idummy += nprocs;
325 #ifdef DEBUG1
326 fprintf(stderr, " me = %d i = %d idummy %d \n", me, i, idummy);
327 #endif
328 indx = peigs_cmod_( &idummy, &nprocs);
329 indx = proclist[indx];
330
331 /*
332 the current processor id number
333 */
334
335
336 isize = fil_mapvec_( &indx, &l1, mapQ, mapvec_in);
337 nvecsQ = isize;
338 isize *= l2;
339
340 /*
341 mapvec_in contains the mapvecQ for processor proclist[i])
342 */
343
344 rsize = isize * sizeof(DoublePrecision);
345 if ( rsize != 0 )
346 rsize = mxread_( in_buffer, &rsize, &last_proc, &i );
347
348 }
349 else {
350 /*
351 indx = (( me_indx - i -1 ) % nprocs);
352 */
353
354 idummy = me_indx - i - 1;
355 idummy += nprocs;
356
357 #ifdef DEBUG1
358 fprintf(stderr, "222 me = %d i = %d idummy %d \n", me, i, idummy);
359 #endif
360
361 indx = peigs_cmod_(&idummy, &nprocs);
362 #ifdef DEBUG1
363 fprintf(stderr, " me = %d indx = %d idummy %d \n", me, indx, idummy);
364 #endif
365
366 indx = proclist[indx]; /* the current processor id number */
367 isize = fil_mapvec_( &indx, &l1, mapQ, mapvec_in);
368 nvecsQ = isize;
369 isize *= l2;
370
371 /*
372 mapvec_in contains the mapvecQ for processor proclist[i])
373 */
374
375 rsize = isize * sizeof(DoublePrecision);
376 if ( rsize != 0 )
377 rsize = mxread_( in_buffer, &rsize, &last_proc, &i );
378
379 rsize = osize * sizeof(DoublePrecision);
380 if ( rsize != 0 )
381 rsize = mxwrit_( out_buffer, &rsize, &next_proc, &i );
382 }
383
384 osize = isize;
385 for (jndx = 0; jndx < nvecsQ; jndx++ ){
386 jj = mapvec_in[jndx];
387 dd_ptr = &buffer[jndx*l2];
388 for ( indx = 0; indx < nvecsW; indx++ ){
389 dd_ptr2 = &work[indx*l2];
390 ptr = colZ[indx];
391 ptr[jj] = ddot_( &l2, dd_ptr, &ONE, dd_ptr2, &ONE);
392 }
393 }
394 }
395
396 /*
397 c free(proclist);
398 */
399 return;
400 }
401