1 /*
2  $Id$
3  *======================================================================
4  *
5  * DISCLAIMER
6  *
7  * This material was prepared as an account of work sponsored by an
8  * agency of the United States Government.  Neither the United States
9  * Government nor the United States Department of Energy, nor Battelle,
10  * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11  * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
12  * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
13  * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
14  * INFRINGE PRIVATELY OWNED RIGHTS.
15  *
16  * ACKNOWLEDGMENT
17  *
18  * This software and its documentation were produced with Government
19  * support under Contract Number DE-AC06-76RLO-1830 awarded by the United
20  * States Department of Energy.  The Government retains a paid-up
21  * non-exclusive, irrevocable worldwide license to reproduce, prepare
22  * derivative works, perform publicly and display publicly by or for the
23  * Government, including the right to distribute to other Government
24  * contractors.
25  *
26  *======================================================================
27  *
28  *  -- PEIGS  routine (version 2.1) --
29  *     Pacific Northwest Laboratory
30  *     July 28, 1995
31  *
32  *======================================================================
33  */
34 #include <stdio.h>
35 
36 #include "globalp.c.h"
37 
38 #define max(a,b) ((a) > (b) ? (a) : (b))
39 
mxm25(n1,n2,rowQ,mapQ,m,colW,mapW,colZ,iwork,work)40 void mxm25 ( n1, n2, rowQ, mapQ, m, colW, mapW, colZ, iwork, work)
41      Integer *n1, *n2, mapQ[], *m, mapW[], iwork[];
42      DoublePrecision **rowQ, **colW, **colZ, work[];
43 
44 /**************************************************************
45  *
46  * Subroutine mxm25
47  *
48    This subroutine does the matrix multiplication Z <- Q*W
49 
50    where matrix Q is a n1 x n2 general matrix in packed storage format,
51          distributed by rows,
52 
53          matrix W is a general n2 x m matrix in packed storage format,
54          distributed by columns.
55 
56          and matrix Z is the n1 x m product Q*W, distributed the same as W.
57 
58    In this version we assume that Q and W share the same processors
59    ( perhaps different data distributions )
60 
61    It is ok to have colZ = colW.  However, in this
62    case each colZ[i]=colW[i] must point to a vector of
63    length = MAX{ n1, n2 }.
64 
65    ARGUMENTS
66    ---------
67    In the following:
68 
69      INTEGER          = "pointer to Integer"
70      DOUBLE PRECISION = "pointer to DoublePrecision"
71 
72      me     = this processor's id (= mxmynd_())
73      nprocs = number of allocated processors ( = mxnprc_())
74      nvecsW = number of entries in mapW equal to me
75                   (= count_list( me, mapW, m ))
76      nvecsQ = number of entries in mapQ equal to me
77                   (= count_list( me, mapQ, n1 ))
78      nvecsQ_max = maximum number of entries in mapQ equal to i,
79                   i = 0 to nprocs-1
80                   (= max over i of count_list( i, mapQ, n1 ))
81      sDP    = sizeof( DoublePrecision )
82 
83    n1 ..... (input) INTEGER
84             The number of rows in matrix Q
85 
86    n2 ..... (input) INTEGER
87             The number of columns in matrix Q
88             and the number of rows in matrix W
89 
90    rowQ ... (input) array of pointers to DoublePrecision,
91                     length (nvecsQ)
92             The part of matrix Q owned by this processer stored
93             in packed format, i.e., rowQ[i] points to the start
94             of the i-th row of Q
95             owned by this processor, i = 0 to nvecsQ-1.
96 
97    mapQ ... (input) INTEGER array, length (n1)
98             The i-th row of Q is owned by processor
99             mapQ[i], i = 0 to n1-1.
100 
101    m ...... (input) INTEGER
102             The number of columns in W.
103 
104    colW ... (input) array of pointers to DoublePrecision,
105                            length (nvecsW)
106 
107             The part of matrix W owned by this processer stored
108             in packed format by columns, i.e., colW[i] points
109             to the first element of the i-th column of W
110             owned by this processor, i = 0 to nvecsW-1.
111 
112    mapW ... (input) INTEGER array, length (m)
113             The i-th column of W is
114             owned by processor mapW[i], i = 0 to m-1.
115 
116    colZ ... (output) array of pointers to DoublePrecision,
117                      length (nvecsW)
118 
119              The result matrix Z = Q * W stored identical to W.
120 
121              It is ok to have colZ = colW.  However, in this
122              case each colZ[i]=colW[i] must point to a vector of
123              length = MAX{ n1, n2 }.
124 
125    iwork .. (workspace) INTEGER array, length ( 3 * n1 + 2 * m )
126 
127    work ... (workspace) DOUBLE PRECISION array,
128                         length ( maximum ( n1 + m, (nvecsW + 2 * nvecsQ_max) * n2 )
129 */
130 {
131   static Integer ONE = 1;
132   Integer jj, i, nvecsQ, nvecsW, l1, l2;
133   Integer *mapvecQ, *mapvecW;
134   Integer isize, indx, jndx, me;
135   Integer *proclist, nprocs, *mapvec_in, last_proc, next_proc;
136   Integer *iscrat, me_indx;
137   Integer maxsz, rsize, osize;
138   Integer *ijunk, idummy;
139 
140   DoublePrecision *ptr;
141   DoublePrecision *buffer, *in_buffer, *out_buffer;
142   DoublePrecision *dd_ptr, *dd_ptr2;
143 
144   extern Integer count_list();
145   extern Integer fil_mapvec_();
146 
147   extern Integer mxwrit_(), mxread_();
148   extern Integer menode_(), mxmynd_();
149 
150   extern Integer fil_mapvec_();
151   extern Integer count_list();
152   extern Integer indxL ();
153   extern Integer mxwrit_(), mxread_();
154   extern Integer mxmynd_();
155 
156   /*
157     blas call
158     */
159 
160   extern DoublePrecision ddot_();
161   extern Integer reduce_list2();
162   extern Integer peigs_cmod_();
163 
164   extern void mxsync_();
165   extern void gshellsort_();
166   extern void fil_dbl_lst();
167   extern void daxpy_();
168   extern void dcopy_();
169 
170   l1 = *n1;
171   l2 = *n2;
172 
173   me = mxmynd_();
174   iscrat = iwork;
175   mapvecQ = iscrat;
176   nvecsQ = fil_mapvec_( &me, &l1, mapQ, mapvecQ );
177 
178   iscrat += nvecsQ;
179   mapvecW = iscrat;
180   nvecsW = fil_mapvec_( &me, m, mapW, mapvecW );
181 
182   if ( nvecsW + nvecsQ == 0 )
183     return;
184 
185   iscrat += nvecsW;
186 
187   proclist = iscrat;
188 
189 
190   ijunk = (Integer *) work;
191 
192   for ( indx = 0; indx < l1; indx++ )
193     ijunk[indx] = mapQ[indx];
194 
195   jndx = l1;
196   for ( indx = 0; indx <  *m; indx++ ) {
197     ijunk[jndx] = mapW[indx];
198     jndx++;
199   }
200 
201   /*
202     the list of processors for this computation
203     */
204 
205   indx = l1 + *m ;
206 
207   nprocs = reduce_list2( indx, &ijunk[0], proclist);
208   gshellsort_( &nprocs, proclist);
209 
210   /*
211     the number of distinct processors
212     */
213 
214   iscrat += nprocs;
215   mapvec_in = iscrat;
216   indx = indxL ( me, nprocs, proclist);
217   me_indx = indx;
218 
219   /*
220     last_proc = ((indx - 1) % nprocs);
221     */
222 
223   idummy = indx - 1 ;
224   idummy += nprocs;
225   last_proc = peigs_cmod_( &idummy, &nprocs);
226   last_proc = proclist[last_proc];
227 
228   /*
229     next_proc = (indx + 1) % nprocs;
230     */
231 
232   idummy = indx + 1;
233   idummy += nprocs;
234   next_proc = peigs_cmod_( &idummy, &nprocs);
235   next_proc = proclist[next_proc];
236 
237   maxsz = 0;
238   for ( indx = 0; indx < nprocs; indx++ ) {
239     isize = count_list( proclist[indx], mapQ,& l1);   /* the number Q row vectors that I own */
240     maxsz = max( isize, maxsz );
241   }
242 
243 
244   /*
245 
246     load what I own of W (columns) into a buffer;
247     for multiplication purpose
248     just in case the W pointers and the Z pointers are the same
249 
250     */
251 
252   buffer = (DoublePrecision *) work;
253 
254   indx = 0;
255   for ( jndx = 0; jndx < nvecsW; jndx++ ){
256     ptr = colW[jndx];
257     dcopy_( &l2, ptr, &ONE, buffer + indx, &ONE);
258     indx += l2;
259   }
260   buffer += indx;
261 
262   /*
263     zero out the matrix Z
264     */
265 
266   for ( jndx = 0; jndx < nvecsW; jndx++ ){
267     ptr = colZ[jndx];
268     fil_dbl_lst ( l1, ptr, 0.0e0);
269   }
270 
271   /*
272     copy rows of Q into output buffer for shifting
273     shift matrix Q around and multiply
274     */
275 
276   indx = 0;
277   for ( jndx = 0; jndx < nvecsQ; jndx++ ){
278     dcopy_( &l2, rowQ[jndx], &ONE, buffer + indx, &ONE);
279     indx+= l2;
280   }
281 
282   for ( indx = 0; indx < nvecsQ; indx++ )
283     mapvec_in[indx] = mapvecQ[indx];
284 
285   /*
286     just computing matrix multiplication within our
287     own vectors
288     */
289 
290   for ( indx = 0; indx < nvecsW; indx++ ){
291     ptr = colZ[indx];
292     for (jndx = 0; jndx < nvecsQ; jndx++ ){
293       jj = mapvec_in[jndx];
294       ptr[jj] = ddot_( &l2, &buffer[jndx*l2], &ONE, &work[indx*l2], &ONE);
295     }
296   }
297 
298   /*
299     this is the block that gets send around; first multiply what I have
300     */
301 
302   in_buffer = buffer;
303   out_buffer = in_buffer + maxsz*l2;
304 
305   isize = nvecsQ*l2;
306   osize = isize;
307 
308   for ( i = 0; i < nprocs-1 ; i++ ) {
309     if( osize > 0 )
310        dcopy_(&osize, in_buffer, &ONE, out_buffer, &ONE);
311 
312     idummy = 2;
313     if ( peigs_cmod_( &me_indx, &idummy) ==  0 ) {
314 
315       rsize = osize*sizeof(DoublePrecision);
316       if ( rsize != 0 )
317 	rsize = mxwrit_( out_buffer, &rsize, &next_proc, &i );
318 
319       /*
320         indx = (( me_indx - i -1) % nprocs);
321         */
322 
323       idummy = me_indx - i -1;
324       idummy += nprocs;
325 #ifdef DEBUG1
326       fprintf(stderr, " me = %d i = %d idummy %d \n", me, i, idummy);
327 #endif
328       indx = peigs_cmod_( &idummy, &nprocs);
329       indx = proclist[indx];
330 
331       /*
332 	the current processor id number
333 	*/
334 
335 
336       isize = fil_mapvec_( &indx, &l1, mapQ, mapvec_in);
337       nvecsQ = isize;
338       isize *= l2;
339 
340       /*
341 	mapvec_in contains the mapvecQ for processor proclist[i])
342 	*/
343 
344       rsize = isize * sizeof(DoublePrecision);
345       if ( rsize != 0 )
346 	rsize = mxread_( in_buffer, &rsize, &last_proc, &i );
347 
348     }
349     else {
350       /*
351         indx = (( me_indx - i -1 ) % nprocs);
352         */
353 
354       idummy = me_indx - i - 1;
355       idummy += nprocs;
356 
357 #ifdef DEBUG1
358       fprintf(stderr, "222 me = %d i = %d idummy %d \n", me, i, idummy);
359 #endif
360 
361       indx = peigs_cmod_(&idummy, &nprocs);
362 #ifdef DEBUG1
363       fprintf(stderr, " me = %d indx = %d idummy %d \n", me, indx, idummy);
364 #endif
365 
366       indx = proclist[indx]; /* the current processor id number */
367       isize = fil_mapvec_( &indx, &l1, mapQ, mapvec_in);
368       nvecsQ = isize;
369       isize *= l2;
370 
371       /*
372 	mapvec_in contains the mapvecQ for processor proclist[i])
373 	*/
374 
375       rsize = isize * sizeof(DoublePrecision);
376       if ( rsize != 0 )
377 	rsize = mxread_( in_buffer, &rsize, &last_proc, &i );
378 
379       rsize = osize * sizeof(DoublePrecision);
380       if ( rsize != 0 )
381 	rsize = mxwrit_( out_buffer, &rsize, &next_proc, &i );
382     }
383 
384     osize = isize;
385     for (jndx = 0; jndx < nvecsQ; jndx++ ){
386       jj = mapvec_in[jndx];
387       dd_ptr = &buffer[jndx*l2];
388       for ( indx = 0; indx < nvecsW; indx++ ){
389 	dd_ptr2 = &work[indx*l2];
390 	ptr = colZ[indx];
391 	ptr[jj] = ddot_( &l2, dd_ptr, &ONE, dd_ptr2, &ONE);
392       }
393     }
394   }
395 
396   /*
397     c    free(proclist);
398   */
399   return;
400 }
401