1 /****************************************************************/
2 /* Parallel Combinatorial BLAS Library (for Graph Computations) */
3 /* version 1.6 -------------------------------------------------*/
4 /* date: 6/15/2017 ---------------------------------------------*/
5 /* authors: Ariful Azad, Aydin Buluc --------------------------*/
6 /****************************************************************/
7 /*
8 Copyright (c) 2010-2017, The Regents of the University of California
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 THE SOFTWARE.
27 */
28
29
30 #ifndef _PAR_FRIENDS_EXT_H_
31 #define _PAR_FRIENDS_EXT_H_
32
33 #include "mpi.h"
34 #include <iostream>
35 #include "SpParMat.h"
36 #include "SpParHelper.h"
37 #include "MPIType.h"
38 #include "Friends.h"
39
40 namespace combblas {
41
// Forward declaration: the friend functions below only need SpParMat's name here;
// the full definition comes from SpParMat.h (included above).
template <class IT, class NT, class DER>
class SpParMat;
44
45 /*************************************************************************************************/
46 /**************************** FRIEND FUNCTIONS FOR PARALLEL CLASSES ******************************/
47 /************************** EXTENDED SET (NOT COMMONLY USED FUNCTIONS) ***************************/
48 /*************************************************************************************************/
49
50
51 /**
52 * Parallel A = B*C routine that uses one-sided MPI-2 features
53 * General active target syncronization via MPI_Win_Post, MPI_Win_Start, MPI_Win_Complete, MPI_Win_Wait
54 * Tested on my dual core Macbook with 1,4,9,16,25 MPI processes
55 * No memory hog: splits the matrix into two along the column, prefetches the next half matrix while computing on the current one
56 **/
57 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_ActiveTarget(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)58 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_ActiveTarget
59 (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
60
61 {
62 typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
63 typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
64
65 if(A.getncol() != B.getnrow())
66 {
67 std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
68 MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
69 return SpParMat< IU,N_promote,DER_promote >();
70 }
71 int stages, Aoffset, Boffset; // stages = inner dimension of matrix blocks
72 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
73
74 IU C_m = A.spSeq->getnrow();
75 IU C_n = B.spSeq->getncol();
76
77 UDERA A1seq, A2seq;
78 (A.spSeq)->Split( A1seq, A2seq);
79
80 // ABAB: It should be able to perform split/merge with the transpose option [single function call]
81 const_cast< UDERB* >(B.spSeq)->Transpose();
82
83 UDERB B1seq, B2seq;
84 (B.spSeq)->Split( B1seq, B2seq);
85
86 // Create row and column windows (collective operation, i.e. everybody exposes its window to others)
87 std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
88 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
89 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
90 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
91 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);
92
93 // ABAB: We can optimize the call to create windows in the absence of passive synchronization
94 // MPI_Info info;
95 // MPI_Info_create ( &info );
96 // MPI_Info_set( info, "no_locks", "true" );
97 // MPI_Win_create( . . ., info, . . . );
98 // MPI_Info_free( &info );
99
100 IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
101 IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
102 IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
103 IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
104
105 SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
106 SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
107 SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
108 SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());
109
110 // Remotely fetched matrices are stored as pointers
111 UDERA * ARecv1, * ARecv2;
112 UDERB * BRecv1, * BRecv2;
113 std::vector< SpTuples<IU,N_promote> *> tomerge;
114
115 MPI_Group row_group, col_group;
116 MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
117 MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);
118
119 int Aself = (A.commGrid)->GetRankInProcRow();
120 int Bself = (B.commGrid)->GetRankInProcCol();
121
122 #ifdef SPGEMMDEBUG
123 MPI_Barrier(GridC->GetWorld());
124 SpParHelper::Print("Writing to file\n");
125 std::ofstream oput;
126 GridC->OpenDebugFile("deb", oput);
127 oput << "A1seq: " << A1seq.getnrow() << " " << A1seq.getncol() << " " << A1seq.getnnz() << std::endl;
128 oput << "A2seq: " << A2seq.getnrow() << " " << A2seq.getncol() << " " << A2seq.getnnz() << std::endl;
129 oput << "B1seq: " << B1seq.getnrow() << " " << B1seq.getncol() << " " << B1seq.getnnz() << std::endl;
130 oput << "B2seq: " << B2seq.getnrow() << " " << B2seq.getncol() << " " << B2seq.getnnz() << std::endl;
131 SpParHelper::Print("Wrote to file\n");
132 MPI_Barrier(GridC->GetWorld());
133 #endif
134
135 SpParHelper::PostExposureEpoch(Aself, rowwins1, row_group);
136 SpParHelper::PostExposureEpoch(Aself, rowwins2, row_group);
137 SpParHelper::PostExposureEpoch(Bself, colwins1, col_group);
138 SpParHelper::PostExposureEpoch(Bself, colwins2, col_group);
139
140 MPI_Barrier(GridC->GetWorld());
141 SpParHelper::Print("Exposure epochs posted\n");
142 MPI_Barrier(GridC->GetWorld());
143
144 int Aowner = (0+Aoffset) % stages;
145 int Bowner = (0+Boffset) % stages;
146 SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
147 SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2); // Start prefetching next half
148
149 for(int j=0; j< rowwins1.size(); ++j) // wait for the first half to complete
150 rowwins1[j].Complete();
151
152 SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
153 SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2); // Start prefetching next half
154
155 for(int j=0; j< colwins1.size(); ++j)
156 colwins1[j].Complete();
157
158 for(int i = 1; i < stages; ++i)
159 {
160
161 #ifdef SPGEMMDEBUG
162 SpParHelper::Print("Stage starting\n");
163 #endif
164
165 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
166
167 #ifdef SPGEMMDEBUG
168 SpParHelper::Print("Multiplied\n");
169 #endif
170
171 if(!C_cont->isZero())
172 tomerge.push_back(C_cont);
173
174 #ifdef SPGEMMDEBUG
175 SpParHelper::Print("Pushed back\n");
176 MPI_Barrier(GridC->GetWorld());
177 #endif
178
179 bool remoteA = false;
180 bool remoteB = false;
181
182 delete ARecv1; // free the memory of the previous first half
183 for(int j=0; j< rowwins2.size(); ++j) // wait for the previous second half to complete
184 rowwins2[j].Complete();
185 SpParHelper::Print("Completed A\n");
186
187 delete BRecv1;
188 for(int j=0; j< colwins2.size(); ++j) // wait for the previous second half to complete
189 colwins2[j].Complete();
190
191 #ifdef SPGEMMDEBUG
192 SpParHelper::Print("Completed B\n");
193 MPI_Barrier(GridC->GetWorld());
194 #endif
195
196 Aowner = (i+Aoffset) % stages;
197 Bowner = (i+Boffset) % stages;
198
199 // start fetching the current first half
200 SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
201 SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
202
203 #ifdef SPGEMMDEBUG
204 SpParHelper::Print("Fetched next\n");
205 MPI_Barrier(GridC->GetWorld());
206 #endif
207
208 // while multiplying the already completed previous second halfs
209 C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
210 if(!C_cont->isZero())
211 tomerge.push_back(C_cont);
212
213 #ifdef SPGEMMDEBUG
214 SpParHelper::Print("Multiplied and pushed\n");
215 MPI_Barrier(GridC->GetWorld());
216 #endif
217
218 delete ARecv2; // free memory of the previous second half
219 delete BRecv2;
220
221 for(int j=0; j< rowwins1.size(); ++j) // wait for the current first half to complte
222 rowwins1[j].Complete();
223 for(int j=0; j< colwins1.size(); ++j)
224 colwins1[j].Complete();
225
226 #ifdef SPGEMMDEBUG
227 SpParHelper::Print("Completed next\n");
228 MPI_Barrier(GridC->GetWorld());
229 #endif
230
231 // start prefetching the current second half
232 SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
233 SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
234 }
235 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
236 if(!C_cont->isZero())
237 tomerge.push_back(C_cont);
238
239 delete ARecv1; // free the memory of the previous first half
240 for(int j=0; j< rowwins2.size(); ++j) // wait for the previous second half to complete
241 rowwins2[j].Complete();
242 delete BRecv1;
243 for(int j=0; j< colwins2.size(); ++j) // wait for the previous second half to complete
244 colwins2[j].Complete();
245
246 C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
247 if(!C_cont->isZero())
248 tomerge.push_back(C_cont);
249
250 delete ARecv2;
251 delete BRecv2;
252
253 SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
254 SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
255 SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
256 SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);
257
258 DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL); // First get the result in SpTuples, then convert to UDER
259 for(int i=0; i<tomerge.size(); ++i)
260 {
261 delete tomerge[i];
262 }
263
264 // MPI_Win_Wait() works like a barrier as it waits for all origins to finish their remote memory operation on "this" window
265 SpParHelper::WaitNFree(rowwins1);
266 SpParHelper::WaitNFree(rowwins2);
267 SpParHelper::WaitNFree(colwins1);
268 SpParHelper::WaitNFree(colwins2);
269
270 (A.spSeq)->Merge(A1seq, A2seq);
271 (B.spSeq)->Merge(B1seq, B2seq);
272
273 MPI_Group_free(&row_group);
274 MPI_Group_free(&col_group);
275 const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
276 return SpParMat<IU,N_promote,DER_promote> (C, GridC); // return the result object
277 }
278
279 /**
280 * Parallel A = B*C routine that uses one-sided MPI-2 features
281 * This function implements an asynchronous 2D algorithm, in the sense that there is no notion of stages.
282 * \n The process that completes its submatrix update, requests subsequent matrices from their owners w/out waiting to sychronize with other processors
283 * \n This partially remedies the severe load balancing problem in sparse matrices.
284 * \n The class uses MPI-2 to achieve one-sided asynchronous communication
285 * \n The algorithm treats each submatrix as a single block
286 * \n Local data structure can be any SpMat that has a constructor with array sizes and getarrs() member
287 * Passive target syncronization via MPI_Win_Lock, MPI_Win_Unlock
288 * No memory hog: splits the matrix into two along the column, prefetches the next half matrix while computing on the current one
289 **/
290 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_PassiveTarget(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)291 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_PassiveTarget
292 (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
293
294 {
295 typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
296 typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
297
298 if(A.getncol() != B.getnrow())
299 {
300 std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
301 MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
302 return SpParMat< IU,N_promote,DER_promote >();
303 }
304 int stages, Aoffset, Boffset; // stages = inner dimension of matrix blocks
305 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
306
307 IU C_m = A.spSeq->getnrow();
308 IU C_n = B.spSeq->getncol();
309
310 UDERA A1seq, A2seq;
311 (A.spSeq)->Split( A1seq, A2seq);
312
313 // ABAB: It should be able to perform split/merge with the transpose option [single function call]
314 const_cast< UDERB* >(B.spSeq)->Transpose();
315
316 UDERB B1seq, B2seq;
317 (B.spSeq)->Split( B1seq, B2seq);
318
319 // Create row and column windows (collective operation, i.e. everybody exposes its window to others)
320 std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
321 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
322 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
323 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
324 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);
325
326 IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
327 IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
328 IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
329 IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
330
331 SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
332 SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
333 SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
334 SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());
335
336 // Remotely fetched matrices are stored as pointers
337 UDERA * ARecv1, * ARecv2;
338 UDERB * BRecv1, * BRecv2;
339 std::vector< SpTuples<IU,N_promote> *> tomerge; // sorted triples to be merged
340
341 MPI_Group row_group, col_group;
342 MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
343 MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);
344
345 int Aself = (A.commGrid)->GetRankInProcRow();
346 int Bself = (B.commGrid)->GetRankInProcCol();
347
348 int Aowner = (0+Aoffset) % stages;
349 int Bowner = (0+Boffset) % stages;
350
351 SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
352 SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2); // Start prefetching next half
353 SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
354 SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2); // Start prefetching next half
355
356 // Finish the first halfs
357 SpParHelper::UnlockWindows(Aowner, rowwins1);
358 SpParHelper::UnlockWindows(Bowner, colwins1);
359
360 for(int i = 1; i < stages; ++i)
361 {
362 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
363
364 if(!C_cont->isZero())
365 tomerge.push_back(C_cont);
366
367 bool remoteA = false;
368 bool remoteB = false;
369
370 delete ARecv1; // free the memory of the previous first half
371 delete BRecv1;
372
373 SpParHelper::UnlockWindows(Aowner, rowwins2); // Finish the second half
374 SpParHelper::UnlockWindows(Bowner, colwins2);
375
376 Aowner = (i+Aoffset) % stages;
377 Bowner = (i+Boffset) % stages;
378
379 // start fetching the current first half
380 SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
381 SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
382
383 // while multiplying the already completed previous second halfs
384 C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
385 if(!C_cont->isZero())
386 tomerge.push_back(C_cont);
387
388 delete ARecv2; // free memory of the previous second half
389 delete BRecv2;
390
391 // wait for the current first half to complte
392 SpParHelper::UnlockWindows(Aowner, rowwins1);
393 SpParHelper::UnlockWindows(Bowner, colwins1);
394
395 // start prefetching the current second half
396 SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
397 SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
398 }
399
400 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
401 if(!C_cont->isZero())
402 tomerge.push_back(C_cont);
403
404 delete ARecv1; // free the memory of the previous first half
405 delete BRecv1;
406
407 SpParHelper::UnlockWindows(Aowner, rowwins2);
408 SpParHelper::UnlockWindows(Bowner, colwins2);
409
410 C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
411 if(!C_cont->isZero())
412 tomerge.push_back(C_cont);
413
414 delete ARecv2;
415 delete BRecv2;
416
417 SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
418 SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
419 SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
420 SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);
421
422 DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL); // First get the result in SpTuples, then convert to UDER
423 for(int i=0; i<tomerge.size(); ++i)
424 {
425 delete tomerge[i];
426 }
427
428 SpParHelper::FreeWindows(rowwins1);
429 SpParHelper::FreeWindows(rowwins2);
430 SpParHelper::FreeWindows(colwins1);
431 SpParHelper::FreeWindows(colwins2);
432
433 (A.spSeq)->Merge(A1seq, A2seq);
434 (B.spSeq)->Merge(B1seq, B2seq);
435
436 MPI_Group_free(&row_group);
437 MPI_Group_free(&col_group);
438 const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
439 return SpParMat<IU,N_promote,DER_promote> (C, GridC); // return the result object
440 }
441
442 /**
443 * Parallel A = B*C routine that uses one-sided MPI-2 features
444 * Syncronization is through MPI_Win_Fence
445 * Buggy as of September, 2009
446 **/
447 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_Fence(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)448 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_Fence
449 (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
450 {
451 typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
452 typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
453
454 if(A.getncol() != B.getnrow())
455 {
456 std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
457 MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
458 return SpParMat< IU,N_promote,DER_promote >();
459 }
460
461 int stages, Aoffset, Boffset; // stages = inner dimension of matrix blocks
462 std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
463
464 std::ofstream oput;
465 GridC->OpenDebugFile("deb", oput);
466 const_cast< UDERB* >(B.spSeq)->Transpose();
467
468 // set row & col window handles
469 std::vector<MPI_Win> rowwindows, colwindows;
470 std::vector<MPI_Win> rowwinnext, colwinnext;
471 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwindows);
472 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwindows);
473 SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwinnext);
474 SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwinnext);
475
476 IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
477 IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
478
479 SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
480 SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
481
482 UDERA * ARecv, * ARecvNext;
483 UDERB * BRecv, * BRecvNext;
484 std::vector< SpTuples<IU,N_promote> *> tomerge;
485
486 // Prefetch first
487 for(int j=0; j< rowwindows.size(); ++j)
488 MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwindows[j]);
489 for(int j=0; j< colwindows.size(); ++j)
490 MPI_Win_fence(MPI_MODE_NOPRECEDE, colwindows[j]);
491
492 for(int j=0; j< rowwinnext.size(); ++j)
493 MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwinnext[j]);
494 for(int j=0; j< colwinnext.size(); ++j)
495 MPI_Win_fence(MPI_MODE_NOPRECEDE, colwinnext[j]);
496
497
498 int Aownind = (0+Aoffset) % stages;
499 int Bownind = (0+Boffset) % stages;
500 if(Aownind == (A.commGrid)->GetRankInProcRow())
501 {
502 ARecv = A.spSeq; // shallow-copy
503 }
504 else
505 {
506 std::vector<IU> ess1(UDERA::esscount); // pack essentials to a vector
507 for(int j=0; j< UDERA::esscount; ++j)
508 {
509 ess1[j] = ARecvSizes[j][Aownind];
510 }
511 ARecv = new UDERA(); // create the object first
512
513 oput << "For A (out), Fetching " << (void*)rowwindows[0] << std::endl;
514 SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind); // fetch its elements later
515 }
516 if(Bownind == (B.commGrid)->GetRankInProcCol())
517 {
518 BRecv = B.spSeq; // shallow-copy
519 }
520 else
521 {
522 std::vector<IU> ess2(UDERB::esscount); // pack essentials to a vector
523 for(int j=0; j< UDERB::esscount; ++j)
524 {
525 ess2[j] = BRecvSizes[j][Bownind];
526 }
527 BRecv = new UDERB();
528
529 oput << "For B (out), Fetching " << (void*)colwindows[0] << std::endl;
530 SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind); // No lock version, only get !
531 }
532
533 int Aownprev = Aownind;
534 int Bownprev = Bownind;
535
536 for(int i = 1; i < stages; ++i)
537 {
538 Aownind = (i+Aoffset) % stages;
539 Bownind = (i+Boffset) % stages;
540
541 if(i % 2 == 1) // Fetch RecvNext via winnext, fence on Recv via windows
542 {
543 if(Aownind == (A.commGrid)->GetRankInProcRow())
544 {
545 ARecvNext = A.spSeq; // shallow-copy
546 }
547 else
548 {
549 std::vector<IU> ess1(UDERA::esscount); // pack essentials to a vector
550 for(int j=0; j< UDERA::esscount; ++j)
551 {
552 ess1[j] = ARecvSizes[j][Aownind];
553 }
554 ARecvNext = new UDERA(); // create the object first
555
556 oput << "For A, Fetching " << (void*) rowwinnext[0] << std::endl;
557 SpParHelper::FetchMatrix(*ARecvNext, ess1, rowwinnext, Aownind);
558 }
559
560 if(Bownind == (B.commGrid)->GetRankInProcCol())
561 {
562 BRecvNext = B.spSeq; // shallow-copy
563 }
564 else
565 {
566 std::vector<IU> ess2(UDERB::esscount); // pack essentials to a vector
567 for(int j=0; j< UDERB::esscount; ++j)
568 {
569 ess2[j] = BRecvSizes[j][Bownind];
570 }
571 BRecvNext = new UDERB();
572
573 oput << "For B, Fetching " << (void*)colwinnext[0] << std::endl;
574 SpParHelper::FetchMatrix(*BRecvNext, ess2, colwinnext, Bownind); // No lock version, only get !
575 }
576
577 oput << "Fencing " << (void*) rowwindows[0] << std::endl;
578 oput << "Fencing " << (void*) colwindows[0] << std::endl;
579
580 for(int j=0; j< rowwindows.size(); ++j)
581 MPI_Win_fence(MPI_MODE_NOSTORE, rowwindows[j]); // Synch using "other" windows
582 for(int j=0; j< colwindows.size(); ++j)
583 MPI_Win_fence(MPI_MODE_NOSTORE, colwindows[j]);
584
585 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
586 if(!C_cont->isZero())
587 tomerge.push_back(C_cont);
588
589 if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
590 if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecv;
591
592 Aownprev = Aownind;
593 Bownprev = Bownind;
594 }
595 else // fetch to Recv via windows, fence on RecvNext via winnext
596 {
597
598 if(Aownind == (A.commGrid)->GetRankInProcRow())
599 {
600 ARecv = A.spSeq; // shallow-copy
601 }
602 else
603 {
604 std::vector<IU> ess1(UDERA::esscount); // pack essentials to a vector
605 for(int j=0; j< UDERA::esscount; ++j)
606 {
607 ess1[j] = ARecvSizes[j][Aownind];
608 }
609 ARecv = new UDERA(); // create the object first
610
611 oput << "For A, Fetching " << (void*) rowwindows[0] << std::endl;
612 SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind);
613 }
614
615 if(Bownind == (B.commGrid)->GetRankInProcCol())
616 {
617 BRecv = B.spSeq; // shallow-copy
618 }
619 else
620 {
621 std::vector<IU> ess2(UDERB::esscount); // pack essentials to a vector
622 for(int j=0; j< UDERB::esscount; ++j)
623 {
624 ess2[j] = BRecvSizes[j][Bownind];
625 }
626 BRecv = new UDERB();
627
628 oput << "For B, Fetching " << (void*)colwindows[0] << std::endl;
629 SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind); // No lock version, only get !
630 }
631
632 oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
633 oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
634
635 for(int j=0; j< rowwinnext.size(); ++j)
636 MPI_Win_fence(MPI_MODE_NOSTORE, rowwinnext[j]); // Synch using "other" windows
637 for(int j=0; j< colwinnext.size(); ++j)
638 MPI_Win_fence(MPI_MODE_NOSTORE, colwinnext[j]);
639
640 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
641 if(!C_cont->isZero())
642 tomerge.push_back(C_cont);
643
644
645 if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
646 if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecvNext;
647
648 Aownprev = Aownind;
649 Bownprev = Bownind;
650 }
651
652 }
653
654 if(stages % 2 == 1) // fence on Recv via windows
655 {
656 oput << "Fencing " << (void*) rowwindows[0] << std::endl;
657 oput << "Fencing " << (void*) colwindows[0] << std::endl;
658
659 for(int j=0; j< rowwindows.size(); ++j)
660 MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwindows[j]); // Synch using "prev" windows
661 for(int j=0; j< colwindows.size(); ++j)
662 MPI_Win_fence(MPI_MODE_NOSUCCEED, colwindows[j]);
663
664 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
665 if(!C_cont->isZero())
666 tomerge.push_back(C_cont);
667
668 if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
669 if(Bownprev != (B.commGrid)->GetRankInProcRow()) delete BRecv;
670 }
671 else // fence on RecvNext via winnext
672 {
673 oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
674 oput << "Fencing " << (void*) colwinnext[0] << std::endl;
675
676 for(int j=0; j< rowwinnext.size(); ++j)
677 MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwinnext[j]); // Synch using "prev" windows
678 for(int j=0; j< colwinnext.size(); ++j)
679 MPI_Win_fence(MPI_MODE_NOSUCCEED, colwinnext[j]);
680
681 SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
682 if(!C_cont->isZero())
683 tomerge.push_back(C_cont);
684
685 if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
686 if(Bownprev != (B.commGrid)->GetRankInProcRow()) delete BRecvNext;
687 }
688 for(int i=0; i< rowwindows.size(); ++i)
689 {
690 MPI_Win_free(&rowwindows[i]);
691 MPI_Win_free(&rowwinnext[i]);
692 }
693 for(int i=0; i< colwindows.size(); ++i)
694 {
695 MPI_Win_free(&colwindows[i]);
696 MPI_Win_free(&colwinnext[i]);
697 }
698 MPI_Barrier(GridC->GetWorld());
699
700 IU C_m = A.spSeq->getnrow();
701 IU C_n = B.spSeq->getncol();
702 DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL); // First get the result in SpTuples, then convert to UDER
703 for(int i=0; i<tomerge.size(); ++i)
704 {
705 delete tomerge[i];
706 }
707 SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
708 SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
709
710 const_cast< UDERB* >(B.spSeq)->Transpose(); // transpose back to original
711 return SpParMat<IU,N_promote,DER_promote> (C, GridC); // return the result object
712 }
713
714 }
715
716 #endif
717