1 /****************************************************************/
2 /* Parallel Combinatorial BLAS Library (for Graph Computations) */
3 /* version 1.6 -------------------------------------------------*/
4 /* date: 6/15/2017 ---------------------------------------------*/
5 /* authors: Ariful Azad, Aydin Buluc  --------------------------*/
6 /****************************************************************/
7 /*
8  Copyright (c) 2010-2017, The Regents of the University of California
9 
10  Permission is hereby granted, free of charge, to any person obtaining a copy
11  of this software and associated documentation files (the "Software"), to deal
12  in the Software without restriction, including without limitation the rights
13  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  copies of the Software, and to permit persons to whom the Software is
15  furnished to do so, subject to the following conditions:
16 
17  The above copyright notice and this permission notice shall be included in
18  all copies or substantial portions of the Software.
19 
20  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  THE SOFTWARE.
27  */
28 
29 
30 #ifndef _PAR_FRIENDS_EXT_H_
31 #define _PAR_FRIENDS_EXT_H_
32 
33 #include "mpi.h"
34 #include <iostream>
35 #include "SpParMat.h"
36 #include "SpParHelper.h"
37 #include "MPIType.h"
38 #include "Friends.h"
39 
40 namespace combblas {
41 
42 template <class IT, class NT, class DER>
43 class SpParMat;
44 
45 /*************************************************************************************************/
46 /**************************** FRIEND FUNCTIONS FOR PARALLEL CLASSES ******************************/
47 /************************** EXTENDED SET (NOT COMMONLY USED FUNCTIONS) ***************************/
48 /*************************************************************************************************/
49 
50 
51 /**
52  * Parallel A = B*C routine that uses one-sided MPI-2 features
53  * General active target syncronization via MPI_Win_Post, MPI_Win_Start, MPI_Win_Complete, MPI_Win_Wait
54  * Tested on my dual core Macbook with 1,4,9,16,25 MPI processes
55  * No memory hog: splits the matrix into two along the column, prefetches the next half matrix while computing on the current one
56  **/
57 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_ActiveTarget(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)58 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_ActiveTarget
59 		(const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
60 
61 {
62 	typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
63 	typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
64 
65 	if(A.getncol() != B.getnrow())
66 	{
67     std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
68 		MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
69 		return SpParMat< IU,N_promote,DER_promote >();
70 	}
71 	int stages, Aoffset, Boffset; 	// stages = inner dimension of matrix blocks
72   std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
73 
74 	IU C_m = A.spSeq->getnrow();
75 	IU C_n = B.spSeq->getncol();
76 
77 	UDERA A1seq, A2seq;
78 	(A.spSeq)->Split( A1seq, A2seq);
79 
80 	// ABAB: It should be able to perform split/merge with the transpose option [single function call]
81 	const_cast< UDERB* >(B.spSeq)->Transpose();
82 
83 	UDERB B1seq, B2seq;
84 	(B.spSeq)->Split( B1seq, B2seq);
85 
86 	// Create row and column windows (collective operation, i.e. everybody exposes its window to others)
87   std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
88 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
89 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
90 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
91 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);
92 
93 	// ABAB: We can optimize the call to create windows in the absence of passive synchronization
94 	// 	MPI_Info info;
95 	// 	MPI_Info_create ( &info );
96 	// 	MPI_Info_set( info, "no_locks", "true" );
97 	// 	MPI_Win_create( . . ., info, . . . );
98 	// 	MPI_Info_free( &info );
99 
100 	IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
101 	IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
102 	IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
103 	IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
104 
105 	SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
106 	SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
107 	SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
108 	SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());
109 
110 	// Remotely fetched matrices are stored as pointers
111 	UDERA * ARecv1, * ARecv2;
112 	UDERB * BRecv1, * BRecv2;
113   std::vector< SpTuples<IU,N_promote>  *> tomerge;
114 
115 	MPI_Group row_group, col_group;
116 	MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
117 	MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);
118 
119 	int Aself = (A.commGrid)->GetRankInProcRow();
120 	int Bself = (B.commGrid)->GetRankInProcCol();
121 
122 #ifdef SPGEMMDEBUG
123 	MPI_Barrier(GridC->GetWorld());
124 	SpParHelper::Print("Writing to file\n");
125   std::ofstream oput;
126 	GridC->OpenDebugFile("deb", oput);
127 	oput << "A1seq: " << A1seq.getnrow() << " " << A1seq.getncol() << " " << A1seq.getnnz() << std::endl;
128 	oput << "A2seq: " << A2seq.getnrow() << " " << A2seq.getncol() << " " << A2seq.getnnz() << std::endl;
129 	oput << "B1seq: " << B1seq.getnrow() << " " << B1seq.getncol() << " " << B1seq.getnnz() << std::endl;
130 	oput << "B2seq: " << B2seq.getnrow() << " " << B2seq.getncol() << " " << B2seq.getnnz() << std::endl;
131 	SpParHelper::Print("Wrote to file\n");
132 	MPI_Barrier(GridC->GetWorld());
133 #endif
134 
135 	SpParHelper::PostExposureEpoch(Aself, rowwins1, row_group);
136 	SpParHelper::PostExposureEpoch(Aself, rowwins2, row_group);
137 	SpParHelper::PostExposureEpoch(Bself, colwins1, col_group);
138 	SpParHelper::PostExposureEpoch(Bself, colwins2, col_group);
139 
140 	MPI_Barrier(GridC->GetWorld());
141 	SpParHelper::Print("Exposure epochs posted\n");
142 	MPI_Barrier(GridC->GetWorld());
143 
144 	int Aowner = (0+Aoffset) % stages;
145 	int Bowner = (0+Boffset) % stages;
146 	SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
147 	SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);	// Start prefetching next half
148 
149 	for(int j=0; j< rowwins1.size(); ++j)	// wait for the first half to complete
150 		rowwins1[j].Complete();
151 
152 	SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
153 	SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);	// Start prefetching next half
154 
155 	for(int j=0; j< colwins1.size(); ++j)
156 		colwins1[j].Complete();
157 
158 	for(int i = 1; i < stages; ++i)
159 	{
160 
161 #ifdef SPGEMMDEBUG
162 		SpParHelper::Print("Stage starting\n");
163 #endif
164 
165 		SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
166 
167 #ifdef SPGEMMDEBUG
168 		SpParHelper::Print("Multiplied\n");
169 #endif
170 
171 		if(!C_cont->isZero())
172 			tomerge.push_back(C_cont);
173 
174 #ifdef SPGEMMDEBUG
175 		SpParHelper::Print("Pushed back\n");
176 		MPI_Barrier(GridC->GetWorld());
177 #endif
178 
179 		bool remoteA = false;
180 		bool remoteB = false;
181 
182 		delete ARecv1;		// free the memory of the previous first half
183 		for(int j=0; j< rowwins2.size(); ++j)	// wait for the previous second half to complete
184 			rowwins2[j].Complete();
185 		SpParHelper::Print("Completed A\n");
186 
187 		delete BRecv1;
188 		for(int j=0; j< colwins2.size(); ++j)	// wait for the previous second half to complete
189 			colwins2[j].Complete();
190 
191 #ifdef SPGEMMDEBUG
192 		SpParHelper::Print("Completed B\n");
193 		MPI_Barrier(GridC->GetWorld());
194 #endif
195 
196 		Aowner = (i+Aoffset) % stages;
197 		Bowner = (i+Boffset) % stages;
198 
199 		// start fetching the current first half
200 		SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
201 		SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
202 
203 #ifdef SPGEMMDEBUG
204 		SpParHelper::Print("Fetched next\n");
205 		MPI_Barrier(GridC->GetWorld());
206 #endif
207 
208 		// while multiplying the already completed previous second halfs
209 		C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
210 		if(!C_cont->isZero())
211 			tomerge.push_back(C_cont);
212 
213 #ifdef SPGEMMDEBUG
214 		SpParHelper::Print("Multiplied and pushed\n");
215 		MPI_Barrier(GridC->GetWorld());
216 #endif
217 
218 		delete ARecv2;		// free memory of the previous second half
219 		delete BRecv2;
220 
221 		for(int j=0; j< rowwins1.size(); ++j)	// wait for the current first half to complte
222 			rowwins1[j].Complete();
223 		for(int j=0; j< colwins1.size(); ++j)
224 			colwins1[j].Complete();
225 
226 #ifdef SPGEMMDEBUG
227 		SpParHelper::Print("Completed next\n");
228 		MPI_Barrier(GridC->GetWorld());
229 #endif
230 
231 		// start prefetching the current second half
232 		SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
233 		SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
234 	}
235 	SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
236 	if(!C_cont->isZero())
237 		tomerge.push_back(C_cont);
238 
239 	delete ARecv1;		// free the memory of the previous first half
240 	for(int j=0; j< rowwins2.size(); ++j)	// wait for the previous second half to complete
241 		rowwins2[j].Complete();
242 	delete BRecv1;
243 	for(int j=0; j< colwins2.size(); ++j)	// wait for the previous second half to complete
244 		colwins2[j].Complete();
245 
246 	C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
247 	if(!C_cont->isZero())
248 		tomerge.push_back(C_cont);
249 
250 	delete ARecv2;
251 	delete BRecv2;
252 
253 	SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
254 	SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
255 	SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
256 	SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);
257 
258 	DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);	// First get the result in SpTuples, then convert to UDER
259 	for(int i=0; i<tomerge.size(); ++i)
260 	{
261 		delete tomerge[i];
262 	}
263 
264 	// MPI_Win_Wait() works like a barrier as it waits for all origins to finish their remote memory operation on "this" window
265 	SpParHelper::WaitNFree(rowwins1);
266 	SpParHelper::WaitNFree(rowwins2);
267 	SpParHelper::WaitNFree(colwins1);
268 	SpParHelper::WaitNFree(colwins2);
269 
270 	(A.spSeq)->Merge(A1seq, A2seq);
271 	(B.spSeq)->Merge(B1seq, B2seq);
272 
273 	MPI_Group_free(&row_group);
274 	MPI_Group_free(&col_group);
275 	const_cast< UDERB* >(B.spSeq)->Transpose();	// transpose back to original
276 	return SpParMat<IU,N_promote,DER_promote> (C, GridC);		// return the result object
277 }
278 
279 /**
280   * Parallel A = B*C routine that uses one-sided MPI-2 features
281   * This function implements an asynchronous 2D algorithm, in the sense that there is no notion of stages.
282   * \n The process that completes its submatrix update, requests subsequent matrices from their owners w/out waiting to sychronize with other processors
283   * \n This partially remedies the severe load balancing problem in sparse matrices.
284   * \n The class uses MPI-2 to achieve one-sided asynchronous communication
285   * \n The algorithm treats each submatrix as a single block
286   * \n Local data structure can be any SpMat that has a constructor with array sizes and getarrs() member
287   * Passive target syncronization via MPI_Win_Lock, MPI_Win_Unlock
288   * No memory hog: splits the matrix into two along the column, prefetches the next half matrix while computing on the current one
289  **/
290 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_PassiveTarget(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)291 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_PassiveTarget
292 		(const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
293 
294 {
295 	typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
296 	typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
297 
298 	if(A.getncol() != B.getnrow())
299 	{
300     std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
301 		MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
302 		return SpParMat< IU,N_promote,DER_promote >();
303 	}
304 	int stages, Aoffset, Boffset; 	// stages = inner dimension of matrix blocks
305   std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
306 
307 	IU C_m = A.spSeq->getnrow();
308 	IU C_n = B.spSeq->getncol();
309 
310 	UDERA A1seq, A2seq;
311 	(A.spSeq)->Split( A1seq, A2seq);
312 
313 	// ABAB: It should be able to perform split/merge with the transpose option [single function call]
314 	const_cast< UDERB* >(B.spSeq)->Transpose();
315 
316 	UDERB B1seq, B2seq;
317 	(B.spSeq)->Split( B1seq, B2seq);
318 
319 	// Create row and column windows (collective operation, i.e. everybody exposes its window to others)
320   std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
321 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
322 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
323 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
324 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);
325 
326 	IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
327 	IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
328 	IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
329 	IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
330 
331 	SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
332 	SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
333 	SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
334 	SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());
335 
336 	// Remotely fetched matrices are stored as pointers
337 	UDERA * ARecv1, * ARecv2;
338 	UDERB * BRecv1, * BRecv2;
339   std::vector< SpTuples<IU,N_promote> *> tomerge;	// sorted triples to be merged
340 
341 	MPI_Group row_group, col_group;
342 	MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
343 	MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);
344 
345 	int Aself = (A.commGrid)->GetRankInProcRow();
346 	int Bself = (B.commGrid)->GetRankInProcCol();
347 
348 	int Aowner = (0+Aoffset) % stages;
349 	int Bowner = (0+Boffset) % stages;
350 
351 	SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
352 	SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);	// Start prefetching next half
353 	SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
354 	SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);	// Start prefetching next half
355 
356 	// Finish the first halfs
357 	SpParHelper::UnlockWindows(Aowner, rowwins1);
358 	SpParHelper::UnlockWindows(Bowner, colwins1);
359 
360 	for(int i = 1; i < stages; ++i)
361 	{
362 		SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
363 
364 		if(!C_cont->isZero())
365 			tomerge.push_back(C_cont);
366 
367 		bool remoteA = false;
368 		bool remoteB = false;
369 
370 		delete ARecv1;		// free the memory of the previous first half
371 		delete BRecv1;
372 
373 		SpParHelper::UnlockWindows(Aowner, rowwins2);	// Finish the second half
374 		SpParHelper::UnlockWindows(Bowner, colwins2);
375 
376 		Aowner = (i+Aoffset) % stages;
377 		Bowner = (i+Boffset) % stages;
378 
379 		// start fetching the current first half
380 		SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
381 		SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
382 
383 		// while multiplying the already completed previous second halfs
384 		C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
385 		if(!C_cont->isZero())
386 			tomerge.push_back(C_cont);
387 
388 		delete ARecv2;		// free memory of the previous second half
389 		delete BRecv2;
390 
391 		// wait for the current first half to complte
392 		SpParHelper::UnlockWindows(Aowner, rowwins1);
393 		SpParHelper::UnlockWindows(Bowner, colwins1);
394 
395 		// start prefetching the current second half
396 		SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
397 		SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
398 	}
399 
400 	SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
401 	if(!C_cont->isZero())
402 		tomerge.push_back(C_cont);
403 
404 	delete ARecv1;		// free the memory of the previous first half
405 	delete BRecv1;
406 
407 	SpParHelper::UnlockWindows(Aowner, rowwins2);
408 	SpParHelper::UnlockWindows(Bowner, colwins2);
409 
410 	C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
411 	if(!C_cont->isZero())
412 		tomerge.push_back(C_cont);
413 
414 	delete ARecv2;
415 	delete BRecv2;
416 
417 	SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
418 	SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
419 	SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
420 	SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);
421 
422 	DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);	// First get the result in SpTuples, then convert to UDER
423 	for(int i=0; i<tomerge.size(); ++i)
424 	{
425 		delete tomerge[i];
426 	}
427 
428 	SpParHelper::FreeWindows(rowwins1);
429 	SpParHelper::FreeWindows(rowwins2);
430 	SpParHelper::FreeWindows(colwins1);
431 	SpParHelper::FreeWindows(colwins2);
432 
433 	(A.spSeq)->Merge(A1seq, A2seq);
434 	(B.spSeq)->Merge(B1seq, B2seq);
435 
436 	MPI_Group_free(&row_group);
437 	MPI_Group_free(&col_group);
438 	const_cast< UDERB* >(B.spSeq)->Transpose();	// transpose back to original
439 	return SpParMat<IU,N_promote,DER_promote> (C, GridC);		// return the result object
440 }
441 
442 /**
443  * Parallel A = B*C routine that uses one-sided MPI-2 features
444  * Syncronization is through MPI_Win_Fence
445  * Buggy as of September, 2009
446  **/
447 template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
Mult_AnXBn_Fence(const SpParMat<IU,NU1,UDERA> & A,const SpParMat<IU,NU2,UDERB> & B)448 SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_Fence
449 		(const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B )
450 {
451 	typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
452 	typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;
453 
454 	if(A.getncol() != B.getnrow())
455 	{
456     std::cout<<"Can not multiply, dimensions does not match"<<std::endl;
457 		MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
458 		return SpParMat< IU,N_promote,DER_promote >();
459 	}
460 
461 	int stages, Aoffset, Boffset; 	// stages = inner dimension of matrix blocks
462   std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);
463 
464   std::ofstream oput;
465 	GridC->OpenDebugFile("deb", oput);
466 	const_cast< UDERB* >(B.spSeq)->Transpose();
467 
468 	// set row & col window handles
469   std::vector<MPI_Win> rowwindows, colwindows;
470   std::vector<MPI_Win> rowwinnext, colwinnext;
471 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwindows);
472 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwindows);
473 	SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwinnext);
474 	SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwinnext);
475 
476 	IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
477 	IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
478 
479 	SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
480 	SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());
481 
482 	UDERA * ARecv, * ARecvNext;
483 	UDERB * BRecv, * BRecvNext;
484   std::vector< SpTuples<IU,N_promote>  *> tomerge;
485 
486 	// Prefetch first
487 	for(int j=0; j< rowwindows.size(); ++j)
488 		MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwindows[j]);
489 	for(int j=0; j< colwindows.size(); ++j)
490 		MPI_Win_fence(MPI_MODE_NOPRECEDE, colwindows[j]);
491 
492 	for(int j=0; j< rowwinnext.size(); ++j)
493 		MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwinnext[j]);
494 	for(int j=0; j< colwinnext.size(); ++j)
495 		MPI_Win_fence(MPI_MODE_NOPRECEDE, colwinnext[j]);
496 
497 
498 	int Aownind = (0+Aoffset) % stages;
499 	int Bownind = (0+Boffset) % stages;
500 	if(Aownind == (A.commGrid)->GetRankInProcRow())
501 	{
502 		ARecv = A.spSeq;	// shallow-copy
503 	}
504 	else
505 	{
506     std::vector<IU> ess1(UDERA::esscount);		// pack essentials to a vector
507 		for(int j=0; j< UDERA::esscount; ++j)
508 		{
509 			ess1[j] = ARecvSizes[j][Aownind];
510 		}
511 		ARecv = new UDERA();	// create the object first
512 
513 		oput << "For A (out), Fetching " << (void*)rowwindows[0] << std::endl;
514 		SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind);	// fetch its elements later
515 	}
516 	if(Bownind == (B.commGrid)->GetRankInProcCol())
517 	{
518 		BRecv = B.spSeq;	// shallow-copy
519 	}
520 	else
521 	{
522     std::vector<IU> ess2(UDERB::esscount);		// pack essentials to a vector
523 		for(int j=0; j< UDERB::esscount; ++j)
524 		{
525 			ess2[j] = BRecvSizes[j][Bownind];
526 		}
527 		BRecv = new UDERB();
528 
529 		oput << "For B (out), Fetching " << (void*)colwindows[0] << std::endl;
530 		SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind);	// No lock version, only get !
531 	}
532 
533 	int Aownprev = Aownind;
534 	int Bownprev = Bownind;
535 
536 	for(int i = 1; i < stages; ++i)
537 	{
538 		Aownind = (i+Aoffset) % stages;
539 		Bownind = (i+Boffset) % stages;
540 
541 		if(i % 2 == 1)	// Fetch RecvNext via winnext, fence on Recv via windows
542 		{
543 			if(Aownind == (A.commGrid)->GetRankInProcRow())
544 			{
545 				ARecvNext = A.spSeq;	// shallow-copy
546 			}
547 			else
548 			{
549         std::vector<IU> ess1(UDERA::esscount);		// pack essentials to a vector
550 				for(int j=0; j< UDERA::esscount; ++j)
551 				{
552 					ess1[j] = ARecvSizes[j][Aownind];
553 				}
554 				ARecvNext = new UDERA();	// create the object first
555 
556 				oput << "For A, Fetching " << (void*) rowwinnext[0] << std::endl;
557 				SpParHelper::FetchMatrix(*ARecvNext, ess1, rowwinnext, Aownind);
558 			}
559 
560 			if(Bownind == (B.commGrid)->GetRankInProcCol())
561 			{
562 				BRecvNext = B.spSeq;	// shallow-copy
563 			}
564 			else
565 			{
566         std::vector<IU> ess2(UDERB::esscount);		// pack essentials to a vector
567 				for(int j=0; j< UDERB::esscount; ++j)
568 				{
569 					ess2[j] = BRecvSizes[j][Bownind];
570 				}
571 				BRecvNext = new UDERB();
572 
573 				oput << "For B, Fetching " << (void*)colwinnext[0] << std::endl;
574 				SpParHelper::FetchMatrix(*BRecvNext, ess2, colwinnext, Bownind);	// No lock version, only get !
575 			}
576 
577 			oput << "Fencing " << (void*) rowwindows[0] << std::endl;
578 			oput << "Fencing " << (void*) colwindows[0] << std::endl;
579 
580 			for(int j=0; j< rowwindows.size(); ++j)
581 				MPI_Win_fence(MPI_MODE_NOSTORE, rowwindows[j]);		// Synch using "other" windows
582 			for(int j=0; j< colwindows.size(); ++j)
583 				MPI_Win_fence(MPI_MODE_NOSTORE, colwindows[j]);
584 
585 			SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
586 			if(!C_cont->isZero())
587 				tomerge.push_back(C_cont);
588 
589 			if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
590 			if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecv;
591 
592 			Aownprev = Aownind;
593 			Bownprev = Bownind;
594 		}
595 		else	// fetch to Recv via windows, fence on RecvNext via winnext
596 		{
597 
598 			if(Aownind == (A.commGrid)->GetRankInProcRow())
599 			{
600 				ARecv = A.spSeq;	// shallow-copy
601 			}
602 			else
603 			{
604         std::vector<IU> ess1(UDERA::esscount);		// pack essentials to a vector
605 				for(int j=0; j< UDERA::esscount; ++j)
606 				{
607 					ess1[j] = ARecvSizes[j][Aownind];
608 				}
609 				ARecv = new UDERA();	// create the object first
610 
611 				oput << "For A, Fetching " << (void*) rowwindows[0] << std::endl;
612 				SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind);
613 			}
614 
615 			if(Bownind == (B.commGrid)->GetRankInProcCol())
616 			{
617 				BRecv = B.spSeq;	// shallow-copy
618 			}
619 			else
620 			{
621         std::vector<IU> ess2(UDERB::esscount);		// pack essentials to a vector
622 				for(int j=0; j< UDERB::esscount; ++j)
623 				{
624 					ess2[j] = BRecvSizes[j][Bownind];
625 				}
626 				BRecv = new UDERB();
627 
628 				oput << "For B, Fetching " << (void*)colwindows[0] << std::endl;
629 				SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind);	// No lock version, only get !
630 			}
631 
632 			oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
633 			oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
634 
635 			for(int j=0; j< rowwinnext.size(); ++j)
636 				MPI_Win_fence(MPI_MODE_NOSTORE, rowwinnext[j]);		// Synch using "other" windows
637 			for(int j=0; j< colwinnext.size(); ++j)
638 				MPI_Win_fence(MPI_MODE_NOSTORE, colwinnext[j]);
639 
640 			SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
641 			if(!C_cont->isZero())
642 				tomerge.push_back(C_cont);
643 
644 
645 			if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
646 			if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecvNext;
647 
648 			Aownprev = Aownind;
649 			Bownprev = Bownind;
650 		}
651 
652 	}
653 
654 	if(stages % 2 == 1)	// fence on Recv via windows
655 	{
656 		oput << "Fencing " << (void*) rowwindows[0] << std::endl;
657 		oput << "Fencing " << (void*) colwindows[0] << std::endl;
658 
659 		for(int j=0; j< rowwindows.size(); ++j)
660 			MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwindows[j]);		// Synch using "prev" windows
661 		for(int j=0; j< colwindows.size(); ++j)
662 			MPI_Win_fence(MPI_MODE_NOSUCCEED, colwindows[j]);
663 
664 		SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
665 		if(!C_cont->isZero())
666 			tomerge.push_back(C_cont);
667 
668 		if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
669 		if(Bownprev != (B.commGrid)->GetRankInProcRow()) delete BRecv;
670 	}
671 	else		// fence on RecvNext via winnext
672 	{
673 		oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
674 		oput << "Fencing " << (void*) colwinnext[0] << std::endl;
675 
676 		for(int j=0; j< rowwinnext.size(); ++j)
677 			MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwinnext[j]);		// Synch using "prev" windows
678 		for(int j=0; j< colwinnext.size(); ++j)
679 			MPI_Win_fence(MPI_MODE_NOSUCCEED, colwinnext[j]);
680 
681 		SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
682 		if(!C_cont->isZero())
683 			tomerge.push_back(C_cont);
684 
685 		if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
686 		if(Bownprev != (B.commGrid)->GetRankInProcRow()) delete BRecvNext;
687 	}
688 	for(int i=0; i< rowwindows.size(); ++i)
689 	{
690 		MPI_Win_free(&rowwindows[i]);
691 		MPI_Win_free(&rowwinnext[i]);
692 	}
693 	for(int i=0; i< colwindows.size(); ++i)
694 	{
695 		MPI_Win_free(&colwindows[i]);
696 		MPI_Win_free(&colwinnext[i]);
697 	}
698 	MPI_Barrier(GridC->GetWorld());
699 
700 	IU C_m = A.spSeq->getnrow();
701 	IU C_n = B.spSeq->getncol();
702 	DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);	// First get the result in SpTuples, then convert to UDER
703 	for(int i=0; i<tomerge.size(); ++i)
704 	{
705 		delete tomerge[i];
706 	}
707 	SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
708 	SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);
709 
710 	const_cast< UDERB* >(B.spSeq)->Transpose();	// transpose back to original
711 	return SpParMat<IU,N_promote,DER_promote> (C, GridC);			// return the result object
712 }
713 
714 }
715 
716 #endif
717