/****************************************************************/
/* Parallel Combinatorial BLAS Library (for Graph Computations) */
/* version 1.6 -------------------------------------------------*/
/* date: 6/15/2017 ---------------------------------------------*/
/* authors: Ariful Azad, Aydin Buluc  --------------------------*/
/****************************************************************/
/*
 Copyright (c) 2010-2017, The Regents of the University of California

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

#include "FullyDistVec.h"
#include "FullyDistSpVec.h"
#include "Operations.h"

namespace combblas {

template <class IT, class NT>
FullyDistVec<IT, NT>::FullyDistVec ()
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>()
{
}

template <class IT, class NT>
FullyDistVec<IT, NT>::FullyDistVec (IT globallen, NT initval)
:FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(globallen)
{
	arr.resize(MyLocLength(), initval);
}
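// Usage sketch (illustrative only, not compiled into the library), assuming MPI has been
// initialized and the CombBLAS headers are included:
//
//   combblas::FullyDistVec<int64_t, double> x(1000, 0.0);   // global length 1000, all zeros,
//                                                           // distributed over MPI_COMM_WORLD
//
// The overloads below take an explicit std::shared_ptr<CommGrid> when the vector should live
// on a specific processor grid; each processor then holds MyLocLength() consecutive entries.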


template <class IT, class NT>
FullyDistVec<IT, NT>::FullyDistVec ( std::shared_ptr<CommGrid> grid)
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(grid)
{ }

template <class IT, class NT>
FullyDistVec<IT, NT>::FullyDistVec ( std::shared_ptr<CommGrid> grid, IT globallen, NT initval)
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(grid,globallen)
{
	arr.resize(MyLocLength(), initval);
}

template <class IT, class NT>
FullyDistVec<IT,NT>::FullyDistVec (const FullyDistSpVec<IT,NT> & rhs)		// Conversion copy-constructor
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(rhs.commGrid,rhs.glen)
{
	*this = rhs;
}


template <class IT, class NT>
template <class ITRHS, class NTRHS>
FullyDistVec<IT, NT>::FullyDistVec ( const FullyDistVec<ITRHS, NTRHS>& rhs )
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(rhs.commGrid, static_cast<IT>(rhs.glen))
{
	arr.resize(static_cast<IT>(rhs.arr.size()), NT());

	for(IT i=0; (unsigned)i < arr.size(); ++i)
	{
		arr[i] = static_cast<NT>(rhs.arr[static_cast<ITRHS>(i)]);
	}
}
/**
  * Initialize a FullyDistVec with a separate vector from each processor.
  * Optimized for the common case where the fillarr vectors on all processors are of the same size.
  */
template <class IT, class NT>
FullyDistVec<IT, NT>::FullyDistVec ( const std::vector<NT> & fillarr, std::shared_ptr<CommGrid> grid )
: FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>(grid)
{
	MPI_Comm World = commGrid->GetWorld();
	int nprocs = commGrid->GetSize();
	int rank = commGrid->GetRank();

	IT * sizes = new IT[nprocs];
	IT nsize = fillarr.size();
	sizes[rank] = nsize;
	MPI_Allgather(MPI_IN_PLACE, 1, MPIType<IT>(), sizes, 1, MPIType<IT>(), World);
	glen = std::accumulate(sizes, sizes+nprocs, static_cast<IT>(0));

	std::vector<IT> uniq_sizes;
	std::unique_copy(sizes, sizes+nprocs, std::back_inserter(uniq_sizes));
	if(uniq_sizes.size() == 1)
	{
		arr = fillarr;
	}
	else
	{
		IT lengthuntil = std::accumulate(sizes, sizes+rank, static_cast<IT>(0));

		// Although this vector is not redistributed yet, its glen and commGrid are already set,
		// so we can call the Owner/MyLocLength/LengthUntil functions (to infer the final distribution)

		// rebalance/redistribute
		int * sendcnt = new int[nprocs];
		std::fill(sendcnt, sendcnt+nprocs, 0);
		for(IT i=0; i<nsize; ++i)
		{
			IT locind;
			int owner = Owner(i+lengthuntil, locind);
			++sendcnt[owner];
		}
		int * recvcnt = new int[nprocs];
		MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World); // share the counts

		int * sdispls = new int[nprocs];
		int * rdispls = new int[nprocs];
		sdispls[0] = 0;
		rdispls[0] = 0;
		for(int i=0; i<nprocs-1; ++i)
		{
			sdispls[i+1] = sdispls[i] + sendcnt[i];
			rdispls[i+1] = rdispls[i] + recvcnt[i];
		}
		IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
		std::vector<NT> recvbuf(totrecv);

		// fillarr is already in the globally correct order; scatter it into the final distribution
		MPI_Alltoallv(fillarr.data(), sendcnt, sdispls, MPIType<NT>(), recvbuf.data(), recvcnt, rdispls, MPIType<NT>(), World);
		arr.swap(recvbuf);
		DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
	}
	delete [] sizes;
}
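// Usage sketch (illustrative, not part of the library): every rank contributes its own local
// std::vector, and the constructor glues them into one distributed vector, redistributing only
// if the local sizes differ. Assumes MPI is initialized and myrank comes from MPI_Comm_rank.
//
//   std::shared_ptr<combblas::CommGrid> fullWorld(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0));
//   std::vector<double> mylocal(100, static_cast<double>(myrank));  // 100 entries per rank
//   combblas::FullyDistVec<int64_t, double> v(mylocal, fullWorld);  // global length = 100 * nprocs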


template <class IT, class NT>
std::pair<IT, NT> FullyDistVec<IT,NT>::MinElement() const
{
    auto it = min_element(arr.begin(), arr.end());
    NT localMin = *it;
    NT globalMin;
    MPI_Allreduce( &localMin, &globalMin, 1, MPIType<NT>(), MPI_MIN, commGrid->GetWorld());

    IT localMinIdx = TotalLength();
    if(globalMin==localMin)
    {
        localMinIdx = distance(arr.begin(), it) + LengthUntil();
    }
    IT globalMinIdx;
    MPI_Allreduce( &localMinIdx, &globalMinIdx, 1, MPIType<IT>(), MPI_MIN, commGrid->GetWorld()); // MPI_MIN breaks ties by returning the smallest global index that holds the minimum

    return std::make_pair(globalMinIdx, globalMin);
}
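// Example (illustrative sketch): MinElement is collective and returns the same
// (global index, value) pair on every processor.
//
//   std::pair<int64_t, double> m = x.MinElement();
//   // m.first  : a global index holding the minimum (smallest such index on ties)
//   // m.second : the minimum value itself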


template <class IT, class NT>
template <typename _BinaryOperation>
NT FullyDistVec<IT,NT>::Reduce(_BinaryOperation __binary_op, NT identity) const
{
	// std::accumulate returns the identity for empty sequences
	NT localsum = std::accumulate( arr.begin(), arr.end(), identity, __binary_op);

	NT totalsum = identity;
	MPI_Allreduce( &localsum, &totalsum, 1, MPIType<NT>(), MPIOp<_BinaryOperation, NT>::op(), commGrid->GetWorld());
	return totalsum;
}

template <class IT, class NT>
template <typename OUT, typename _BinaryOperation, typename _UnaryOperation>
OUT FullyDistVec<IT,NT>::Reduce(_BinaryOperation __binary_op, OUT default_val, _UnaryOperation __unary_op) const
{
	// the local sum stays default_val for empty sequences
	OUT localsum = default_val;

	if (arr.size() > 0)
	{
		typename std::vector< NT >::const_iterator iter = arr.begin();
		//localsum = __unary_op(*iter);
		//iter++;
		while (iter < arr.end())
		{
			localsum = __binary_op(localsum, __unary_op(*iter));
			iter++;
		}
	}

	OUT totalsum = default_val;
	MPI_Allreduce( &localsum, &totalsum, 1, MPIType<OUT>(), MPIOp<_BinaryOperation, OUT>::op(), commGrid->GetWorld());
	return totalsum;
}
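// Example (illustrative): the two Reduce overloads compute a global reduction; the second
// applies a unary map first, which is handy when the reduction type differs from NT.
// Both calls are collective and return the same value on every processor.
//
//   double sum   = x.Reduce(std::plus<double>(), 0.0);                  // sum of all entries
//   double sumsq = x.Reduce(std::plus<double>(), 0.0,
//                           [](double v) { return v * v; });            // sum of squares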


//! ABAB: Put concept check, NT should be integer for this to make sense
template<class IT, class NT>
void FullyDistVec<IT,NT>::SelectCandidates(double nver)
{
#ifdef DETERMINISTIC
	MTRand M(1);
#else
	MTRand M;	// generate random numbers with Mersenne Twister
#endif

	IT length = TotalLength();
	std::vector<double> loccands(length);
	std::vector<NT> loccandints(length);
	MPI_Comm World = commGrid->GetWorld();
	int myrank = commGrid->GetRank();
	if(myrank == 0)
	{
		for(IT i=0; i<length; ++i)
			loccands[i] = M.rand();
		std::transform(loccands.begin(), loccands.end(), loccands.begin(),
				[nver](double x) { return x * nver; });	// scale the uniform samples to [0, nver)

		for(IT i=0; i<length; ++i)
			loccandints[i] = static_cast<NT>(loccands[i]);
	}
	MPI_Bcast(&(loccandints[0]), length, MPIType<NT>(),0, World);
	for(IT i=0; i<length; ++i)
		SetElement(i,loccandints[i]);
}

template <class IT, class NT>
template <class ITRHS, class NTRHS>
FullyDistVec< IT,NT > &  FullyDistVec<IT,NT>::operator=(const FullyDistVec< ITRHS,NTRHS > & rhs)
{
	if(static_cast<const void*>(this) != static_cast<const void*>(&rhs))
	{
		//FullyDist<IT,NT>::operator= (rhs);	// to update glen and commGrid
		glen = static_cast<IT>(rhs.glen);
		commGrid = rhs.commGrid;

		arr.resize(rhs.arr.size(), NT());
		for(IT i=0; (unsigned)i < arr.size(); ++i)
		{
			arr[i] = static_cast<NT>(rhs.arr[static_cast<ITRHS>(i)]);
		}
	}
	return *this;
}

template <class IT, class NT>
FullyDistVec< IT,NT > &  FullyDistVec<IT,NT>::operator=(const FullyDistVec< IT,NT > & rhs)
{
	if(this != &rhs)
	{
		FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>::operator= (rhs);	// to update glen and commGrid
		arr = rhs.arr;
	}
	return *this;
}

template <class IT, class NT>
FullyDistVec< IT,NT > &  FullyDistVec<IT,NT>::operator=(const FullyDistSpVec< IT,NT > & rhs)		// FullyDistSpVec->FullyDistVec conversion operator
{
	FullyDist<IT,NT,typename combblas::disable_if< combblas::is_boolean<NT>::value, NT >::type>::operator= (rhs);	// to update glen and commGrid
	arr.resize(rhs.MyLocLength());
	std::fill(arr.begin(), arr.end(), NT());

	IT spvecsize = rhs.getlocnnz();
	for(IT i=0; i< spvecsize; ++i)
	{
		//if(rhs.ind[i] > arr.size())
		//	cout << "rhs.ind[i]: " << rhs.ind[i] <<  endl;
		arr[rhs.ind[i]] = rhs.num[i];
	}
	return *this;
}



/**
 ** Let the compiler create an assignment operator and call base class'
 ** assignment operator automatically
 **/


template <class IT, class NT>
FullyDistVec< IT,NT > &  FullyDistVec<IT,NT>::operator+=(const FullyDistSpVec< IT,NT > & rhs)
{
	IT spvecsize = rhs.getlocnnz();
	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	for(IT i=0; i< spvecsize; ++i)
	{
		if(arr[rhs.ind[i]] == NT()) // not set before
			arr[rhs.ind[i]] = rhs.num[i];
		else
			arr[rhs.ind[i]] += rhs.num[i];
	}
	return *this;
}



template <class IT, class NT>
FullyDistVec< IT,NT > &  FullyDistVec<IT,NT>::operator-=(const FullyDistSpVec< IT,NT > & rhs)
{
	IT spvecsize = rhs.getlocnnz();
	for(IT i=0; i< spvecsize; ++i)
	{
		arr[rhs.ind[i]] -= rhs.num[i];
	}
	return *this;
}


/**
  * Perform __binary_op(*this[i], rhs[i]) for every element in rhs and *this,
  * which are of the same size, and write the result back to *this
  */
template <class IT, class NT>
template <typename _BinaryOperation>
void FullyDistVec<IT,NT>::EWise(const FullyDistVec<IT,NT> & rhs,  _BinaryOperation __binary_op)
{
	std::transform ( arr.begin(), arr.end(), rhs.arr.begin(), arr.begin(), __binary_op );
}

/**
 * Perform __binary_op(*this[i], rhs[i]) for every element in rhs and *this, which are of the same size,
 * and write the result to the output vector, hence the name EWiseOut
 */
template <class IT, class NT>
template <typename _BinaryOperation, typename OUT>
void FullyDistVec<IT,NT>::EWiseOut(const FullyDistVec<IT,NT> & rhs, _BinaryOperation __binary_op, FullyDistVec<IT,OUT> & result)
{
	std::transform ( arr.begin(), arr.end(), rhs.arr.begin(), result.arr.begin(), __binary_op );
}
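// Example (sketch): EWise updates *this in place, EWiseOut writes into a third vector;
// all vectors involved are assumed to share the same CommGrid and length.
//
//   combblas::FullyDistVec<int64_t, double> z(fullWorld, x.TotalLength(), 0.0);
//   x.EWiseOut(y, std::multiplies<double>(), z);   // z[i] = x[i] * y[i]
//   x.EWise(y, std::plus<double>());               // x[i] = x[i] + y[i]
//
// (x and y stand for FullyDistVec<int64_t,double> objects living on the same grid.)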


template <class IT, class NT>
FullyDistVec<IT,NT> & FullyDistVec<IT, NT>::operator+=(const FullyDistVec<IT,NT> & rhs)
{
	if(this != &rhs)
	{
		if(!(*commGrid == *rhs.commGrid))
		{
			std::cout << "Grids are not comparable for elementwise addition" << std::endl;
			MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
		}
		else
		{
			EWise(rhs, std::plus<NT>());
		}
	}
	return *this;
}

template <class IT, class NT>
FullyDistVec<IT,NT> & FullyDistVec<IT, NT>::operator-=(const FullyDistVec<IT,NT> & rhs)
{
	if(this != &rhs)
	{
		if(!(*commGrid == *rhs.commGrid))
		{
			std::cout << "Grids are not comparable for elementwise subtraction" << std::endl;
			MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
		}
		else
		{
			EWise(rhs, std::minus<NT>());
		}
	}
	return *this;
}

template <class IT, class NT>
bool FullyDistVec<IT,NT>::operator==(const FullyDistVec<IT,NT> & rhs) const
{
	ErrorTolerantEqual<NT> epsilonequal;
	int local = 1;
	local = (int) std::equal(arr.begin(), arr.end(), rhs.arr.begin(), epsilonequal );
	int whole = 1;
	MPI_Allreduce( &local, &whole, 1, MPI_INT, MPI_BAND, commGrid->GetWorld());
	return static_cast<bool>(whole);
}

template <class IT, class NT>
template <typename _Predicate>
IT FullyDistVec<IT,NT>::Count(_Predicate pred) const
{
	IT local = count_if( arr.begin(), arr.end(), pred );
	IT whole = 0;
	MPI_Allreduce( &local, &whole, 1, MPIType<IT>(), MPI_SUM, commGrid->GetWorld());
	return whole;
}
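// Example (illustrative): Count is collective and accepts any unary predicate.
//
//   int64_t npos = x.Count([](double v) { return v > 0.0; });  // global number of positive entries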

//! Returns a dense vector of global indices
//! for which the predicate is satisfied
template <class IT, class NT>
template <typename _Predicate>
FullyDistVec<IT,IT> FullyDistVec<IT,NT>::FindInds(_Predicate pred) const
{
	FullyDistVec<IT,IT> found(commGrid);
	MPI_Comm World = commGrid->GetWorld();
	int nprocs = commGrid->GetSize();
	int rank = commGrid->GetRank();

	IT sizelocal = LocArrSize();
	IT sizesofar = LengthUntil();
	for(IT i=0; i<sizelocal; ++i)
	{
		if(pred(arr[i]))
		{
			found.arr.push_back(i+sizesofar);
		}
	}
	IT * dist = new IT[nprocs];
	IT nsize = found.arr.size();
	dist[rank] = nsize;
	MPI_Allgather(MPI_IN_PLACE, 1, MPIType<IT>(), dist, 1, MPIType<IT>(), World);
	IT lengthuntil = std::accumulate(dist, dist+rank, static_cast<IT>(0));
	found.glen = std::accumulate(dist, dist+nprocs, static_cast<IT>(0));

	// Although the found vector is not reshuffled yet, its glen and commGrid are set
	// We can call the Owner/MyLocLength/LengthUntil functions (to infer future distribution)

	// rebalance/redistribute
	int * sendcnt = new int[nprocs];
	std::fill(sendcnt, sendcnt+nprocs, 0);
	for(IT i=0; i<nsize; ++i)
	{
		IT locind;
		int owner = found.Owner(i+lengthuntil, locind);
		++sendcnt[owner];
	}
	int * recvcnt = new int[nprocs];
	MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World); // share the counts

	int * sdispls = new int[nprocs];
	int * rdispls = new int[nprocs];
	sdispls[0] = 0;
	rdispls[0] = 0;
	for(int i=0; i<nprocs-1; ++i)
	{
		sdispls[i+1] = sdispls[i] + sendcnt[i];
		rdispls[i+1] = rdispls[i] + recvcnt[i];
	}
	IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
	std::vector<IT> recvbuf(totrecv);

	// data is already in the right order in found.arr
	MPI_Alltoallv(&(found.arr[0]), sendcnt, sdispls, MPIType<IT>(), &(recvbuf[0]), recvcnt, rdispls, MPIType<IT>(), World);
	found.arr.swap(recvbuf);
	delete [] dist;
	DeleteAll(sendcnt, recvcnt, sdispls, rdispls);

	return found;
}


//! Requires no communication because FullyDistSpVec (the return object)
//! is distributed based on length, not nonzero counts
template <class IT, class NT>
template <typename _Predicate>
FullyDistSpVec<IT,NT> FullyDistVec<IT,NT>::Find(_Predicate pred) const
{
	FullyDistSpVec<IT,NT> found(commGrid);
	size_t size = arr.size();
	for(size_t i=0; i<size; ++i)
	{
		if(pred(arr[i]))
		{
			found.ind.push_back( (IT) i);
			found.num.push_back(arr[i]);
		}
	}
	found.glen = glen;
	return found;
}
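// Example (sketch): Find returns a sparse vector of the matching values and keeps the dense
// distribution, while FindInds (above) returns a dense, load-balanced vector of global indices.
//
//   combblas::FullyDistSpVec<int64_t, double> nz  = x.Find([](double v) { return v != 0.0; });
//   combblas::FullyDistVec<int64_t, int64_t>  idx = x.FindInds([](double v) { return v != 0.0; });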


//! Retain a sparse vector with indices where the supplied value is found
template <class IT, class NT>
FullyDistSpVec<IT,NT> FullyDistVec<IT,NT>::Find(NT val) const
{
    FullyDistSpVec<IT,NT> found(commGrid);
    size_t size = arr.size();
    for(size_t i=0; i<size; ++i)
    {
        if(arr[i]==val)
        {
            found.ind.push_back( (IT) i);
            found.num.push_back(val);
        }
    }
    found.glen = glen;
    return found;
}


template <class IT, class NT>
template <class HANDLER>
std::ifstream& FullyDistVec<IT,NT>::ReadDistribute (std::ifstream& infile, int master, HANDLER handler)
{
	FullyDistSpVec<IT,NT> tmpSpVec(commGrid);
	tmpSpVec.ReadDistribute(infile, master, handler);

	*this = tmpSpVec;
	return infile;
}

template <class IT, class NT>
template <class HANDLER>
void FullyDistVec<IT,NT>::SaveGathered(std::ofstream& outfile, int master, HANDLER handler, bool printProcSplits)
{
	FullyDistSpVec<IT,NT> tmpSpVec = *this;
	tmpSpVec.SaveGathered(outfile, master, handler, printProcSplits);
}

template <class IT, class NT>
void FullyDistVec<IT,NT>::SetElement (IT indx, NT numx)
{
	int rank = commGrid->GetRank();
	if (glen == 0)
	{
		if(rank == 0)
			std::cout << "FullyDistVec::SetElement can't be called on an empty vector." << std::endl;
		return;
	}
	IT locind;
	int owner = Owner(indx, locind);
	if(commGrid->GetRank() == owner)
	{
		if (locind > (LocArrSize() -1))
		{
			std::cout << "FullyDistVec::SetElement cannot expand array" << std::endl;
		}
		else if (locind < 0)
		{
			std::cout << "FullyDistVec::SetElement local index < 0" << std::endl;
		}
		else
		{
			arr[locind] = numx;
		}
	}
}

template <class IT, class NT>
NT FullyDistVec<IT,NT>::GetElement (IT indx) const
{
	NT ret;
	MPI_Comm World = commGrid->GetWorld();
	int rank = commGrid->GetRank();
	if (glen == 0)
	{
		if(rank == 0)
			std::cout << "FullyDistVec::GetElement can't be called on an empty vector." << std::endl;

		return NT();
	}
	IT locind;
	int owner = Owner(indx, locind);
	if(commGrid->GetRank() == owner)
	{
		if (locind > (LocArrSize() -1))
		{
			std::cout << "FullyDistVec::GetElement local index > size" << std::endl;
			ret = NT();
		}
		else if (locind < 0)
		{
			std::cout << "FullyDistVec::GetElement local index < 0" << std::endl;
			ret = NT();
		}
		else
		{
			ret = arr[locind];
		}
	}
	MPI_Bcast(&ret, 1, MPIType<NT>(), owner, World);
	return ret;
}
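// Example (illustrative): SetElement/GetElement address a single entry by its global index.
// Both are collective: GetElement broadcasts the value, so every rank must call it.
//
//   x.SetElement(42, 3.14);        // only the owner actually writes
//   double v = x.GetElement(42);   // every rank receives 3.14
//
// For bulk updates, the sparse-vector based operators (+=, Set, GSet) are preferable to
// per-element calls, which pay a broadcast or owner lookup per access.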

// Prints the vector by writing it to a temporary file with MPI-IO,
// then having processor 0 read the file back and print it
template <class IT, class NT>
void FullyDistVec<IT,NT>::DebugPrint()
{
	int nprocs, rank;
	MPI_Comm World = commGrid->GetWorld();
	MPI_Comm_rank(World, &rank);
	MPI_Comm_size(World, &nprocs);
	MPI_File thefile;
	char _fn[] = "temp_fullydistvec"; // AL: use a mutable buffer because C++ string literals are const char* while MPI-2 takes char*, which would otherwise trigger a const warning
	MPI_File_open(World, _fn, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &thefile);
	IT lengthuntil = LengthUntil();

	// The disp displacement argument specifies the position
	// (absolute offset in bytes from the beginning of the file)
	char native[] = "native"; // AL: again, a mutable buffer avoids the const warning
	MPI_File_set_view(thefile, int64_t(lengthuntil * sizeof(NT)), MPIType<NT>(), MPIType<NT>(), native, MPI_INFO_NULL);

	IT count = LocArrSize();
	MPI_File_write(thefile, &(arr[0]), count, MPIType<NT>(), MPI_STATUS_IGNORE);
	MPI_File_close(&thefile);

	// Now let processor-0 read the file and print
	IT * counts = new IT[nprocs];
	MPI_Gather(&count, 1, MPIType<IT>(), counts, 1, MPIType<IT>(), 0, World);	// gather at root=0
	if(rank == 0)
	{
		FILE * f = fopen("temp_fullydistvec", "r");
		if(!f)
		{
			std::cerr << "Problem reading binary input file\n";
			delete [] counts;
			return;
		}
		IT maxd = *std::max_element(counts, counts+nprocs);
		NT * data = new NT[maxd];

		for(int i=0; i<nprocs; ++i)
		{
			// read counts[i] values and print them
			size_t result = fread(data, sizeof(NT), counts[i],f);
			if (result != (unsigned)counts[i]) { std::cout << "Error in fread, only " << result << " entries read" << std::endl; }

			std::cout << "Elements stored on proc " << i << ": {" ;
			for (int j = 0; j < counts[i]; j++)
			{
				std::cout << data[j] << ",";
			}
			std::cout << "}" << std::endl;
		}
		fclose(f);
		delete [] data;
	}
	delete [] counts;
}

template <class IT, class NT>
template <typename _UnaryOperation, typename IRRELEVANT_NT>
void FullyDistVec<IT,NT>::Apply(_UnaryOperation __unary_op, const FullyDistSpVec<IT,IRRELEVANT_NT> & mask)
{
	typename std::vector< IT >::const_iterator miter = mask.ind.begin();
	while (miter < mask.ind.end())
	{
		IT index = *miter++;
		arr[index] = __unary_op(arr[index]);
	}
}
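// Example (sketch): this masked Apply transforms only the positions present in the sparse mask;
// the mask's values are ignored (hence IRRELEVANT_NT), only its indices matter.
//
//   x.Apply([](double v) { return -v; }, mask);   // negate x at the indices stored in 'mask'
//
// (mask stands for a FullyDistSpVec on the same CommGrid and of the same length as x.)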

template <class IT, class NT>
template <typename _BinaryOperation, typename _BinaryPredicate, class NT2>
void FullyDistVec<IT,NT>::EWiseApply(const FullyDistVec<IT,NT2> & other, _BinaryOperation __binary_op, _BinaryPredicate _do_op, const bool useExtendedBinOp)
{
	if(*(commGrid) == *(other.commGrid))
	{
		if(glen != other.glen)
		{
			std::ostringstream outs;
			outs << "Vector dimensions don't match (" << glen << " vs " << other.glen << ") for FullyDistVec::EWiseApply\n";
			SpParHelper::Print(outs.str());
			MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
		}
		else
		{
			typename std::vector< NT >::iterator thisIter = arr.begin();
			typename std::vector< NT2 >::const_iterator otherIter = other.arr.begin();
			while (thisIter < arr.end())
			{
				if (_do_op(*thisIter, *otherIter, false, false))
					*thisIter = __binary_op(*thisIter, *otherIter, false, false);
				thisIter++;
				otherIter++;
			}
		}
	}
	else
	{
		std::ostringstream outs;
		outs << "Grids are not comparable for FullyDistVec<IT,NT>::EWiseApply" << std::endl;
		SpParHelper::Print(outs.str());
		MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
	}
}


// Note (Ariful): the multithreaded version is implemented only when applyNulls=false.
// TODO: employ multithreading when applyNulls=true
template <class IT, class NT>
template <typename _BinaryOperation, typename _BinaryPredicate, class NT2>
void FullyDistVec<IT,NT>::EWiseApply(const FullyDistSpVec<IT,NT2> & other, _BinaryOperation __binary_op, _BinaryPredicate _do_op, bool applyNulls, NT2 nullValue, const bool useExtendedBinOp)
{
	if(*(commGrid) == *(other.commGrid))
	{
		if(glen != other.glen)
		{
			std::cerr << "Vector dimensions don't match (" << glen << " vs " << other.glen << ") for FullyDistVec::EWiseApply\n";
			MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
		}
		else
		{
			typename std::vector< IT >::const_iterator otherInd = other.ind.begin();
			typename std::vector< NT2 >::const_iterator otherNum = other.num.begin();

			if (applyNulls) // scan the entire dense vector and apply sparse elements as they appear
			{
				for(IT i=0; (unsigned)i < arr.size(); ++i)
				{
					if (otherInd == other.ind.end() || i < *otherInd)
					{
						if (_do_op(arr[i], nullValue, false, true))
							arr[i] = __binary_op(arr[i], nullValue, false, true);
					}
					else
					{
						if (_do_op(arr[i], *otherNum, false, false))
							arr[i] = __binary_op(arr[i], *otherNum, false, false);
						otherInd++;
						otherNum++;
					}
				}
			}
			else // scan the sparse vector only
			{
				/*
				for(otherInd = other.ind.begin(); otherInd < other.ind.end(); otherInd++, otherNum++)
				{
					if (_do_op(arr[*otherInd], *otherNum, false, false))
						arr[*otherInd] = __binary_op(arr[*otherInd], *otherNum, false, false);
				}*/

				IT spsize = other.ind.size();
#ifdef _OPENMP
#pragma omp parallel for
#endif
				for(IT i=0; i< spsize; i++)
				{
					if (_do_op(arr[other.ind[i]], other.num[i], false, false))
						arr[other.ind[i]] = __binary_op(arr[other.ind[i]], other.num[i], false, false);
				}
			}
		}
	}
	else
	{
		std::cout << "Grids are not comparable for elementwise apply" << std::endl;
		MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
	}
}


template <class IT, class NT>
FullyDistVec<IT, IT> FullyDistVec<IT, NT>::sort()
{
	MPI_Comm World = commGrid->GetWorld();
	FullyDistVec<IT,IT> temp(commGrid);
	IT nnz = LocArrSize();
	std::pair<NT,IT> * vecpair = new std::pair<NT,IT>[nnz];
	int nprocs = commGrid->GetSize();
	int rank = commGrid->GetRank();

	IT * dist = new IT[nprocs];
	dist[rank] = nnz;
	MPI_Allgather(MPI_IN_PLACE, 1, MPIType<IT>(), dist, 1, MPIType<IT>(), World);
	IT sizeuntil = LengthUntil();	// size = length, for dense vectors
	for(IT i=0; i< nnz; ++i)
	{
		vecpair[i].first = arr[i];	// we'll sort wrt numerical values
		vecpair[i].second = i + sizeuntil;
	}
	SpParHelper::MemoryEfficientPSort(vecpair, nnz, dist, World);

	std::vector< IT > narr(nnz);
	for(IT i=0; i< nnz; ++i)
	{
		arr[i] = vecpair[i].first;	// sorted range (change the object itself)
		narr[i] = vecpair[i].second;	// inverse permutation stored as numerical values
	}
	delete [] vecpair;
	delete [] dist;

	temp.glen = glen;
	temp.arr = narr;
	return temp;
}
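// Example (illustrative): sort() reorders *this globally in ascending order and returns, for each
// position of the sorted vector, the original global index of that entry (an inverse permutation).
//
//   combblas::FullyDistVec<int64_t, int64_t> origidx = x.sort();   // x is now globally sorted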


// Randomly permutes an already existing vector
template <class IT, class NT>
void FullyDistVec<IT,NT>::RandPerm()
{
#ifdef DETERMINISTIC
	uint64_t seed = 1383098845;
#else
	uint64_t seed= time(NULL);
#endif

	MTRand M(seed);	// generate random numbers with Mersenne Twister
	MPI_Comm World = commGrid->GetWorld();
	int nprocs = commGrid->GetSize();
	int rank = commGrid->GetRank();
	IT size = LocArrSize();

#ifdef COMBBLAS_LEGACY
	std::pair<double,NT> * vecpair = new std::pair<double,NT>[size];
	IT * dist = new IT[nprocs];
	dist[rank] = size;
	MPI_Allgather(MPI_IN_PLACE, 1, MPIType<IT>(), dist, 1, MPIType<IT>(), World);
	for(int i=0; i<size; ++i)
	{
		vecpair[i].first = M.rand();
		vecpair[i].second = arr[i];
	}
	// less< pair<T1,T2> > works correctly (sorts wrt first elements)
	SpParHelper::MemoryEfficientPSort(vecpair, size, dist, World);
	std::vector< NT > nnum(size);
	for(int i=0; i<size; ++i) nnum[i] = vecpair[i].second;
	DeleteAll(vecpair, dist);
	arr.swap(nnum);
#else
	std::vector< std::vector< NT > > data_send(nprocs);
	for(int i=0; i<size; ++i)
	{
		// send each entry to a random process
		uint32_t dest = M.randInt(nprocs-1);
		data_send[dest].push_back(arr[i]);
	}
	int * sendcnt = new int[nprocs];
	int * sdispls = new int[nprocs];
	for(int i=0; i<nprocs; ++i) sendcnt[i] = (int) data_send[i].size();

	int * rdispls = new int[nprocs];
	int * recvcnt = new int[nprocs];
	MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);  // share the request counts
	sdispls[0] = 0;
	rdispls[0] = 0;
	for(int i=0; i<nprocs-1; ++i)
	{
		sdispls[i+1] = sdispls[i] + sendcnt[i];
		rdispls[i+1] = rdispls[i] + recvcnt[i];
	}
	IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
	if(totrecv > std::numeric_limits<int>::max())
	{
		std::cout << "COMBBLAS_WARNING: total data to receive exceeds max int: " << totrecv << std::endl;
	}
	std::vector<NT>().swap(arr);  // make space for temporaries

	NT * sendbuf = new NT[size];
	for(int i=0; i<nprocs; ++i)
	{
		std::copy(data_send[i].begin(), data_send[i].end(), sendbuf+sdispls[i]);
		std::vector<NT>().swap(data_send[i]);	// free memory
	}
	NT * recvbuf = new NT[totrecv];
	MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPIType<NT>(), recvbuf, recvcnt, rdispls, MPIType<NT>(), World);
	//std::random_shuffle(recvbuf, recvbuf+ totrecv);
	std::default_random_engine gen(seed);
	std::shuffle(recvbuf, recvbuf+ totrecv, gen); // locally shuffle data

	int64_t * localcounts = new int64_t[nprocs];
	localcounts[rank] = totrecv;
	MPI_Allgather(MPI_IN_PLACE, 1, MPI_LONG_LONG, localcounts, 1, MPI_LONG_LONG, World);
	int64_t glenuntil = std::accumulate(localcounts, localcounts+rank, static_cast<int64_t>(0));

	std::vector< std::vector< IT > > locs_send(nprocs);
	for(IT i=0; i< totrecv; ++i)    // determine new locations w/ prefix sums
	{
		IT remotelocind;
		int owner = Owner(glenuntil+i, remotelocind);
		locs_send[owner].push_back(remotelocind);
		data_send[owner].push_back(recvbuf[i]);
	}

	for(int i=0; i<nprocs; ++i) sendcnt[i] = (int) data_send[i].size(); // = locs_send.size()
	MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
	sdispls[0] = 0;
	rdispls[0] = 0;
	for(int i=0; i<nprocs-1; ++i)
	{
		sdispls[i+1] = sdispls[i] + sendcnt[i];
		rdispls[i+1] = rdispls[i] + recvcnt[i];
	}
	IT newsize = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
	if(newsize > std::numeric_limits<int>::max())
	{
		std::cout << "COMBBLAS_WARNING: total data to receive exceeds max int: " << newsize << std::endl;
	}
	// re-use the receive buffer as sendbuf of the second stage
	IT totalsend = std::accumulate(sendcnt, sendcnt+nprocs, static_cast<IT>(0));
	if(totalsend != totrecv || newsize != size)
	{
		std::cout << "COMBBLAS_WARNING: sending different sized data than received: " << totalsend << " != " << totrecv << " , " << newsize << " != " << size << std::endl;
	}
	for(int i=0; i<nprocs; ++i)
	{
		std::copy(data_send[i].begin(), data_send[i].end(), recvbuf+sdispls[i]);
		std::vector<NT>().swap(data_send[i]);	// free memory
	}
	// re-use the send buffer as receive buffer of the second stage
	MPI_Alltoallv(recvbuf, sendcnt, sdispls, MPIType<NT>(), sendbuf, recvcnt, rdispls, MPIType<NT>(), World);
	delete [] recvbuf;
	IT * newinds = new IT[totalsend];
	for(int i=0; i<nprocs; ++i)
	{
		std::copy(locs_send[i].begin(), locs_send[i].end(), newinds+sdispls[i]);
		std::vector<IT>().swap(locs_send[i]);	// free memory
	}
	IT * indsbuf = new IT[size];
	MPI_Alltoallv(newinds, sendcnt, sdispls, MPIType<IT>(), indsbuf, recvcnt, rdispls, MPIType<IT>(), World);
	DeleteAll(newinds, sendcnt, sdispls, rdispls, recvcnt);
	arr.resize(size);
	for(IT i=0; i<size; ++i)
	{
		arr[indsbuf[i]] = sendbuf[i];
	}
	DeleteAll(sendbuf, indsbuf, localcounts);	// release the remaining temporaries
#endif
}

// ABAB: In its current form, unless LengthUntil returns NT
// this won't work reliably for anything other than integers
template <class IT, class NT>
void FullyDistVec<IT,NT>::iota(IT globalsize, NT first)
{
	glen = globalsize;
	IT length = MyLocLength();	// only needs glen to determine length

	arr.resize(length);
	SpHelper::iota(arr.begin(), arr.end(), LengthUntil() + first);	// global across processors
}
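// Example (sketch): iota fills the vector with consecutive values starting from 'first';
// combined with RandPerm it is a common way to build a random permutation of 0..n-1.
//
//   combblas::FullyDistVec<int64_t, int64_t> p(fullWorld);
//   p.iota(1000, 0);   // p = 0, 1, ..., 999 distributed across processors
//   p.RandPerm();      // p is now a random permutation of 0..999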

template <class IT, class NT>
FullyDistVec<IT,NT> FullyDistVec<IT,NT>::operator() (const FullyDistVec<IT,IT> & ri) const
{
	if(!(*commGrid == *ri.commGrid))
	{
		std::cout << "Grids are not comparable for dense vector subsref" << std::endl;
		return FullyDistVec<IT,NT>();
	}

	MPI_Comm World = commGrid->GetWorld();
	FullyDistVec<IT,NT> Indexed(commGrid, ri.glen, NT());	// length(Indexed) = length(ri)
	int nprocs = commGrid->GetSize();
	std::vector< std::vector< IT > > data_req(nprocs);
	std::vector< std::vector< IT > > revr_map(nprocs);	// to put the incoming data to the correct location

	IT riloclen = ri.LocArrSize();
	for(IT i=0; i < riloclen; ++i)
	{
		IT locind;
		int owner = Owner(ri.arr[i], locind);	// numerical values in ri are 0-based
		data_req[owner].push_back(locind);
		revr_map[owner].push_back(i);
	}
	IT * sendbuf = new IT[riloclen];
	int * sendcnt = new int[nprocs];
	int * sdispls = new int[nprocs];
	for(int i=0; i<nprocs; ++i)
		sendcnt[i] = (int) data_req[i].size();

	int * rdispls = new int[nprocs];
	int * recvcnt = new int[nprocs];
	MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);  // share the request counts
	sdispls[0] = 0;
	rdispls[0] = 0;
	for(int i=0; i<nprocs-1; ++i)
	{
		sdispls[i+1] = sdispls[i] + sendcnt[i];
		rdispls[i+1] = rdispls[i] + recvcnt[i];
	}
	IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
	for(int i=0; i<nprocs; ++i)
	{
		std::copy(data_req[i].begin(), data_req[i].end(), sendbuf+sdispls[i]);
		std::vector<IT>().swap(data_req[i]);
	}

	IT * reversemap = new IT[riloclen];
	for(int i=0; i<nprocs; ++i)
	{
		std::copy(revr_map[i].begin(), revr_map[i].end(), reversemap+sdispls[i]);	// reversemap array is unique
		std::vector<IT>().swap(revr_map[i]);
	}

	IT * recvbuf = new IT[totrecv];
	MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPIType<IT>(), recvbuf, recvcnt, rdispls, MPIType<IT>(), World);  // request data
	delete [] sendbuf;

	// We will return the requested data,
	// our return will be as big as the request
	// as we are indexing a dense vector, all elements exist
	// so the displacement boundaries are the same as rdispls
	NT * databack = new NT[totrecv];
	for(int i=0; i<nprocs; ++i)
	{
		for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)	// fetch the numerical values
		{
			databack[j] = arr[recvbuf[j]];
		}
	}
	delete [] recvbuf;
	NT * databuf = new NT[riloclen];

	// the response counts are the same as the request counts
	MPI_Alltoallv(databack, recvcnt, rdispls, MPIType<NT>(), databuf, sendcnt, sdispls, MPIType<NT>(), World);  // send data
	DeleteAll(rdispls, recvcnt, databack);

	// Now create the output from databuf
	// Indexed.arr is already allocated in the constructor
	for(int i=0; i<nprocs; ++i)
	{
		for(int j=sdispls[i]; j< sdispls[i]+sendcnt[i]; ++j)
		{
			Indexed.arr[reversemap[j]] = databuf[j];
		}
	}
	DeleteAll(sdispls, sendcnt, databuf,reversemap);
	return Indexed;
}
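// Example (illustrative): subsref gathers x at the global positions listed in a dense index
// vector, e.g. to apply the permutation p built with iota()/RandPerm() above.
//
//   combblas::FullyDistVec<int64_t, double> xp = x(p);   // xp[i] = x[p[i]], same length as p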


template <class IT, class NT>
void FullyDistVec<IT,NT>::PrintInfo(std::string vectorname) const
{
	IT totl = TotalLength();
	if (commGrid->GetRank() == 0)
		std::cout << "As a whole, " << vectorname << " has length " << totl << std::endl;
}



template <class IT, class NT>
void FullyDistVec<IT,NT>::Set(const FullyDistSpVec< IT,NT > & other)
{
    if(*(commGrid) == *(other.commGrid))
    {
        if(glen != other.glen)
        {
            std::cerr << "Vector dimensions don't match (" << glen << " vs " << other.glen << ") for FullyDistVec::Set\n";
            MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        }
        else
        {
            IT spvecsize = other.getlocnnz();
#ifdef _OPENMP
#pragma omp parallel for
#endif
            for(IT i=0; i< spvecsize; ++i)
            {
                arr[other.ind[i]] = other.num[i];
            }
        }
    }
    else
    {
        std::cout << "Grids are not comparable for Set" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
    }
}



// General purpose set operation on a dense vector by a sparse vector

template <class IT, class NT>
template <class NT1, typename _BinaryOperationIdx, typename _BinaryOperationVal>
void FullyDistVec<IT,NT>::GSet (const FullyDistSpVec<IT,NT1> & spVec, _BinaryOperationIdx __binopIdx, _BinaryOperationVal __binopVal, MPI_Win win)
{
    if(*(commGrid) != *(spVec.commGrid))
    {
        std::cout << "Grids are not comparable for GSet" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
    }

    IT spVecSize = spVec.getlocnnz();
    if(spVecSize==0) return;

    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    std::vector< std::vector< NT > > datsent(nprocs);
    std::vector< std::vector< IT > > indsent(nprocs);
    IT lengthUntil = spVec.LengthUntil();

    for(IT k=0; k < spVecSize; ++k)
    {
        IT locind;
        // get the global index into the dense vector from the value. Most often a select operator.
        // If the first operand is selected, then invert; otherwise, EwiseApply.
        IT globind = __binopIdx(spVec.num[k], spVec.ind[k] + lengthUntil);
        int owner = Owner(globind, locind); // get local index
        NT val = __binopVal(spVec.num[k], spVec.ind[k] + lengthUntil);
        if(globind < glen) // prevent indices beyond the size of the composed vector
        {
            datsent[owner].push_back(val);
            indsent[owner].push_back(locind);   // local indices are sent, so no correction is needed at the recipient
        }
    }

    for(size_t j = 0; j < datsent[myrank].size(); ++j)	// directly set local entries
    {
        arr[indsent[myrank][j]] = datsent[myrank][j];
    }

    //MPI_Win win;
    //MPI_Win_create(&arr[0], LocArrSize() * sizeof(NT), sizeof(NT), MPI_INFO_NULL, World, &win);
    //MPI_Win_fence(0, win);
    for(int i=0; i<nprocs; ++i)
    {
        if(i!=myrank)
        {
            MPI_Win_lock(MPI_LOCK_SHARED,i,MPI_MODE_NOCHECK,win);
            for(size_t j = 0; j < datsent[i].size(); ++j)
            {
                MPI_Put(&datsent[i][j], 1, MPIType<NT>(), i, indsent[i][j], 1, MPIType<NT>(), win);
            }
            MPI_Win_unlock(i, win);
        }
    }
    //MPI_Win_fence(0, win);
    //MPI_Win_free(&win);
}



// General purpose get operation on a dense vector by a sparse vector.
// Gets the element of the dense vector indexed by the value of the sparse vector.
// invert and get might not work in the presence of repeated values

template <class IT, class NT>
template <class NT1, typename _BinaryOperationIdx>
FullyDistSpVec<IT,NT> FullyDistVec<IT,NT>::GGet (const FullyDistSpVec<IT,NT1> & spVec, _BinaryOperationIdx __binopIdx, NT nullValue)
{
    if(*(commGrid) != *(spVec.commGrid))
    {
        std::cout << "Grids are not comparable for GGet" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
    }

    MPI_Comm World = commGrid->GetWorld();
    int nprocs = commGrid->GetSize();
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    std::vector< std::vector< NT > > spIdx(nprocs);
    std::vector< std::vector< IT > > indsent(nprocs);
    IT lengthUntil = spVec.LengthUntil();
    IT spVecSize = spVec.getlocnnz();

    FullyDistSpVec<IT, NT> res(spVec.commGrid, spVec.TotalLength());
    res.ind.resize(spVecSize);
    res.num.resize(spVecSize);

    for(IT k=0; k < spVecSize; ++k)
    {
        IT locind;
        // get the global index into the dense vector from the value. Most often a select operator.
        // If the first operand is selected, then invert; otherwise, EwiseApply.
        IT globind = __binopIdx(spVec.num[k], spVec.ind[k] + lengthUntil);
        int owner = Owner(globind, locind); // get local index
        //NT val = __binopVal(spVec.num[k], spVec.ind[k] + lengthUntil);
        if(globind < glen) // prevent indices beyond the size of the composed vector
        {
            spIdx[owner].push_back(k); // position in spVec
            indsent[owner].push_back(locind);   // local indices are sent, so no correction is needed at the recipient
        }
        else
            res.num[k] = nullValue;
        res.ind[k] = spVec.ind[k];
    }

    for(size_t j = 0; j < indsent[myrank].size(); ++j)	// directly get local entries
    {
        res.num[spIdx[myrank][j]] = arr[indsent[myrank][j]];
    }

    MPI_Win win;
    MPI_Win_create(&arr[0], LocArrSize() * sizeof(NT), sizeof(NT), MPI_INFO_NULL, World, &win);
    for(int i=0; i<nprocs; ++i)
    {
        if(i!=myrank)
        {
            MPI_Win_lock(MPI_LOCK_SHARED,i,MPI_MODE_NOCHECK,win);
            for(size_t j = 0; j < indsent[i].size(); ++j)
            {
                MPI_Get(&res.num[spIdx[i][j]], 1, MPIType<NT>(), i, indsent[i][j], 1, MPIType<NT>(), win);
            }
            MPI_Win_unlock(i, win);
        }
    }
    MPI_Win_free(&win);

    return res;
}

}