/****************************************************************/
/* Parallel Combinatorial BLAS Library (for Graph Computations) */
/* version 1.4 -------------------------------------------------*/
/* date: 1/17/2014 ---------------------------------------------*/
/* authors: Aydin Buluc (abuluc@lbl.gov), Adam Lugowski --------*/
/****************************************************************/
/*
 Copyright (c) 2010-2014, The Regents of the University of California

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

#include <mpi.h>

#include "SpParMat.h"
#include "ParFriends.h"
#include "Operations.h"

#ifndef GRAPH_GENERATOR_SEQ
#define GRAPH_GENERATOR_SEQ
#endif

#include "graph500/generator/graph_generator.h"
#include "graph500/generator/utils.h"
#include "RefGen21.h"

#include <fstream>
#include <sstream>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <ctime>

namespace combblas {

template <typename IT>
DistEdgeList<IT>::DistEdgeList(): edges(NULL), pedges(NULL), nedges(0), globalV(0)
{
	commGrid.reset(new CommGrid(MPI_COMM_WORLD, 0, 0));
}

template <typename IT>
DistEdgeList<IT>::DistEdgeList(MPI_Comm & myWorld): edges(NULL), pedges(NULL), nedges(0), globalV(0)
{
	commGrid.reset(new CommGrid(myWorld, 0, 0));
}

template <typename IT>
DistEdgeList<IT>::DistEdgeList(const char * filename, IT globaln, IT globalm): edges(NULL), pedges(NULL), globalV(globaln)
{
	commGrid.reset(new CommGrid(MPI_COMM_WORLD, 0, 0));

	int nprocs = commGrid->GetSize();
	int rank = commGrid->GetRank();
	nedges = (rank == nprocs-1) ? (globalm - rank * (globalm / nprocs)) : (globalm / nprocs);

	FILE * infp = fopen(filename, "rb");
	assert(infp != NULL);
	IT read_offset_start, read_offset_end;
	read_offset_start = rank * 8 * (globalm / nprocs);	// 8 bytes per edge: two 32-bit endpoints
	read_offset_end = (rank+1) * 8 * (globalm / nprocs);
	if (rank == nprocs - 1)
		read_offset_end = 8*globalm;

	std::ofstream oput;
	commGrid->OpenDebugFile("BinRead", oput);
	if(infp != NULL)
	{
		oput << "File exists" << std::endl;
		oput << "Trying to read " << nedges << " edges out of " << globalm << std::endl;

		// gen_edges is an array of unsigned ints of size 2*nedges
		uint32_t * gen_edges = new uint32_t[2*nedges];
		fseek(infp, read_offset_start, SEEK_SET);
		fread(gen_edges, sizeof(uint32_t), 2*nedges, infp);	// read this rank's slice of 32-bit endpoints
		fclose(infp);
		SetMemSize(nedges);
		oput << "Freads done " << std::endl;
		for(IT i=0; i< 2*nedges; ++i)
			edges[i] = (IT) gen_edges[i];
		oput << "Puts done " << std::endl;
		delete [] gen_edges;
	}
	else
	{
		oput << "File does not exist" << std::endl;
	}
	oput.close();
}
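
// Usage sketch for the binary constructor above (illustrative only; the file name
// and the globaln/globalm values are placeholders, and the on-disk layout is
// inferred from the read code: a raw native-byte-order stream of 32-bit endpoints,
// 8 bytes per (u,v) pair, sliced contiguously across ranks):
//
//	DistEdgeList<int64_t> DEL("edges32.bin", globaln, globalm);
//
// Each rank reads globalm/nprocs edges starting at its own offset; the last rank
// also picks up the remainder.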

#if 0
// Adam:
// commenting out because there is a compiler error with MPI_File_open.

template <typename IT>
void DistEdgeList<IT>::Dump64bit(std::string filename)
{
	int rank, nprocs;
	MPI_Comm World = commGrid->GetWorld();
	MPI_Comm_rank(World, &rank);
	MPI_Comm_size(World, &nprocs);
	MPI_File thefile;
	MPI_File_open(World, filename.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &thefile);

	IT * prelens = new IT[nprocs];
	prelens[rank] = 2*nedges;
	MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), prelens, 1, MPIType<IT>(), commGrid->GetWorld());
	IT lengthuntil = std::accumulate(prelens, prelens+rank, static_cast<IT>(0));

	// The disp displacement argument specifies the position
	// (absolute offset in bytes from the beginning of the file)
	MPI_File_set_view(thefile, int64_t(lengthuntil * sizeof(IT)), MPIType<IT>(), MPIType<IT>(), "native", MPI_INFO_NULL);
	MPI_File_write(thefile, edges, prelens[rank], MPIType<IT>(), MPI_STATUS_IGNORE);
	MPI_File_close(&thefile);
	delete [] prelens;
}

template <typename IT>
void DistEdgeList<IT>::Dump32bit(std::string filename)
{
	int rank, nprocs;
	MPI_Comm World = commGrid->GetWorld();
	MPI_Comm_rank(World, &rank);
	MPI_Comm_size(World, &nprocs);
	MPI_File thefile;
	MPI_File_open(World, filename.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &thefile);

	IT * prelens = new IT[nprocs];
	prelens[rank] = 2*nedges;
	MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), prelens, 1, MPIType<IT>(), commGrid->GetWorld());
	IT lengthuntil = std::accumulate(prelens, prelens+rank, static_cast<IT>(0));

	// The disp displacement argument specifies the position
	// (absolute offset in bytes from the beginning of the file)
	MPI_File_set_view(thefile, int64_t(lengthuntil * sizeof(uint32_t)), MPI_UNSIGNED, MPI_UNSIGNED, "native", MPI_INFO_NULL);
	uint32_t * gen_edges = new uint32_t[prelens[rank]];
	for(IT i=0; i< prelens[rank]; ++i)
		gen_edges[i] = (uint32_t) edges[i];

	MPI_File_write(thefile, gen_edges, prelens[rank], MPI_UNSIGNED, MPI_STATUS_IGNORE);
	MPI_File_close(&thefile);

	delete [] prelens;
	delete [] gen_edges;
}
#endif

template <typename IT>
DistEdgeList<IT>::~DistEdgeList()
{
	if(edges) delete [] edges;
	if(pedges) delete [] pedges;
}

//! Allocates enough space for ne edges (i.e., 2*ne vertex indices)
template <typename IT>
void DistEdgeList<IT>::SetMemSize(IT ne)
{
	if (edges)
	{
		delete [] edges;
		edges = NULL;
	}

	memedges = ne;

	if (memedges > 0)
		edges = new IT[2*memedges];
}

/** Removes all edges that begin with a -1.
 * Walks back from the end to tighten the nedges counter, then walks forward and replaces any edge
 * with a -1 source with the last edge.
 */
template <typename IT>
void DistEdgeList<IT>::CleanupEmpties()
{
	// find out how many edges there actually are
	while (nedges > 0 && edges[2*(nedges-1) + 0] == -1)
	{
		nedges--;
	}

	// remove marked multiplicities or self-loops
	for (IT i = 0; i < (nedges-1); i++)
	{
		if (edges[2*i + 0] == -1)
		{
			// the graph500 generator marked this edge as a self-loop or a multiple edge.
			// swap it with the last edge
			edges[2*i + 0] = edges[2*(nedges-1) + 0];
			edges[2*i + 1] = edges[2*(nedges-1) + 1];
			edges[2*(nedges-1) + 0] = -1;	// mark this spot as unused

			while (nedges > 0 && edges[2*(nedges-1) + 0] == -1)	// the swapped edge might be -1 too
				nedges--;
		}
	}
}
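
// Worked example of the compaction above (sources only): with nedges = 5 and
// sources {3, -1, 7, 2, -1}, the trailing -1 first shrinks nedges to 4; then
// index 1 is overwritten with the last valid edge (source 2) and nedges drops
// to 3, leaving sources {3, 2, 7}.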

/**
 * Generates an edge list for an R-MAT matrix suitable for the Graph500 benchmark.
 * Note that GenGraph500Data will return global vertex numbers (from 1... N). The ith edge can be
 * accessed with edges[2*i] and edges[2*i+1]. There will be duplicates and the data won't be sorted.
 */
template <typename IT>
void DistEdgeList<IT>::GenGraph500Data(double initiator[4], int log_numverts, int edgefactor, bool scramble, bool packed)
{
	if(packed && (!scramble))
	{
		SpParHelper::Print("WARNING: Packed version always generates scrambled vertex identifiers\n");
	}

	globalV = ((int64_t) 1) << log_numverts;
	int64_t globaledges = globalV * static_cast<int64_t>(edgefactor);

	if(packed)
	{
		RefGen21::make_graph(log_numverts, globaledges, &nedges, (packed_edge**)(&pedges), commGrid->GetWorld());
	}
	else
	{
		// The generations use different seeds on different processors, generating independent
		// local RMAT matrices all having vertex ranges [0,...,globalmax-1].
		// Spread the two 64-bit numbers into five nonzero values in the correct range.
		uint_fast32_t seed[5];

		uint64_t size = (uint64_t) commGrid->GetSize();
		uint64_t rank = (uint64_t) commGrid->GetRank();
	#ifdef DETERMINISTIC
		uint64_t seed2 = 2;
	#else
		uint64_t seed2 = time(NULL);
	#endif
		make_mrg_seed(rank, seed2, seed);	// rank is the first seed, so the stream differs across processors

		// a single pair of [val0,val1] for all the computation, global across all processors
		uint64_t val0, val1; /* Values for scrambling */
		if(scramble)
		{
			if(rank == 0)
				RefGen21::MakeScrambleValues(val0, val1, seed);	// ignore the return value

			MPI_Bcast(&val0, 1, MPIType<uint64_t>(), 0, commGrid->GetWorld());
			MPI_Bcast(&val1, 1, MPIType<uint64_t>(), 0, commGrid->GetWorld());
		}

		nedges = globaledges/size;
		SetMemSize(nedges);
		// clear the source vertex by setting it to -1
		for (IT i = 0; i < nedges; i++)
			edges[2*i+0] = -1;

		generate_kronecker(0, 1, seed, log_numverts, nedges, initiator, edges);
		if(scramble)
		{
			for(IT i=0; i < nedges; ++i)
			{
				edges[2*i+0] = RefGen21::scramble(edges[2*i+0], log_numverts, val0, val1);
				edges[2*i+1] = RefGen21::scramble(edges[2*i+1], log_numverts, val0, val1);
			}
		}
	}
}
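
// Usage sketch (illustrative only; the initiator probabilities below are the
// standard Graph500 R-MAT parameters, and the scale/edgefactor values are the
// usual benchmark defaults rather than anything this file prescribes):
//
//	double initiator[4] = {0.57, 0.19, 0.19, 0.05};
//	DistEdgeList<int64_t> DEL;
//	DEL.GenGraph500Data(initiator, 20, 16, true, true);	// scale 20, edge factor 16
//
// With packed=true the result lands in pedges (an array of packed_edge structs);
// the unpacked path fills edges[2*i] and edges[2*i+1] instead.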

/**
 * Randomly permutes the distributed edge list.
 * Once we call Viral's psort on this vector, everything will go to the right place [tuples are
 * sorted lexicographically] and you can reconstruct the int64_t * edges in an embarrassingly parallel way.
 * As I understood it, the entire purpose of this function is to destroy any locality. It does not
 * rename any vertices, and edges are not named anyway.
 * For example, think about the edge (0,1). It will eventually (at the end of kernel 1) be owned by processor P(0,0).
 * However, assume that processor P(r1,c1) has a copy of it before the call to PermEdges. After
 * this call, some other unrelated processor P(r2,c2) will own it. So we gained nothing; it is just a scrambled egg.
 **/
template <typename IT>
void PermEdges(DistEdgeList<IT> & DEL)
{
	IT maxedges = DEL.memedges;	// this can be optimized by calling the clean-up first

	// to lower memory consumption, rename in stages
	// this is not "identical" to a full randomization;
	// but more than enough to destroy any possible locality
	IT stages = 8;
	IT perstage = maxedges / stages;

	int nproc = (DEL.commGrid)->GetSize();
	int rank = (DEL.commGrid)->GetRank();
	IT * dist = new IT[nproc];

#ifdef DETERMINISTIC
	MTRand M(1);
#else
	MTRand M;	// generate random numbers with Mersenne Twister
#endif
	for(IT s=0; s< stages; ++s)
	{
		#ifdef DEBUG
		SpParHelper::Print("PermEdges stage starting\n");
		double st = MPI_Wtime();
		#endif

		IT n_sofar = s*perstage;
		IT n_thisstage = ((s==(stages-1))? (maxedges - n_sofar): perstage);

		std::pair<double, std::pair<IT,IT> >* vecpair = new std::pair<double, std::pair<IT,IT> >[n_thisstage];
		dist[rank] = n_thisstage;
		MPI_Allgather(MPI_IN_PLACE, 1, MPIType<IT>(), dist, 1, MPIType<IT>(), DEL.commGrid->GetWorld());

		for (IT i = 0; i < n_thisstage; i++)
		{
			vecpair[i].first = M.rand();
			vecpair[i].second.first = DEL.edges[2*(i+n_sofar)];
			vecpair[i].second.second = DEL.edges[2*(i+n_sofar)+1];
		}

		// less< pair<T1,T2> > works correctly (sorts w.r.t. first element of type T1)
		SpParHelper::MemoryEfficientPSort(vecpair, n_thisstage, dist, DEL.commGrid->GetWorld());
		// SpParHelper::DebugPrintKeys(vecpair, n_thisstage, dist, DEL.commGrid->GetWorld());
		for (IT i = 0; i < n_thisstage; i++)
		{
			DEL.edges[2*(i+n_sofar)] = vecpair[i].second.first;
			DEL.edges[2*(i+n_sofar)+1] = vecpair[i].second.second;
		}
		delete [] vecpair;
		#ifdef DEBUG
		double et = MPI_Wtime();
		std::ostringstream timeinfo;
		timeinfo << "Stage " << s << " in " << et-st << " seconds" << std::endl;
		SpParHelper::Print(timeinfo.str());
		#endif
	}
	delete [] dist;
}
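
// Usage sketch (illustrative): in a typical Graph500 kernel-1 driver, PermEdges
// is called on the unpacked edge list right after generation and before renaming,
// e.g.
//
//	DistEdgeList<int64_t> DEL;
//	DEL.GenGraph500Data(initiator, 20, 16, /*scramble=*/false, /*packed=*/false);
//	PermEdges(DEL);		// destroy any locality left by the generator
//	RenameVertices(DEL);	// relabel endpoints with a random global permutation
//
// (The exact driver sequence varies; this mirrors only the unpacked code path.)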

/**
 * Rename vertices globally.
 * You first need to create a random permutation distributed over all processors.
 * Then the p-round round-robin algorithm does the renaming:
 * For all processors P(i,i)
 *          Broadcast local_p to all p processors
 *          For j = i*N/p to min((i+1)*N/p, N)
 *                    Rename all j's with local_p(j) inside the edge list (and mark them
 *                    "renamed" so that each vertex id is renamed only once)
 **/
template <typename IU>
void RenameVertices(DistEdgeList<IU> & DEL)
{
	int nprocs = DEL.commGrid->GetSize();
	int rank = DEL.commGrid->GetRank();
	MPI_Comm World = DEL.commGrid->GetWorld();

	// create permutation
	FullyDistVec<IU, IU> globalPerm(DEL.commGrid);
	globalPerm.iota(DEL.getGlobalV(), 0);
	globalPerm.RandPerm();	// now, randperm can return a 0-based permutation
	IU locrows = globalPerm.MyLocLength();

	// way to mark whether each vertex was already renamed or not
	IU locedgelist = 2*DEL.getNumLocalEdges();
	bool* renamed = new bool[locedgelist];
	std::fill_n(renamed, locedgelist, false);

	// permutation for one round
	IU * localPerm = NULL;
	IU permsize;
	IU startInd = 0;

	//vector < pair<IU, IU> > vec;
	//for(IU i=0; i< DEL.getNumLocalEdges(); i++)
	//	vec.push_back(make_pair(DEL.edges[2*i], DEL.edges[2*i+1]));
	//sort(vec.begin(), vec.end());
	//vector < pair<IU, IU> > uniqued;
	//unique_copy(vec.begin(), vec.end(), back_inserter(uniqued));
	//cout << "before: " << vec.size() << " and after: " << uniqued.size() << endl;

	for (int round = 0; round < nprocs; round++)
	{
		// broadcast the permutation chunk from the processor that owns this round
		if (rank == round)
		{
			permsize = locrows;
			localPerm = new IU[permsize];
			std::copy(globalPerm.arr.begin(), globalPerm.arr.end(), localPerm);
		}
		MPI_Bcast(&permsize, 1, MPIType<IU>(), round, World);
		if(rank != round)
		{
			localPerm = new IU[permsize];
		}
		MPI_Bcast(localPerm, permsize, MPIType<IU>(), round, World);

		// iterate over the local edge list (one endpoint at a time)
		for (IU j = 0; j < locedgelist; j++)
		{
			// We are renaming vertices, not edges
			if (startInd <= DEL.edges[j] && DEL.edges[j] < (startInd + permsize) && !renamed[j])
			{
				DEL.edges[j] = localPerm[DEL.edges[j]-startInd];
				renamed[j] = true;
			}
		}
		startInd += permsize;
		delete [] localPerm;
	}
	delete [] renamed;
}
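
// Usage sketch (illustrative; the DistEdgeList-based SpParMat constructor and its
// second "remove loops" argument are assumed from typical CombBLAS Graph500 drivers):
//
//	RenameVertices(DEL);
//	SpParMat< int64_t, bool, SpDCCols<int64_t, bool> > A(DEL, false);
//
// After renaming, edge endpoints are (pseudo)uniformly distributed over
// [0, globalV), so the resulting 2D block distribution is balanced in expectation.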

} // namespace combblas