// SPDX-License-Identifier: Apache-2.0

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <random>

#include <cuda_runtime.h>

static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}

template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

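// Example usage (a minimal sketch; the cudaMalloc call below is illustrative,
// not part of this header): wrap any CUDA runtime call so a failure reports
// the offending expression, file, and line before exiting.
//
//   float *d_buf = nullptr;
//   checkCudaErrors( cudaMalloc((void**)&d_buf, 1024 * sizeof(float)) );
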
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)

inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line) {
  cudaError_t err = cudaGetLastError();

  if (cudaSuccess != err) {
    fprintf(stderr,
            "%s(%i) : getLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, static_cast<int>(err),
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// This will only print the proper error string when calling cudaGetLastError,
// but will not exit the program if an error is detected.
#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)

inline void __printLastCudaError(const char *errorMessage, const char *file,
                                 const int line) {
  cudaError_t err = cudaGetLastError();

  if (cudaSuccess != err) {
    fprintf(stderr,
            "%s(%i) : printLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, static_cast<int>(err),
            cudaGetErrorString(err));
  }
}
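
// Example usage (a minimal sketch; `my_kernel`, `grid`, `block`, and `args`
// are hypothetical, not defined here): kernel launches report errors only
// through cudaGetLastError, not through a return value.
//
//   my_kernel<<<grid, block>>>(args);
//   getLastCudaError("my_kernel launch failed");   // exits on error
//   printLastCudaError("my_kernel launch failed"); // reports but continues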
#define CHECK_CUDA(call) checkCudaErrors( call )

// Vector generators
template<typename T>
void fillvector_linear( int N, T *vec) {
    for (int i = 0; i < N; ++i) vec[i] = T(i);
}

template<typename T>
void fillvector_constant( int N, T *vec, T val) {
    for (int i = 0; i < N; ++i) vec[i] = val;
}

// Mix-in class to enable unified memory: any class derived from Managed is
// allocated with cudaMallocManaged, so the same pointer is valid on both the
// host and the device.
class Managed {
  public:
    void *operator new(size_t len) {
        void *ptr = nullptr;
        CHECK_CUDA( cudaMallocManaged( &ptr, len) );
        cudaDeviceSynchronize();   // make the allocation visible everywhere
        return ptr;
    }

    void operator delete(void *ptr) {
        cudaDeviceSynchronize();   // drain any kernels still using the memory
        CHECK_CUDA( cudaFree(ptr) );
    }
};

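// Example usage (a minimal sketch; `Point` is a hypothetical type, not part of
// this file): deriving from Managed routes `new`/`delete` through unified
// memory, so the object is reachable from host code and CUDA kernels alike.
//
//   struct Point : public Managed { float x, y; };
//   Point *p = new Point;   // lives in managed memory
//   p->x = 1.0f;            // host write, no explicit cudaMemcpy needed
//   // ... pass p to a kernel ...
//   delete p;
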
// Basic matrix container class, stored as CSC-style arrays: p holds column
// pointers, i holds row indices, and x holds values.
template<typename T>
class matrix : public Managed {
  public:
    uint64_t zombie_count = 0;
    int64_t vlen = 0;
    int64_t vdim = 0;
    int64_t nnz = 0;
    int64_t *p = nullptr;
    int64_t *h = nullptr;
    int64_t *i = nullptr;
    T *x = nullptr;
    bool is_filled = false;

    matrix() {}

    matrix( int64_t N, int64_t nvecs) {
        vlen = N;
        vdim = nvecs;
    }

    void set_zombie_count( uint64_t zc) { zombie_count = zc; }
    uint64_t get_zombie_count() { return zombie_count; }
    void add_zombie_count( int nz) { zombie_count += nz; }

    void clear() {
        if ( p != nullptr) { cudaFree(p); p = nullptr; }
        if ( h != nullptr) { cudaFree(h); h = nullptr; }
        if ( i != nullptr) { cudaFree(i); i = nullptr; }
        if ( x != nullptr) { cudaFree(x); x = nullptr; }
        is_filled = false;
        vlen = 0;
        vdim = 0;
        nnz = 0;
        zombie_count = 0;
    }

    // Allocate managed storage for N vectors holding Nz entries in total.
    void alloc( int64_t N, int64_t Nz) {
        // (alternative: one pooled allocation for p, i, and x)
        //cudaMallocManaged((void**)&p, (Nz+N+1)*sizeof(int64_t) + Nz*sizeof(T));
        //i = p + (N+1);
        //x = (T*)(p + (Nz+N+1));
        CHECK_CUDA( cudaMallocManaged((void**)&p, (N+1)*sizeof(int64_t)) );
        CHECK_CUDA( cudaMallocManaged((void**)&i, Nz*sizeof(int64_t)) );
        CHECK_CUDA( cudaMallocManaged((void**)&x, Nz*sizeof(T)) );
    }

    // Fill with a uniformly spread random pattern: Nz/N entries per column
    // (this assumes N divides Nz). Note that r is passed by value, so every
    // call starts from the same generator state.
    void fill_random( int64_t N, int64_t Nz, std::mt19937 r) {

        // index slots available per stored entry
        int64_t inv_sparsity = (N*N)/Nz;

        alloc( N, Nz);

        vdim = N;
        vlen = N;
        nnz = Nz;

        p[0] = 0;
        p[N] = nnz;

        for (int64_t j = 0; j < N; ++j) {
            p[j+1] = p[j] + Nz/N;
            for ( int k = p[j]; k < p[j+1]; ++k) {
                // strided placement plus a random offset keeps the row
                // indices sorted and inside [0,N)
                i[k] = (k-p[j])*inv_sparsity + r() % inv_sparsity;
                x[k] = (T) (k & 63);
            }
        }
        is_filled = true;
    }
};

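// Example usage (a minimal sketch): build a 1024x1024 matrix with 4096 entries,
// then walk it in CSC form (p = column pointers, i = row indices, x = values).
//
//   std::mt19937 gen(42);                 // hypothetical seed
//   matrix<float> *A = new matrix<float>; // managed memory via Managed
//   A->fill_random(1024, 4096, gen);
//   for (int64_t j = 0; j < A->vdim; ++j)
//       for (int64_t k = A->p[j]; k < A->p[j+1]; ++k)
//           { /* entry at row A->i[k], column j, with value A->x[k] */ }
//   A->clear(); delete A;
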

template< typename T_C, typename T_M, typename T_A, typename T_B>
class SpGEMM_problem_generator {

    float Anzpercent, Bnzpercent, Cnzpercent;
    int64_t Cnz;
    int64_t *Bucket = nullptr;
    int64_t BucketStart[13];   // bucket b spans BucketStart[b]..BucketStart[b+1]
    unsigned seed = 13372801;
    std::mt19937 r;            // Mersenne Twister random number generator
    bool ready = false;

  public:

    matrix<T_C> *C = nullptr;
    matrix<T_M> *M = nullptr;
    matrix<T_A> *A = nullptr;
    matrix<T_B> *B = nullptr;

    SpGEMM_problem_generator() {
        // Create the sparse matrices; Managed::operator new places each one
        // in unified memory.
        C = new matrix<T_C>;
        M = new matrix<T_M>;
        A = new matrix<T_A>;
        B = new matrix<T_B>;
        // (alternative: cudaMallocManaged each matrix directly and add
        // cudaMemAdvise hints such as cudaMemAdviseSetReadMostly)
    }

    matrix<T_C>* getCptr() { return C; }
    matrix<T_M>* getMptr() { return M; }
    matrix<T_A>* getAptr() { return A; }
    matrix<T_B>* getBptr() { return B; }

    int64_t* getBucket() { return Bucket; }
    int64_t* getBucketStart() { return BucketStart; }

    // Load C->i with column info to avoid another lookup later: each entry
    // stores its column index shifted left by 4 bits, leaving the low 4 bits
    // free for a bucket tag.
    void loadCj() {
        for (int c = 0; c < M->vdim; ++c) {
            for ( int r = M->p[c]; r < M->p[c+1]; ++r) {
                C->i[r] = c << 4;   // shift to store bucket info
            }
        }
    }

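    // Example (a minimal sketch): recovering the column index and bucket tag
    // packed into C->i by loadCj.
    //
    //   int64_t entry  = C->i[r];
    //   int64_t col    = entry >> 4;   // column index
    //   int64_t bucket = entry & 0xF;  // low 4 bits hold the bucket tag
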
    void init( int64_t N, int64_t Anz, int64_t Bnz, float Cnzpercent) {

        // Get sizes relative to fully dense matrices
        Anzpercent = float(Anz)/float(N*N);
        Bnzpercent = float(Bnz)/float(N*N);
        this->Cnzpercent = Cnzpercent;   // the parameter shadows the member
        Cnz = (int64_t)(Cnzpercent * N * N);
        std::cout<<"Anz% ="<<Anzpercent<<" Bnz% ="<<Bnzpercent<<" Cnz% ="<<Cnzpercent<<std::endl;

        // Seed the generator
        r.seed(seed);

        std::cout<<"filling matrices"<<std::endl;

        C->fill_random( N, Cnz, r);
        M->fill_random( N, Cnz, r);
        A->fill_random( N, Anz, r);
        B->fill_random( N, Bnz, r);

        std::cout<<"fill complete"<<std::endl;

        // C shares M's column pointers (assuming CSC here); free C's own
        // pointer array first so it is not leaked.
        CHECK_CUDA( cudaFree(C->p) );
        C->p = M->p;

        loadCj();
    }

    void del() {
        // C->p aliases M->p (set in init); null it out so the shared column
        // pointer array is freed exactly once, by M->clear().
        C->p = nullptr;
        C->clear();
        M->clear();
        A->clear();
        B->clear();
        if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) );
        delete C;
        delete M;
        delete A;
        delete B;
        CHECK_CUDA( cudaDeviceSynchronize() );
    }

    void fill_buckets( int fill_bucket) {

        std::cout<<Cnz<<" slots to fill"<<std::endl;

        if (fill_bucket == -1) {

            // Allocate Bucket space
            CHECK_CUDA( cudaMallocManaged((void**)&Bucket, Cnz*sizeof(int64_t)) );

            // Fill buckets with equal extents that sum to Cnz, and record the
            // boundaries in BucketStart
            BucketStart[0] = 0;
            BucketStart[12] = Cnz;
            for (int b = 1; b < 12; ++b) {
                BucketStart[b] = BucketStart[b-1] + (Cnz / 12);
                for (int j = BucketStart[b-1]; j < BucketStart[b]; ++j) {
                    Bucket[j] = b;
                }
            }
            // the remainder from the integer division lands in bucket 11
            int b = 11;
            for (int j = BucketStart[11]; j < BucketStart[12]; ++j) {
                Bucket[j] = b;
            }
        }
        else { // all pairs go to one test bucket
            Bucket = nullptr;
            BucketStart[0] = 0;
            BucketStart[12] = Cnz;
            for (int b = 0; b < 12; ++b) {
                if (b <= fill_bucket) BucketStart[b] = 0;
                if (b > fill_bucket) BucketStart[b] = Cnz;
            }
            std::cout<<"all pairs to bucket "<<fill_bucket<<", no filling"<<std::endl;
            std::cout<<"done assigning buckets"<<std::endl;
        }
    }
};
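
// Example usage (a minimal sketch; the sizes are hypothetical): typical driver
// flow for this problem generator.
//
//   SpGEMM_problem_generator<float,float,float,float> G;
//   G.init(1024, 4096, 4096, 0.01f);   // N, Anz, Bnz, Cnz as fraction of dense
//   G.fill_buckets(-1);                // spread the work across all 12 buckets
//   int64_t *start = G.getBucketStart();
//   for (int b = 0; b < 12; ++b) {
//       // entries start[b] .. start[b+1]-1 of C belong to bucket b
//   }
//   G.del();                           // free all matrices and the Bucket array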