1 // SPDX-License-Identifier: Apache-2.0
2 
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <random>
6 
// Translate a CUDA runtime status code into its symbolic enum name
// (e.g. "cudaErrorMemoryAllocation"), for use in diagnostics.
static const char *_cudaGetErrorEnum(cudaError_t err) {
  return cudaGetErrorName(err);
}
10 
// Abort the program with a diagnostic when a CUDA API call returns a
// non-success status.  Intended to be invoked via the checkCudaErrors()
// macro, which supplies the stringified call (func), file, and line.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (!result) {
    return;  // cudaSuccess == 0: nothing to report
  }
  fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
          static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
  exit(EXIT_FAILURE);
}
20 
// Wrap a CUDA API call: on a non-zero return status, print the stringified
// call with its location and exit (see check() above).
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
25 
// Check the sticky CUDA error state (set by the most recent failing call or
// kernel launch) and, if an error is pending, report it to stderr and exit.
// Invoked through the getLastCudaError() macro, which fills in file/line.
inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line) {
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    return;
  }

  fprintf(stderr,
          "%s(%i) : getLastCudaError() CUDA error :"
          " %s : (%d) %s.\n",
          file, line, errorMessage, static_cast<int>(status),
          cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}
39 
// This will only print the proper error string when calling cudaGetLastError,
// but it will not exit the program in case an error is detected.
#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)

// Like __getLastCudaError(), but report-only: prints any pending CUDA error
// to stderr and returns instead of exiting the program.
inline void __printLastCudaError(const char *errorMessage, const char *file,
                                 const int line) {
  cudaError_t err = cudaGetLastError();

  if (cudaSuccess != err) {
    // BUG FIX: the message previously said "getLastCudaError()", which
    // misattributed the report to the exiting variant of this helper
    fprintf(stderr,
            "%s(%i) : printLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, static_cast<int>(err),
            cudaGetErrorString(err));
  }
}
// Shorter alias for checkCudaErrors; both spellings are used in this file.
#define CHECK_CUDA(call) checkCudaErrors( call )
57 
58 //Vector generators
// Fill vec[0..N-1] with the ascending ramp 0, 1, ..., N-1 (converted to T).
// A non-positive N leaves vec untouched.
template<typename T>
void fillvector_linear( int N, T *vec) {
   int idx = 0;
   while (idx < N) {
      vec[idx] = T(idx);
      ++idx;
   }
}
// Set each of the first N elements of vec to val.
// A non-positive N leaves vec untouched.
template<typename T>
void fillvector_constant( int N, T *vec, T val) {
   for (int idx = N - 1; idx >= 0; --idx) {
      vec[idx] = val;
   }
}
67 
68 // Mix-in class to enable unified memory
// Mix-in to place derived objects in CUDA unified memory: `new` on a class
// deriving from Managed returns a pointer valid on both host and device.
class Managed {
public:
  void *operator new(size_t len) {
    void *ptr = nullptr;
    // C++ requires operator new to return a distinct pointer even for a
    // zero-byte request; cudaMallocManaged rejects a size of 0
    if (len == 0) len = 1;
    CHECK_CUDA( cudaMallocManaged( &ptr, len) );
    // ensure the allocation is usable before host-side construction begins
    // (was previously unchecked; a failure here would poison construction)
    CHECK_CUDA( cudaDeviceSynchronize() );
    return ptr;
  }

  void operator delete(void *ptr) {
    // best-effort drain of device work that may still touch this object;
    // intentionally unchecked so a sticky async error cannot abort cleanup
    cudaDeviceSynchronize();
    CHECK_CUDA( cudaFree(ptr) );
  }
};
86 
87 //Basic matrix container class
// Basic sparse matrix container (CSC-style) backed by CUDA unified memory.
// p holds vdim+1 column pointers, i holds nnz row indices, x holds nnz
// values.  h is only ever freed here, never allocated or read -- presumably
// reserved for a hypersparse column list; confirm against upstream usage.
template<typename T>
class matrix : public Managed {
  public:
    uint64_t zombie_count = 0;  // entries flagged for lazy deletion
     int64_t vlen = 0;          // vector length (rows)
     int64_t vdim = 0;          // vector dimension (columns)
     int64_t nnz  = 0;          // number of stored entries
     int64_t *p = nullptr;      // column pointers, vdim+1 entries
     int64_t *h = nullptr;      // unused in this file (see class comment)
     int64_t *i = nullptr;      // row indices, nnz entries
     T *x = nullptr;            // values, nnz entries
     bool is_filled = false;    // true once fill_random() has populated arrays

     matrix() {}

     matrix( int64_t N, int64_t nvecs) {
        vlen = N;
        vdim = nvecs;
     }

     void set_zombie_count( uint64_t zc) { zombie_count = zc; }
     uint64_t get_zombie_count() { return zombie_count; }
     void add_zombie_count( int nz) { zombie_count += nz; }

     // Release all device buffers and reset metadata to the empty state.
     // cudaFree errors are deliberately ignored: p may be aliased with
     // another matrix's p (see SpGEMM_problem_generator::init), so the
     // second owner's free can legitimately fail.
     void clear() {
        if ( p != nullptr){  cudaFree(p); p = nullptr; }
        if ( h != nullptr){  cudaFree(h); h = nullptr; }
        if ( i != nullptr){  cudaFree(i); i = nullptr; }
        if ( x != nullptr){  cudaFree(x); x = nullptr; }
        is_filled = false;
        vlen = 0;
        vdim = 0;
        nnz  = 0;
        zombie_count = 0;
     }

     // Allocate unified-memory buffers for N columns and Nz entries.
     // Buffers from a previous alloc() are released first so repeated calls
     // do not leak (previously they did).
     void alloc( int64_t N, int64_t Nz) {
        if ( p != nullptr){  cudaFree(p); p = nullptr; }
        if ( i != nullptr){  cudaFree(i); i = nullptr; }
        if ( x != nullptr){  cudaFree(x); x = nullptr; }
        CHECK_CUDA( cudaMallocManaged((void**)&p, (N+1)*sizeof(int64_t)) );
        CHECK_CUDA( cudaMallocManaged((void**)&i, Nz*sizeof(int64_t)) );
        CHECK_CUDA( cudaMallocManaged((void**)&x, Nz*sizeof(T)) );
     }

     // Populate an N x N matrix with ~Nz entries: Nz/N entries per column,
     // row indices strided across the column with random jitter, value k & 63.
     // NOTE(review): r is taken BY VALUE, so two calls passing the same
     // generator produce identical patterns -- init() appears to rely on this
     // when it aliases C->p = M->p; confirm before changing to a reference.
     void fill_random(  int64_t N, int64_t Nz, std::mt19937 r) {

        // index-space cells available per stored entry
        int64_t inv_sparsity = (N*N)/Nz;
        // guard: Nz > N*N would make inv_sparsity 0, and `% 0` below is UB
        if (inv_sparsity < 1) inv_sparsity = 1;

        alloc( N, Nz);

        vdim = N;
        vlen = N;
        nnz = Nz;

        p[0] = 0;
        p[N] = nnz;

        for (int64_t j = 0; j < N; ++j) {
           // integer division: when N does not divide Nz this loop leaves
           // p[N] == N*(Nz/N), which is smaller than the nnz written above
           p[j+1] = p[j] + Nz/N;
           // int64_t k (was int): avoids overflow once Nz exceeds INT_MAX
           for ( int64_t k = p[j] ; k < p[j+1]; ++k) {
               i[k] = (k-p[j])*inv_sparsity +  r() % inv_sparsity;
               x[k] = (T) (k & 63) ;
           }
        }
        is_filled = true;
     }
};
165 
166 
167 
// Generates a matched set of sparse matrices (C, M, A, B) for exercising
// masked sparse-times-sparse (SpGEMM) dot-product kernels, plus the bucket
// table that routes each C entry to one of 12 kernel variants.
template< typename T_C, typename T_M, typename T_A, typename T_B>
class SpGEMM_problem_generator {

    float Anzpercent,Bnzpercent,Cnzpercent;  // fill fractions relative to dense N*N
    int64_t Cnz;                             // entry count of C (and of M)
    int64_t *Bucket = nullptr;               // per-entry bucket id, Cnz entries (managed)
    int64_t BucketStart[13];                 // start offset of each of the 12 buckets
    unsigned seed = 13372801;
    std::mt19937 r; //random number generator Mersenne Twister
    bool ready = false;

  public:

    matrix<T_C> *C= nullptr;
    matrix<T_M> *M= nullptr;
    matrix<T_A> *A= nullptr;
    matrix<T_B> *B= nullptr;

    // Create the four (empty) matrices; they live in unified memory because
    // matrix derives from Managed.
    SpGEMM_problem_generator() {
       C = new matrix<T_C>;
       M = new matrix<T_M>;
       A = new matrix<T_A>;
       B = new matrix<T_B>;
    }

    matrix<T_C>* getCptr(){ return C;}
    matrix<T_M>* getMptr(){ return M;}
    matrix<T_A>* getAptr(){ return A;}
    matrix<T_B>* getBptr(){ return B;}

    int64_t* getBucket() { return Bucket;}
    int64_t* getBucketStart(){ return BucketStart;}

    // Load C->i with its column index so kernels avoid another lookup; the
    // value is shifted left 4 bits to leave room for a bucket id.
    void loadCj() {
       for (int c = 0 ; c< M->vdim; ++c) {
           for ( int r = M->p[c]; r< M->p[c+1]; ++r){
               C->i[r] = c << 4 ; //shift to store bucket info
           }
       }
    }

    // Fill all four matrices with random patterns.  Anz/Bnz are absolute
    // entry counts; Cnzpercent is a fraction of the dense N*N size.
    void init( int64_t N , int64_t Anz, int64_t Bnz, float Cnzpercent){

       // Get sizes relative to fully dense matrices
       Anzpercent = float(Anz)/float(N*N);
       Bnzpercent = float(Bnz)/float(N*N);
       // BUG FIX: was `Cnzpercent = Cnzpercent;` -- a parameter
       // self-assignment that left the member uninitialized
       this->Cnzpercent = Cnzpercent;
       Cnz = (int64_t)(Cnzpercent * N * N);
       std::cout<<"Anz% ="<<Anzpercent<<" Bnz% ="<<Bnzpercent<<" Cnz% ="<<Cnzpercent<<std::endl;

       //Seed the generator
       r.seed(seed);

       std::cout<<"filling matrices"<<std::endl;

       // fill_random takes r by value, so C and M see the same generator
       // state and get identical patterns -- required for the aliasing below
       C->fill_random( N, Cnz, r);
       M->fill_random( N, Cnz, r);
       A->fill_random( N, Anz, r);
       B->fill_random( N, Bnz, r);

       std::cout<<"fill complete"<<std::endl;

       // C and M share column pointers (assuming CSC here); free C's own p
       // first so it is not leaked by the aliasing (previously it was)
       if (C->p != nullptr && C->p != M->p) cudaFree(C->p);
       C->p = M->p;

       loadCj();
    }

    // Release everything created by init() / fill_buckets().  Safe to call
    // more than once.
    void del(){
       // C->p aliases M->p (set in init); drop C's copy so the shared buffer
       // is freed exactly once, by M->clear() (previously a double free)
       if (C->p == M->p) C->p = nullptr;
       C->clear();
       M->clear();
       A->clear();
       B->clear();
       if (Bucket != nullptr) {
           CHECK_CUDA( cudaFree(Bucket) );
           Bucket = nullptr;
       }
       delete C; C = nullptr;
       delete M; M = nullptr;
       delete A; A = nullptr;
       delete B; B = nullptr;
       CHECK_CUDA( cudaDeviceSynchronize() );
    }

    // Assign each of the Cnz entries to one of 12 buckets.
    // fill_bucket == -1 spreads entries roughly evenly over buckets 1..11;
    // any other value routes every entry to that single bucket.
    void fill_buckets( int fill_bucket){

       std::cout<<Cnz<<" slots to fill"<<std::endl;

       if (fill_bucket == -1){

           // Allocate Bucket space, releasing any table from a previous call
           // instead of leaking it
           if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) );
           CHECK_CUDA( cudaMallocManaged((void**)&Bucket, Cnz*sizeof(int64_t)) );

           // Fill buckets with extents that sum to Cnz, set BucketStart.
           // NOTE(review): bucket id 0 is never assigned -- entries in
           // [BucketStart[0], BucketStart[1]) get id 1; confirm intended.
           BucketStart[0] = 0;
           BucketStart[12] = Cnz;
           for (int b = 1; b < 12; ++b){
              BucketStart[b] = BucketStart[b-1] + (Cnz / 12);
              for (int64_t j = BucketStart[b-1]; j < BucketStart[b]; ++j) {
                Bucket[j] = b ;
              }
           }
           // the remainder of the integer division all lands in bucket 11
           int b = 11;
           for (int64_t j = BucketStart[11]; j < BucketStart[12]; ++j) {
                Bucket[j] = b ;
           }
       }
       else {// all in one test bucket
           // no per-entry table needed; free any table from a previous call
           // instead of leaking it
           if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) );
           Bucket = nullptr;
           BucketStart[0] = 0;
           BucketStart[12] = Cnz;
           for (int b= 0; b<12; ++b){
              if (b <= fill_bucket) BucketStart[b] = 0;
              if (b  > fill_bucket) BucketStart[b] = Cnz;
           }
           std::cout<<"all pairs to bucket "<<fill_bucket<<", no filling"<<std::endl;
           std::cout<<"done assigning buckets"<<std::endl;
       }
    }
};
305 
306 
307