1 // SPDX-License-Identifier: Apache-2.0
2 
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>   // std::numeric_limits — semiring identities below
#include <random>
#include "jitFactory.hpp"
#include "../binary_search.h"
#include "GpuTimer.h"
#include "gtest/gtest.h"
11 
// CPU reference operators used to validate GPU semiring results on the host.
template<typename T> T myOP_plus  (T lhs, T rhs) { return lhs + rhs; }
template<typename T> T myOP_min   (T lhs, T rhs) { if (lhs < rhs) return lhs; return rhs; }
template<typename T> T myOP_max   (T lhs, T rhs) { if (rhs < lhs) return lhs; return rhs; }
template<typename T> T myOP_first (T lhs, T rhs) { return lhs; }
template<typename T> T myOP_second(T lhs, T rhs) { return rhs; }
template<typename T> T myOP_times (T lhs, T rhs) { return lhs * rhs; }
19 
// Function-pointer variable templates used by the CPU reference computation.
// Each test assigns ADD_ptr/MUL_ptr to the myOP_* operators matching the
// requested SEMI_RING string before running the host-side dot products.
template<typename T> T (*myOpPTR)(T a, T b);
template<typename T> T (*ADD_ptr)(T a, T b);
template<typename T> T (*MUL_ptr)(T a, T b);
23 
24 
//Test generators using jitify
// Forward declarations of the templated test drivers. Template parameters
// name the GraphBLAS entry types: T_C/T_M/T_A/T_B are the C, Mask, A and B
// matrix types; T_X/T_Y/T_Z are the semiring input/output types.
template <typename T>
bool test_reducefactoryUM( unsigned int N, std::string OP) ;

template <typename T1,typename T2,typename T3>
bool test_dndotfactoryUM( unsigned int N, std::string SEMI_RING) ;

template <typename T1,typename T2,typename T3>
bool test_spdotfactoryUM( unsigned int N, unsigned int xn, unsigned int yn, std::string SEMI_RING) ;

//AxB_dot3_phase1 kernels (bucket counting)
template <typename T_C, typename T_M, typename T_A,typename T_B>
bool test_AxB_dot3_phase1_factory( int64_t , int64_t , int64_t , int64_t ) ;

//AxB_dot3_phase2 kernels (bucket assembly)
template <typename T_C>
bool test_AxB_dot3_phase2_factory( int , int64_t , int64_t , int64_t, int64_t ) ;

template <typename T_C>
bool test_AxB_dot3_phase2end_factory( int , int64_t , int64_t , int64_t ) ;

//AxB_dot3_phase3 kernels — one driver per sparsity-pattern specialization
// (dense-dense, v.sparse-v.sparse, sparse-dense, v.sparse-sparse,
//  merge-path, warp-per-dot).
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_dndn_factory( int , int64_t , int64_t , int64_t , std::string&) ;

template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vsvs_factory( int , int64_t , int64_t , int64_t , std::string&) ;

template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_spdn_factory( int , int64_t , int64_t , int64_t , std::string&) ;

template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vssp_factory( int , int64_t , int64_t , int64_t , std::string&) ;

template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_mp_factory( int , int64_t , int64_t , int64_t , std::string&) ;

template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_warp_factory( int , int64_t , int64_t , int64_t , std::string&) ;
64 
65 
//Fixture to generate valid inputs and hold them for tests
// NOTE: the original was missing the trailing semicolon after the class
// body, which is a compile error. SetUp/TearDown are intentionally empty
// for now; per-test state can be added here later.
class AxB_dot3_Test : public ::testing::Test
{
   void SetUp() override
   {
   }

   void TearDown() override
   {
   }
};
81 
// Test generator code, to allow parameterized tests
// Uses jitFactory, dataFactory and GB_jit
template <typename T_C, typename T_M, typename T_A,typename T_B>
bool test_AxB_dot3_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
// Smoke test for the AxB_dot3 phase1 (nanobucket counting) jit kernel on a
// dense NxN problem whose entries are all routed to bucket TB.
// Returns true once the kernel launch has been issued and timed.

int gpuID;
cudaGetDevice( &gpuID);

std::cout<< "found device "<<gpuID<<std::endl;

// NOTE: `p1lF()` in the original was a most-vexing-parse function
// declaration, not an object; declare the launch factory without parens.
phase1launchFactory<T_C, T_M, T_A, T_B> p1lF;

int testBucket = TB;   // was referenced but never defined in the original

SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Annz = N*N;
int64_t Bnnz = N*N;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Annz, Bnnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

int64_t *Bucket = G.getBucket();   // was undefined in the original
// TODO(review): nanobuckets is never allocated in this harness; allocate
// the per-block nanobucket counts before trusting the kernel's output.
int64_t *nanobuckets = nullptr;

       int nthrd = 32;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       p1lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket,
                                C, M, A, B);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       return true;

}
130 
template <typename T_C, typename T_M, typename T_A,typename T_B>
bool test_AxB_dot3_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
// Smoke test for the AxB_dot3 phase2 (bucket assembly) jit kernel on a
// dense NxN problem whose entries are all routed to bucket TB.
// Returns true once the kernel launch has been issued and timed.

int gpuID;
cudaGetDevice( &gpuID);

std::cout<< "found device "<<gpuID<<std::endl;

// NOTE: `p2lF()` in the original was a most-vexing-parse function
// declaration; also the original then launched a nonexistent `p1lF`.
// Template arguments follow the phase-2 usage in the full factory below.
phase2launchFactory<T_C> p2lF;

int testBucket = TB;   // was referenced but never defined in the original

SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Annz = N*N;
int64_t Bnnz = N*N;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Annz, Bnnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();

int64_t *Bucket = G.getBucket();   // was undefined in the original
// TODO(review): nanobuckets/bucketp are never allocated in this harness;
// phase2 consumes phase1's nanobucket counts and emits bucket offsets.
int64_t *nanobuckets = nullptr;
int64_t *bucketp = nullptr;

       int nthrd = 32;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket, bucketp,
                                C);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       return true;
}
176 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_full_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Generates three randomized matrices, builds buckets and calls a kernel.
// This is the full version as called in SuiteSparse:GraphBLAS
// Pipeline: phase1 (nanobucket counts) -> phase2 (bucket assembly) ->
// phase3 "dndn" dot kernel for bucket TB. The GPU result is then compared
// against a CPU reference dot product; values and zombie counts must match
// (checked via EXPECT_EQ).

// NOTE: `p1lF()` / `p2lF()` in the original were most-vexing-parse function
// declarations, not objects.
phase1launchFactory<T_C, T_M, T_A, T_B> p1lF;
phase2launchFactory<T_C> p2lF;
launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "dndn");

int testBucket = TB;

//unsigned seed = 13372801;
//std::mt19937 r; //random number generator Mersenne Twister
//r.seed(seed);
int gpuID;
cudaGetDevice( &gpuID);

std::cout<< "found device "<<gpuID<<std::endl;

// Select the CPU reference operators and monoid identity for the semiring.
T_Z MONOID_IDENTITY;
if (SEMI_RING == "PLUS_TIMES") {
   std::cout << "Plus Times (+,*) semiring"<<std::endl;
   MONOID_IDENTITY = 0;
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;

}
else if(SEMI_RING == "MIN_PLUS") {
   std::cout << "Min Plus Times (min,+) semiring"<<std::endl;
   MONOID_IDENTITY = std::numeric_limits<T_Z>::max();
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;

}
else if(SEMI_RING == "MAX_PLUS") {
   MONOID_IDENTITY = std::numeric_limits<T_Z>::min();
   std::cout << "Max Plus Times (max,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}
// NOTE(review): any other SEMI_RING string leaves MONOID_IDENTITY and the
// operator pointers unset — callers must pass one of the three names above.

//Generate test data and setup for using a jitify kernel with 'bucket' interface
// The testBucket arg tells the generator which bucket we want to exercise
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G( testBucket);
int64_t Annz = N*N;
int64_t Bnnz = N*N;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Annz, Bnnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;

// Set clear zombie count
C->zombie_count = 0;

//std::cout<<"got all matrices"<<std::endl;
int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

// TODO(review): nanobuckets/bucketp are never allocated in this harness;
// allocate them before relying on the phase1/phase2 kernel output.
int64_t *nanobuckets = nullptr;
int64_t *bucketp = nullptr;

int zc_valid = 0;

bool result = false;

   // Phase 1: per-block nanobucket counts.
   // (Scoped: the original redeclared nthrd/nblck/kernTimer for phase 2 in
   // the same function scope, which does not compile.)
   {
       int nthrd = 32;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       p1lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket,
                                C, M, A, B);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;
   }

   // Phase 2: assemble the buckets from the nanobucket counts.
   {
       int nthrd = 32;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket, bucketp,
                                C);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;
   }


for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for dense-dense kernels
       // Allocated here (not once per bucket) so the 11 non-TB iterations
       // do not leak these buffers, as the original did.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));
       int nthrd = 32;
       int sz = 4;
       //int m = 256/sz;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU result and zombie count, then recompute on the CPU.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            //std::cout<<"Cx[i] = "<<Cx[i]<<std::endl;
            X_valid[i] = Cx[i];
            Cx[i] = 0;
            i_valid[i] = C->i[i];
       }
       G.loadCj();

       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        //std::cout<< " pair ="<<pair<<std::endl;
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = M->i[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (C->i [pC] >> 4) ;    // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch
        //std::cout<<" found dot "<<pair<<" at ("<<i<<","<<j<<")"<<std::endl;

        // xvp, xvi, xvals:  A(:,i)
        // xvp is Ap [i] and Ap [i+1]
        int64_t pA_start = A->p [i] ;
        int64_t pA_end   = A->p [i+1] ;
        // indices are in Ai [pA_start ... pA_end-1]
        // values  are in Ax [pA_start ... pA_end-1]

        // yvp, yvi, yvals:  B(:,j)
        // yvp is Bp [j] and Bp [j+1]
        int64_t pB_start = B->p [j] ;
        int64_t pB_end   = B->p [j+1] ;
        // indices are in Bi [pB_start ... pB_end-1]
        // values  are in Bx [pB_start ... pB_end-1]
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij = MONOID_IDENTITY;
        while( k < pA_end && l < pB_end) {
           //std::cout<<" A*B="<< (*MUL_ptr<T_Z>) ( (T_Z)Ax[k] , (T_Z) Bx[l]) <<std::endl ;
           cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ;
           k++;
           l++;
        }

        // output for this dot product: identity result means the entry is
        // annihilated and becomes a zombie.
        if (cij == MONOID_IDENTITY) {
            C->i [pC] = -1;//GB_FLIP (i)
            C->zombie_count++;
        }
        else {
            Cx [pC] = (T_C)cij;
            C->i [pC] = i;
        }
    }
       // Compare CPU reference against the saved GPU result.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {
             int64_t i =  C->i[l];
             //std::cout<<i<<","<<j<<","<<l <<" Cx = "<<Cx[l]<<"x_val="<<X_valid[l]<<std::endl;
             if (i >= 0)
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
389 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_dndn_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Assumes all matrices are square so far, so only N dimension given.
// Sparsity is dense here so Anz = Bnz = N*N (the Anz/Bnz args are unused).
// Generates three randomized matrices, builds buckets and calls a kernel.
// The GPU "dndn" result for bucket TB is validated against a CPU reference
// dot product; values and zombie counts must match (EXPECT_EQ).


launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "dndn");

int testBucket = TB;

//unsigned seed = 13372801;
//std::mt19937 r; //random number generator Mersenne Twister
//r.seed(seed);
int gpuID;
cudaGetDevice( &gpuID);

std::cout<< "found device "<<gpuID<<std::endl;

// Select the CPU reference operators and monoid identity for the semiring.
// NOTE(review): an unrecognized SEMI_RING leaves these unset.
T_Z MONOID_IDENTITY;
if (SEMI_RING == "PLUS_TIMES") {
   std::cout << "Plus Times (+,*) semiring"<<std::endl;
   MONOID_IDENTITY = 0;
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;

}
else if(SEMI_RING == "MIN_PLUS") {
   std::cout << "Min Plus Times (min,+) semiring"<<std::endl;
   MONOID_IDENTITY = std::numeric_limits<T_Z>::max();
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;

}
else if(SEMI_RING == "MAX_PLUS") {
   MONOID_IDENTITY = std::numeric_limits<T_Z>::min();
   std::cout << "Max Plus Times (max,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}

//Generate test data and setup for using a jitify kernel with 'bucket' interface
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Annz = N*N;
int64_t Bnnz = N*N;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Annz, Bnnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;

// Set clear zombie count
C->zombie_count = 0;

//std::cout<<"got all matrices"<<std::endl;
int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

int zc_valid = 0;

bool result = false;

for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for dense-dense kernels
       // Allocated here (not once per bucket) so the 11 non-TB iterations
       // do not leak these buffers, as the original did.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));
       int nthrd = 32;
       int sz = 4;
       //int m = 256/sz;
       int nblck = Cnz;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU result and zombie count, then recompute on the CPU.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            //std::cout<<"Cx[i] = "<<Cx[i]<<std::endl;
            X_valid[i] = Cx[i];
            Cx[i] = 0;
            i_valid[i] = C->i[i];
       }
       G.loadCj();

       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        //std::cout<< " pair ="<<pair<<std::endl;
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = M->i[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (C->i [pC] >> 4) ;    // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch
        //std::cout<<" found dot "<<pair<<" at ("<<i<<","<<j<<")"<<std::endl;

        // xvp, xvi, xvals:  A(:,i)
        // xvp is Ap [i] and Ap [i+1]
        int64_t pA_start = A->p [i] ;
        int64_t pA_end   = A->p [i+1] ;
        // indices are in Ai [pA_start ... pA_end-1]
        // values  are in Ax [pA_start ... pA_end-1]

        // yvp, yvi, yvals:  B(:,j)
        // yvp is Bp [j] and Bp [j+1]
        int64_t pB_start = B->p [j] ;
        int64_t pB_end   = B->p [j+1] ;
        // indices are in Bi [pB_start ... pB_end-1]
        // values  are in Bx [pB_start ... pB_end-1]
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij = MONOID_IDENTITY;
        while( k < pA_end && l < pB_end) {
           //std::cout<<" A*B="<< (*MUL_ptr<T_Z>) ( (T_Z)Ax[k] , (T_Z) Bx[l]) <<std::endl ;
           cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ;
           k++;
           l++;
        }

        // output for this dot product: identity result means the entry is
        // annihilated and becomes a zombie.
        if (cij == MONOID_IDENTITY) {
            C->i [pC] = -1;//GB_FLIP (i)
            C->zombie_count++;
        }
        else {
            Cx [pC] = (T_C)cij;
            C->i [pC] = i;
        }
    }
       // Compare CPU reference against the saved GPU result.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {
             int64_t i =  C->i[l];
             //std::cout<<i<<","<<j<<","<<l <<" Cx = "<<Cx[l]<<"x_val="<<X_valid[l]<<std::endl;
             if (i >= 0)
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
570 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vsvs_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Assumes all matrices are square so far, so only N dimension given.
// Sparsity is controlled by Anz and Bnz vs N*N.
// Generates three randomized matrices, builds buckets and calls a kernel.
// The GPU "vsvs" (v.sparse-v.sparse) result for bucket TB is validated
// against a CPU merge-based sparse dot product (EXPECT_EQ on values and
// zombie counts).


launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "vsvs");

int testBucket = TB;

//unsigned seed = 13372801;
//std::mt19937 r; //random number generator Mersenne Twister
//r.seed(seed);
int gpuID;
cudaGetDevice( &gpuID);
std::cout<< "found device "<<gpuID<<std::endl;

// Select the CPU reference operators for the semiring (no identity needed:
// the merge loop tracks existence with cij_exists instead).
if (SEMI_RING == "PLUS_TIMES") {
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;

}
else if(SEMI_RING == "MIN_PLUS") {
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;

}
else if(SEMI_RING == "MAX_PLUS") {
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}

//Generate test data and setup for using a jitify kernel with 'bucket' interface
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Anz, Bnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;
int64_t *Ci = C->i;
int64_t *Mi = M->i;
int64_t *Ai = A->i;
int64_t *Bi = B->i;
int64_t *Ap = A->p;
int64_t *Bp = B->p;

//std::cout<<"got all matrices"<<std::endl;
int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

int zc_valid = 0;

bool result = false;

for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for v.sparse-v.sparse kernels
       // Allocated here (not once per bucket) so the 11 non-TB iterations
       // do not leak these buffers, as the original did.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));
       int nthrd = 32;
       int sz = Anz/N;              // avg entries per column of A
       if (sz == 0) sz = 1;         // guard: very sparse A would divide by zero
       int m = 256/sz;
       if (m == 0) m = 1;           // guard: sz > 256 would zero the divisor below
       int nblck = (Cnz -1 + m*nthrd )/(m*nthrd) ;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU result and zombie count, then recompute on the CPU.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            X_valid[i] = Cx[i];
            Cx[i] = 0;
            i_valid[i] = Ci[i];
       }
       G.loadCj();
       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        //std::cout<< " pair ="<<pair<<std::endl;
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;    // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch
        //std::cout<<" found dot "<<pair<<" at ("<<i<<","<<j<<")"<<std::endl;

        // xvp, xvi, xvals:  A(:,i)
        // xvp is Ap [i] and Ap [i+1]
        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;
        // indices are in Ai [pA_start ... pA_end-1]
        // values  are in Ax [pA_start ... pA_end-1]

        // yvp, yvi, yvals:  B(:,j)
        // yvp is Bp [j] and Bp [j+1]
        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;
        // indices are in Bi [pB_start ... pB_end-1]
        // values  are in Bx [pB_start ... pB_end-1]
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        // Two-finger merge over the sparse index lists of A(:,i) and B(:,j).
        while( k < pA_end && l < pB_end) {
            if ( Ai[k] < Bi[l]) ++k;
            else if ( Ai[k] > Bi[l]) ++l;
            else {
                if (cij_exists) {
                   cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( Ax[k] , Bx[l] ) );
                }
                else{
                   cij_exists = true;
                   cij = (*MUL_ptr<T_Z>)( Ax[k], Bx[l]);
                }
                k++;
                l++;
            }
        }

        // output for this dot product: no intersection means the entry is
        // a zombie.
        if (cij_exists) {
            Ci [pC] = i;
            Cx[pC] = (T_C)cij;
        }
        else {
            Ci [pC] = -1;//GB_FLIP (i)
            C->zombie_count++;
        }
    }
       // Compare CPU reference against the saved GPU result.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {
             //std::cout<<i<<","<<j<<","<<l <<" Cx = "<<Cx[l]<<"x_val="<<X_valid[l]<<std::endl;
             // was `Ci[l] > 0`, which wrongly excluded row index 0 from the
             // check; the sibling dndn factory uses >= 0 (zombies are -1).
             if (Ci[l] >= 0)
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       // was labeled GPU/zCPU backwards: get_zombie_count() now holds the
       // CPU recount, zc_valid holds the GPU count saved above.
       std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
756 
757 template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
test_AxB_dot3_vssp_factory(int TB,int64_t N,int64_t Anz,int64_t Bnz,std::string & SEMI_RING)758 bool test_AxB_dot3_vssp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
759 // Assumes all matrices are square so far, so only N dimension given.
760 // Sparsity is controlled by Anz and Bnz vs N*N.
761 // Generates three randomized matrices, builds buckets and calls a kernel.
762 
763 launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "vssp");
764 
765 int testBucket = TB;
766 
767 //unsigned seed = 13372801;
768 //std::mt19937 r; //random number generator Mersenne Twister
769 //r.seed(seed);
770 int gpuID;
771 cudaGetDevice( &gpuID);
772 std::cout<< "found device "<<gpuID<<std::endl;
773 
774 //T_Z MONOID_IDENTITY;
775 if (SEMI_RING == "PLUS_TIMES") {
776    //MONOID_IDENTITY =(T_Z)0;
777    ADD_ptr<T_Z> = myOP_plus<T_Z>;
778    MUL_ptr<T_Z> = myOP_times<T_Z>;
779 
780 }
781 else if(SEMI_RING == "MIN_PLUS") {
782    //MONOID_IDENTITY = std::numeric_limits<T_Z>::max();
783    ADD_ptr<T_Z> = myOP_min<T_Z>;
784    MUL_ptr<T_Z> = myOP_plus<T_Z>;
785 
786 }
787 else if(SEMI_RING == "MAX_PLUS") {
788    //MONOID_IDENTITY = std::numeric_limits<T_Z>::min();
789    ADD_ptr<T_Z> = myOP_max<T_Z>;
790    MUL_ptr<T_Z> = myOP_plus<T_Z>;
791 }
792 
793 //Generate test data and setup for using a jitify kernel with 'bucket' interface
794 SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
795 
796 int64_t Cnz = N;
797 float Cnzpercent = (float)( Cnz)/(N*N);
798 
799 G.init(N, Anz, Bnz, Cnzpercent );
800 
801 G.fill_buckets( testBucket); // all elements go to testbucket= TB
802 
803 matrix<T_C>* C = G.getCptr();
804 matrix<T_M>* M = G.getMptr();
805 matrix<T_A>* A = G.getAptr();
806 matrix<T_B>* B = G.getBptr();
807 
808 T_C *Cx = C->x;
809 T_A *Ax = A->x;
810 T_B *Bx = B->x;
811 int64_t *Ci = C->i;
812 int64_t *Mi = M->i;
813 int64_t *Ai = A->i;
814 int64_t *Bi = B->i;
815 int64_t *Ap = A->p;
816 int64_t *Bp = B->p;
817 
818 
819 //std::cout<<"got all matrices"<<std::endl;
820 int64_t *Bucket = G.getBucket();
821 int64_t *BucketStart = G.getBucketStart();
822 
823 int zc_valid = 0;
824 int zc = 0;
825 
826 bool result = false;
827 
828 for (int b =0; b < 12; ++b) {// loop on buckets
829 
830     int64_t b_start = BucketStart [b] ;
831     int64_t b_end   = BucketStart [b+1] ;
832     int64_t nvecs = b_end - b_start ;
833     if (nvecs == 0) continue;
834     std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;
835 
836     T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
837     int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));
838     if (b == TB) { //test cases for v.sparse-dense kernels
839        int nthrd = 32;
840        int sz = 4;
841        //int m = 256/sz;
842        int nblck = (Cnz -1 + nthrd )/(nthrd) ;
843        std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;
844 
845        GpuTimer kernTimer;
846        kernTimer.Start();
847        lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
848                                 C, M, A, B, sz);
849 
850        kernTimer.Stop();
851        std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;
852 
853        //std::cout<<"returned from kernel"<<std::endl;
854 
855        zc_valid = C->zombie_count;
856        C->zombie_count = 0;
857        for (int i =0 ; i< Cnz; ++i) {
858             X_valid[i] = Cx[i];
859             Cx[i] = 0;
860             i_valid[i] = C->i[i];
861        }
862        G.loadCj();
863 
864 
865        for (int64_t pair = b_start ; pair < b_end ; pair++) {
866 
867         // get the kth entry in bucket b
868         //std::cout<< " pair ="<<pair<<std::endl;
869         int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
870 
871         int64_t i = Mi[pC] ;          // row index of C(i,j)
872         // get C(i,j)
873         int64_t k = (Ci [pC] >> 4) ;    // col index of C(i,j)
874         //ASSERT ((C->i [pC] & 4) == b) ;
875         int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch
876         //std::cout<<" found dot "<<pair<<" at ("<<i<<","<<j<<")"<<std::endl;
877 
878         int64_t pA      = Ap[i];
879         int64_t pA_end  = Ap[i+1];
880         int64_t nnzA = pA_end - pA;
881 
882         int64_t pB      = Bp[j];
883         int64_t pB_end  = Bp[j+1];
884         int64_t nnzB = pB_end - pB;
885 
886         //Search for each nonzero in the smaller vector to find intersection
887         bool cij_exists = false;
888 
889         T_A aki;
890         T_B bkj;
891         T_Z cij;
892 
893         if (nnzA <= nnzB) {
894             //----------------------------------------------------------------------
895             // A(:,i) is very sparse compared to B(:,j)
896             //----------------------------------------------------------------------
897 
898             while (pA < pA_end && pB < pB_end)
899             {
900                 int64_t ia = Ai [pA] ;
901                 int64_t ib = Bi [pB] ;
902                 if (ia < ib)
903                 {
904                     // A(ia,i) appears before B(ib,j)
905                     pA++ ;
906                 }
907                 else if (ib < ia)
908                 {
909                     // B(ib,j) appears before A(ia,i)
910                     // discard all entries B(ib:ia-1,j)
911                     int64_t pleft = pB + 1 ;
912                     int64_t pright = pB_end - 1 ;
913                     GB_BINARY_TRIM_SEARCH (ia, Bi, pleft, pright) ;
914                     //ASSERT (pleft > pB) ;
915                     pB = pleft ;
916                 }
917                 else // ia == ib == k
918                 {
919                     // A(k,i) and B(k,j) are the next entries to merge
920                     #if defined ( GB_PHASE_1_OF_2 )
921                     cij_exists = true ;
922                     break ;
923                     #else
924                     GB_GETA (aki, Ax, pA) ;             /* aki = A(k,i) */
925                     GB_GETB (bkj, Bx, pB) ;             /* bkj = B(k,j) */
926                     if (cij_exists)
927                     {
928                         cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki , (T_Z)bkj ) );
929                         /* cij += aki * bkj */
930                     }
931                     else
932                     {
933                         /* cij = A(k,i) * B(k,j), and add to the pattern */
934                         cij_exists = true ;
935                         cij=  (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) ;
936                         /* cij = aki * bkj */
937                     }
938                     //GB_DOT_TERMINAL (cij) ;         // break if cij == terminal
939                     pA++ ;
940                     pB++ ;
941                     #endif
942                 }
943             }
944         }
945         else {
946             //----------------------------------------------------------------------
947             // B(:,j) is very sparse compared to A(:,i)
948             //----------------------------------------------------------------------
949 
950             while (pA < pA_end && pB < pB_end)
951             {
952                 int64_t ia = Ai [pA] ;
953                 int64_t ib = Bi [pB] ;
954                 if (ia < ib)
955                 {
956                     // A(ia,i) appears before B(ib,j)
957                     // discard all entries A(ia:ib-1,i)
958                     int64_t pleft = pA + 1 ;
959                     int64_t pright = pA_end - 1 ;
960                     GB_BINARY_TRIM_SEARCH (ib, Ai, pleft, pright) ;
961                     //ASSERT (pleft > pA) ;
962                     pA = pleft ;
963                 }
964                 else if (ib < ia)
965                 {
966                     // B(ib,j) appears before A(ia,i)
967                     pB++ ;
968                 }
969                 else // ia == ib == k
970                 {
971                     // A(k,i) and B(k,j) are the next entries to merge
972                     #if defined ( GB_PHASE_1_OF_2 )
973                     cij_exists = true ;
974                     break ;
975                     #else
976                     GB_GETA (aki, Ax, pA) ;             /* aki = A(k,i) */
977                     GB_GETB (bkj, Bx, pB) ;             /* bkj = B(k,j) */
978                     if (cij_exists)
979                     {
980                         cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki , (T_Z)bkj ) );
981                         /* cij += aki * bkj */      \
982                     }
983                     else
984                     {
985                         /* cij = A(k,i) * B(k,j), and add to the pattern */
986                         cij_exists = true ;
987                         cij=  (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) ;
988                     }
989                     //GB_DOT_TERMINAL (cij) ;         // break if cij == terminal
990                     pA++ ;
991                     pB++ ;
992                     #endif
993                 }
994             }
995 
996         }
997         if ( cij_exists){
998            Ci[pair] = i;
999            Cx[pair] = (T_C)cij;
1000         }
1001         else {
1002            zc++;
1003            //printf(" %lld, %lld is zombie %d!\n",i,j,zc);
1004            Ci[pair] = GB_FLIP( i );
1005         }
1006 
1007     }
1008        C->zombie_count = zc;
1009        T_C err = 0;
1010        for (int j =0 ; j< N; ++j) {
1011          for ( int l = C->p[j]; l< C->p[j+1]; ++l) {
1012              int64_t i = Ci[l];
1013              //std::cout<<i<<","<<j<<","<<l <<" Cx = "<<Cx[l]<<"x_val="<<X_valid[l]<<std::endl;
1014              if (i > 0){ //not a zombie!
1015                  err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
1016              }
1017          }
1018        }
1019        std::cout<< " 2-norm of err ="<< err<<std::endl;
1020        std::cout<< " zombie count GPU = "<<C->get_zombie_count()<<" zCPU ="<<zc_valid<<std::endl;
1021 
1022        EXPECT_EQ(err,0);
1023        EXPECT_EQ( zc_valid, C->get_zombie_count());
1024 
1025        free(X_valid);
1026        free(i_valid);
1027      }
1028     }
1029 
1030 G.del();
1031 
1032 return result;
1033 
1034 }
1035 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_spdn_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Test the jitified AxB_dot3 "spdn" (very-sparse times dense) kernel against
// a CPU reference computed with the same semiring.
// Assumes all matrices are square so far, so only N dimension given.
// Sparsity is controlled by Anz vs N*N; B is always generated fully dense
// here, so the Bnz argument is currently unused (kept for interface parity
// with the other AxB_dot3 test factories).
// Generates three randomized matrices, builds buckets and calls a kernel.

launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "spdn");

int testBucket = TB;

int gpuID;
cudaGetDevice( &gpuID);
std::cout<< "found device "<<gpuID<<std::endl;

// Bind the CPU operator pointers that mirror the requested semiring.
if (SEMI_RING == "PLUS_TIMES") {
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;
}
else if(SEMI_RING == "MIN_PLUS") {
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}
else if(SEMI_RING == "MAX_PLUS") {
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}

//Generate test data and setup for using a jitify kernel with 'bucket' interface
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;

int64_t Cnz = N;
float Cnzpercent = (float)( Cnz)/(N*N);

//spdn case means B should be dense -> Bnz = N*N;
G.init(N, Anz, N*N, Cnzpercent );

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;
int64_t *Ci = C->i;
int64_t *Mi = M->i;
int64_t *Ai = A->i;
int64_t *Bi = B->i;
int64_t *Ap = A->p;
int64_t *Bp = B->p;

int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

int zc_valid = 0;

bool result = false;

for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs == 0) continue;
    std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for v.sparse-dense kernels
       // FIX: allocate the CPU reference buffers only for the bucket under
       // test; they were previously malloc'd on every non-empty bucket
       // iteration but freed only inside this branch, leaking otherwise.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));

       int nthrd = 32;
       int sz = Anz/N;   // average nnz per vector of A
       int m = 256/sz;   // NOTE(review): divides by zero when Anz < N -- confirm callers guarantee Anz >= N
       int nblck = (Cnz -1 + m*nthrd )/(m*nthrd) ;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU results and clear C so the CPU reference can be
       // recomputed in place.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            X_valid[i] = Cx[i];
            Cx[i] = 0;
            i_valid[i] = Ci[i];
       }
       G.loadCj();
       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;          // row index of C(i,j)

         int64_t pA = Ap[i];
         int64_t pA_end   = Ap[i+1];
         int64_t nnzA   = pA_end - pA;
         // NOTE(review): B is indexed by the row index i rather than a
         // column index j here -- confirm the problem generator makes the
         // mask pattern such that this is valid.
         int64_t pB = Bp[i];
         int64_t pB_end   = Bp[i+1];
         int64_t nnzB   = pB_end - pB;
         T_A aki;
         T_B bkj;
         T_Z cij = (T_Z)0;  // FIX: well-defined value even if neither operand is dense

         if( nnzA == A->vlen) // A is dense
         {
            int64_t k = Bi [pB] ;               // first row index of B(:,j)
            // cij = A(k,i) * B(k,j)
            GB_GETA (aki, Ax, pA+k) ;           // aki = A(k,i)
            GB_GETB (bkj, Bx, pB  ) ;           // bkj = B(k,j)
            cij = (*MUL_ptr<T_Z>)( aki, bkj) ;           // cij = aki * bkj

            for (int64_t p = pB+1 ; p < pB_end ; p++)
            {
                int64_t k = Bi [p] ;                // next row index of B(:,j)
                // cij += A(k,i) * B(k,j)
                GB_GETA (aki, Ax, pA+k) ;           // aki = A(k,i)
                GB_GETB (bkj, Bx, p   ) ;           // bkj = B(k,j)
                cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
            }
         }
         if( nnzB == B->vlen) // B is dense
         {
            int64_t k = Ai [pA] ;               // first row index of A(:,i)
            // cij = A(k,i) * B(k,j)
            GB_GETA (aki, Ax, pA  ) ;           // aki = A(k,i)
            GB_GETB (bkj, Bx, pB+k) ;           // bkj = B(k,j)
            cij = (*MUL_ptr<T_Z>)( aki, bkj) ;           // cij = aki * bkj

            for (int64_t p = pA+1 ; p < pA_end ; p++)
            {
                int64_t k = Ai [p] ;                // next row index of A(:,i)
                // cij += A(k,i) * B(k,j)
                GB_GETA (aki, Ax, p   ) ;           // aki = A(k,i)
                GB_GETB (bkj, Bx, pB+k) ;           // bkj = B(k,j)
                cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
            }
         }

         // NOTE(review): results are stored at index 'pair', not 'pC' --
         // these only coincide when Bucket is null; confirm intent.
         Ci[pair] = i;
         Cx[pair] = cij;

      }
       // Compare the CPU reference against the saved GPU results.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {
             int64_t i =  Ci[l];
             if (i >=0 )  // non-negative row index means a live (non-zombie) entry
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       std::cout<< " zombie count GPU = "<<C->get_zombie_count()<<" zCPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
1232 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_mp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Test the jitified AxB_dot3 "mp" (merge-path) kernel against a CPU
// reference computed with the same semiring.
// Assumes all matrices are square so far, so only N dimension given.
// Generates three randomized matrices, builds buckets and calls a kernel,
// then recomputes every dot product single-threaded on the CPU and compares.

launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "mp");

int testBucket = TB;

// Bind the CPU operator pointers that mirror the requested semiring.
if (SEMI_RING == "PLUS_TIMES") {
   std::cout << "Plus Times (+,*) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;
}
else if(SEMI_RING == "MIN_PLUS") {
   std::cout << "Min Plus Times (min,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}
else if(SEMI_RING == "MAX_PLUS") {
   std::cout << "Max Plus Times (max,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}

//Generate test data and setup for using a jitify kernel with 'bucket' interface
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Annz = Anz;
int64_t Bnnz = Bnz;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Annz, Bnnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;
int64_t *Ci = C->i;
int64_t *Mi = M->i;
int64_t *Ai = A->i;
int64_t *Bi = B->i;
int64_t *Ap = A->p;
int64_t *Bp = B->p;

// Set clear zombie count
C->zombie_count = 0;

int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

int zc_valid = 0;

bool result = false;

for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for merge-path kernel
       // FIX: allocate the CPU reference buffers only for the bucket under
       // test; they were previously malloc'd for all 12 buckets but freed
       // only in this branch, leaking once per non-TB bucket.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));

       int nthrd = 32;
       int nblck = Cnz;     // one block per dot product
       int sz = 0;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU results and clear C so the CPU reference can be
       // recomputed in place.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            X_valid[i] = Cx[i];
            i_valid[i] = C->i[i];
            // clear values for next test
            Cx[i] = 0;
       }
       G.loadCj();

       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;    // col index of C(i,j)
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;

        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;
        // NOTE: this test code is NOT doing merge-path. This is just a
        // single-threaded linear merge for correctness testing.
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        while( k < pA_end && l < pB_end) {
           if      ( Ai[k] < Bi[l] ) k += 1;
           else if ( Ai[k] > Bi[l] ) l += 1;
           else {
             // indices match: fold A(k,i)*B(k,j) into the dot product
             if (cij_exists) {
               cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ;
             }
             else {
               cij_exists = true;
               cij = (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l] ) ;
             }

             k++;
             l++;
           }
        }

        // output for this dot product is

        if (cij_exists) {
            Cx [pC] = (T_C)cij;
            Ci [pC] = i;
        }
        else {
            C->i [pC] = -1;//GB_FLIP (i)
            C->zombie_count++;
        }
    }
       // Compare the CPU reference against the saved GPU results.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {

             // FIX: zombies are marked with -1 above, so any non-negative
             // row index is a live entry; the old test (Ci[l] > 0) wrongly
             // skipped valid entries in row 0.
             if (Ci[l] >= 0) {
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
             }
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
1428 
template <typename T_C, typename T_M, typename T_A,typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_warp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
// Test the jitified AxB_dot3 "warp" (one warp per dot product) kernel
// against a CPU reference computed with the same semiring.
// Assumes all matrices are square so far, so only N dimension given.
// Generates three randomized matrices, builds buckets and calls a kernel,
// then recomputes every dot product single-threaded on the CPU and compares.

launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "warp");

int testBucket = TB;

// Bind the CPU operator pointers that mirror the requested semiring.
if (SEMI_RING == "PLUS_TIMES") {
   std::cout << "Plus Times (+,*) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_plus<T_Z>;
   MUL_ptr<T_Z> = myOP_times<T_Z>;
}
else if(SEMI_RING == "MIN_PLUS") {
   std::cout << "Min Plus Times (min,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_min<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}
else if(SEMI_RING == "MAX_PLUS") {
   std::cout << "Max Plus Times (max,+) semiring"<<std::endl;
   ADD_ptr<T_Z> = myOP_max<T_Z>;
   MUL_ptr<T_Z> = myOP_plus<T_Z>;
}

//Generate test data and setup for using a jitify kernel with 'bucket' interface
SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
int64_t Cnz = N;
float Cnzpercent = (float) Cnz/(N*N);

G.init(N, Anz, Bnz, Cnzpercent);

G.fill_buckets( testBucket); // all elements go to testbucket= TB

matrix<T_C>* C = G.getCptr();
matrix<T_M>* M = G.getMptr();
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

T_C *Cx = C->x;
T_A *Ax = A->x;
T_B *Bx = B->x;
int64_t *Ci = C->i;
int64_t *Mi = M->i;
int64_t *Ai = A->i;
int64_t *Bi = B->i;
int64_t *Ap = A->p;
int64_t *Bp = B->p;

// Set clear zombie count
C->zombie_count = 0;

int64_t *Bucket = G.getBucket();
int64_t *BucketStart = G.getBucketStart();

int zc_valid = 0;

bool result = false;

for (int b =0; b < 12; ++b) {// loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    if (b == TB) { //test cases for warp-per-dot kernel
       // FIX: allocate the CPU reference buffers only for the bucket under
       // test; they were previously malloc'd for all 12 buckets but freed
       // only in this branch, leaking once per non-TB bucket.
       T_C *X_valid  = (T_C*) malloc( Cnz*sizeof(T_C));
       int64_t *i_valid = (int64_t*)malloc( Cnz *sizeof(int64_t));

       int nthrd = 32;
       int nblck = (Cnz + nthrd -1)/nthrd ;   // one warp (thread here) slice per dot
       int sz = 0;
       std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

       GpuTimer kernTimer;
       kernTimer.Start();
       lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                                C, M, A, B, sz);

       kernTimer.Stop();
       std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

       // Save the GPU results and clear C so the CPU reference can be
       // recomputed in place.
       zc_valid = C->zombie_count;
       C->zombie_count = 0;
       for (int i =0 ; i< Cnz; ++i) {
            X_valid[i] = Cx[i];
            i_valid[i] = C->i[i];
            // clear values for next test
            Cx[i] = 0;
       }
       G.loadCj();

       for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;    // col index of C(i,j)
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;

        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;
        // NOTE: this test code is NOT doing merge-path. This is just a
        // single-threaded linear merge for correctness testing.
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        while( k < pA_end && l < pB_end) {
           if      ( Ai[k] < Bi[l] ) k += 1;
           else if ( Ai[k] > Bi[l] ) l += 1;
           else {
             // indices match: fold A(k,i)*B(k,j) into the dot product
             if (cij_exists) {
               cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k] , (T_Z) Bx[l]) ) ;
             }
             else {
               cij_exists = true;
               cij = (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l] ) ;
             }

             k++;
             l++;
           }
        }

        // output for this dot product is

        if (cij_exists) {
            Cx [pC] = (T_C)cij;
            Ci [pC] = i;
        }
        else {
            C->i [pC] = -1;//GB_FLIP (i)
            C->zombie_count++;
        }
    }
       // Compare the CPU reference against the saved GPU results.
       T_C err = 0;
       for (int j =0 ; j< N; ++j) {
         for ( int l = C->p[j]; l< C->p[j+1]; ++l) {

             // FIX: zombies are marked with -1 above, so any non-negative
             // row index is a live entry; the old test (Ci[l] > 0) wrongly
             // skipped valid entries in row 0.
             if (Ci[l] >= 0) {
                err +=  ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
             }
         }
       }
       std::cout<< " 2-norm of err ="<< err<<std::endl;
       std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

       EXPECT_EQ(err,0);
       EXPECT_EQ( zc_valid, C->get_zombie_count());

       free(X_valid);
       free(i_valid);
     }
    }

G.del();

return result;

}
1622 
template <typename T>
bool test_reducefactoryUM( unsigned int N, std::string OP) {
  // Reduce N linearly-filled values (0..N-1) on the GPU with the monoid
  // named by OP ("PLUS", "MIN" or "MAX"), combine the per-block partial
  // results on the CPU, and check the total against the closed form.
  // Returns false (pass/fail is reported through the EXPECT_* macros).

  reduceFactory<T> rF;

  int block(32);
  int nblock= (N + 8*block -1)/(8*block);   // each thread covers 8 elements
  int grid(nblock);
  T* d_data;
  T* output;

  // FIX: d_data must hold all N input values; it was previously allocated
  // with nblock*sizeof(T), so fillvector_linear(N, ...) wrote far past the
  // end of the buffer.
  CHECK_CUDA( cudaMallocManaged((void**) &d_data, N*sizeof(T)) );
  CHECK_CUDA( cudaMallocManaged((void**) &output, nblock*sizeof(T)) );

  // Linear fill 0..N-1 gives a triangular sum N*(N-1)/2 for PLUS.
  fillvector_linear<T> ( N, d_data);

  T sum;
  std::cout << "Launching reduce"<<OP<<GET_TYPE_NAME(sum)<<" kernel..."<<std::endl;
  rF.jitGridBlockLaunch( grid, block, d_data, output, N, OP );

  // NOTE(review): assumes jitGridBlockLaunch synchronizes before returning;
  // otherwise this host read of managed memory races the kernel -- confirm.
  for (int i =0; i< nblock; ++i) std::cout<< output[i] <<" ";

  // Select the CPU identity value and combiner matching OP.
  if (OP == "PLUS"){
      sum = (T) 0;
      myOpPTR<T> = myOP_plus<T>;
  }
  else if (OP == "MIN") {
      sum = (T)std::numeric_limits<T>::max();
      myOpPTR<T> = myOP_min<T>;
  }
  else if (OP == "MAX") {
      sum = (T)std::numeric_limits<T>::min();
      myOpPTR<T> = myOP_max<T>;
  }
  else {
      // FIX: an unrecognized OP previously left sum and myOpPTR
      // uninitialized and then called through the garbage pointer.
      std::cout << "test_reducefactoryUM: unsupported operation "<<OP<<std::endl;
      cudaFree(d_data);
      cudaFree(output);
      return false;
  }

  for (int i =0; i< nblock; ++i) sum = (*myOpPTR<T>)(sum ,output[i]);

  T expect;
  bool result = false;
  if (OP == "PLUS") {
     expect  = (T)(N*(N-1)/2);
     T temp = (sum - expect) ;
     if (temp < 0) temp = -temp ;
     EXPECT_LE(temp, 1);    // tolerate one unit of rounding for float types
  }
  else if (OP == "MAX") {
     expect = (T)(N-1);     // max of 0..N-1
     EXPECT_EQ( sum , (T)(N-1) );
  }
  else if (OP == "MIN") {
     expect = (T)0;         // min of 0..N-1
     EXPECT_EQ( sum , (T)(0) );
  }
  std::cout <<std::endl<<"result of test_reducefactoryUM with "<< OP<< " operation ="<< sum
            <<" expected "<<expect << std::endl;

  cudaFree(d_data);
  cudaFree(output);
  return result;
}
1699 
template <typename T1,typename T2,typename T3>
bool test_dndotfactoryUM( unsigned int N, std::string SEMI_RING) {
  // Dense dot product of x = [0..N-1] and y = all-ones on the GPU under the
  // semiring named by SEMI_RING ("PLUS_TIMES" or "MIN_PLUS"); per-block
  // partial results are combined on the CPU and checked against the
  // closed-form answer.
  // Returns false (pass/fail is reported through the EXPECT_* macros).

  dotFactory<T1,T2,T3> dF;

  int block(512);
  int nblock= (N + 8*block -1)/(8*block);   // each thread covers 8 elements
  int grid(nblock);
  T1* x;
  T2* y;
  T3* output;
  CHECK_CUDA( cudaMallocManaged((void**)&x, N*sizeof(T1)) );
  CHECK_CUDA( cudaMallocManaged((void**)&y, N*sizeof(T2)) );
  CHECK_CUDA( cudaMallocManaged((void**)&output, nblock*sizeof(T3)) );

  // x = 0..N-1, y = all ones, so PLUS_TIMES yields the triangular sum
  // N*(N-1)/2 (the old comment said N*(N+1)/2, which did not match the
  // expected value checked below).
  fillvector_linear<T1> (N, x);
  fillvector_constant<T2> (N, y, T2(1));

  dF.jitGridBlockLaunch( grid, block, x, y, output, N, SEMI_RING );

  // Select the CPU identity value and combiner matching the semiring monoid.
  T3 sum;
  if (SEMI_RING == "PLUS_TIMES")
  {
      myOpPTR<T3> = myOP_plus<T3>;
      sum = (T3)0;
  }
  else if (SEMI_RING == "MIN_PLUS")
  {
      sum = std::numeric_limits<T3>::max();
      myOpPTR<T3> = myOP_min<T3>;
  }
  else
  {
      // FIX: an unrecognized semiring previously left sum and myOpPTR
      // uninitialized and then called through the garbage pointer.
      std::cout << "test_dndotfactoryUM: unsupported semiring "<<SEMI_RING<<std::endl;
      cudaFree(x);
      cudaFree(y);
      cudaFree(output);
      return false;
  }

  for (int i =0; i< nblock; ++i) sum = (*myOpPTR<T3>)(sum ,output[i]);

  bool result = false;
  T3 expect;
  if (SEMI_RING == "PLUS_TIMES") {
     expect = (T3)(N*(N-1)/2);
     T3 temp = (sum -expect) ;
     if (temp < 0) temp = -temp ;
     EXPECT_LE( temp, (T3)1 );   // tolerate one unit of rounding for float types
  }
  else if (SEMI_RING == "MIN_PLUS") {
     expect = (T3) 1;            // min is 1 from the (0,1) pair
     EXPECT_EQ( sum, expect);
  }
  std::cout <<"test_dotfactoryUM with "<<SEMI_RING<<" semi-ring="<< sum
                                       <<" expected "<<expect << std::endl;

  cudaFree(x);
  cudaFree(y);
  cudaFree(output);
  return result;
}
1758 
1759 
1760 template <typename T1,typename T2,typename T3>
test_spdotfactoryUM(unsigned int N,unsigned int xn,unsigned int yn,std::string SEMI_RING)1761 bool test_spdotfactoryUM( unsigned int N, unsigned int xn, unsigned int yn, std::string SEMI_RING) {
1762 
1763 #define INTMIN( A, B) ( (A) < (B) ) ?  (A) : (B)
1764 
1765   // N here is the index space that the sparse vectors are drawn from.
1766   // Indices in xi and yi are in the range (0,N-1)
1767   // We will generate a number of random values in this range for test data
1768   std::cout<< " xn,yn= "<<xn<<','<<yn<<"min = "<< std::min( xn, yn) <<std::endl;
1769   int n_threads = std::min( xn, yn) / 4;
1770   std::cout<< "I think we need "<< n_threads<<" threads to do this."<<std::endl;
1771   int pad_threads = 2;
1772   while ( pad_threads < n_threads) {
1773       pad_threads *= 2;
1774   }
1775   int block= 32;
1776   int nblock= ( pad_threads + block -1)/(block);
1777   int grid(nblock);
1778   std::cout<<"N="<<N<<" xn ="<<xn<<", yn="<<yn<<" nblock="<<nblock<<" block ="<<block<<std::endl;
1779   unsigned int *xi;
1780   unsigned int *yi;
1781   T1* x;
1782   T2* y;
1783   T3* output;
1784   unsigned int intersection_size = 0; //will be filled in later if needed and xn != yn
1785   unsigned seed = 13372801;
1786   std::mt19937 r; //random number generator Mersenne Twister
1787   r.seed(seed);
1788   cudaMallocManaged((void**)&x, xn*sizeof(T1));
1789   cudaMallocManaged((void**)&xi, xn*sizeof(int));
1790   cudaMallocManaged((void**)&y, yn*sizeof(T2));
1791   cudaMallocManaged((void**)&yi, yn*sizeof(int));
1792   cudaMallocManaged((void**)&output, nblock*sizeof(T3));
1793 
1794   int inv_sparsity = N/std::max(xn,yn);  //= values not taken per value occupied in index space
1795   std::cout<<" Using inv_sparsity value of "<< inv_sparsity<<std::endl;
1796   fillvector_constant<T1> (xn, x, T1(1));
1797   fillvector_constant<T2> (yn, y, T2(1));
1798 
1799   if( xn == yn){  // test case : all values intersect, generate 1 random number for both
1800       intersection_size = xn;
1801       std::cout << " all-intersect case..."<<std::endl;
1802       for (unsigned int i =0; i < xn; ++i){
1803           unsigned int rand_i = inv_sparsity*i+ r() %(inv_sparsity);
1804           xi[i] = rand_i; //we will get a count of the intersection size
1805           yi[i] = rand_i; //we will get a count of the intersection size
1806       }
1807       //std::sort (xi, xi + xn);
1808       //std::sort (yi, yi + yn);
1809   }
1810   else { // generate two different sets of indices, no known intersection pattern
1811       for (unsigned int i =0; i < xn; ++i){
1812           unsigned int rand_i = inv_sparsity*i +r() % (inv_sparsity);
1813           xi[i] = rand_i; //we will get a count of the intersection size
1814       }
1815       for (unsigned int i =0; i < yn; ++i){
1816           unsigned int rand_i = inv_sparsity*i +r() % (inv_sparsity);
1817           yi[i] = rand_i; //we will get a count of the intersection size
1818       }
1819       //std::sort (xi, xi + xn);
1820       //std::sort (yi, yi + yn);
1821       unsigned int xp =0;
1822       unsigned int yp =0;
1823       while (1){  //find the intersection size by merge of two sorted lists
1824           if (xi[xp] < yi[yp]) xp++;
1825           else if (xi[xp] > yi[yp]) yp++;
1826           else {
1827               intersection_size++;
1828               xp++;
1829               yp++;
1830           }
1831           if ( ( xp == xn ) || ( yp == yn) )  break;
1832       }
1833   }
1834   if( xn < 128 ) {
1835 
1836     std::cout<< " xi = [";
1837     for (unsigned int i = 0 ; i < xn; ++i) {
1838         std::cout<< xi[i] << ",";
1839     }
1840     std::cout<< " ]" <<std::endl;
1841 
1842   }
1843   std::cout << " Launching sparseDot CUDA kernel xn = "<<xn<<" yn="<<yn<<std::endl;
1844   spdotFactory<T1,T2,T3> spdF;
1845   spdF.jitGridBlockLaunch( grid, block, xn, xi, x, yn, yi, y, output, SEMI_RING );
1846 
1847   cudaDeviceSynchronize ( ) ;
1848 
1849   T3 sum;
1850   if (SEMI_RING == "PLUS_TIMES")
1851   {
1852       myOpPTR<T3> = myOP_plus<T3>;
1853       sum = (T3)0;
1854   }
1855   if (SEMI_RING == "MIN_PLUS")
1856   {
1857       sum = std::numeric_limits<T3>::max();
1858       myOpPTR<T3> = myOP_min<T3>;
1859   }
1860 
1861   for (int i =0; i< nblock; ++i) sum = (*myOpPTR<T3>)(sum ,output[i]);
1862 
1863   bool result = false;
1864   T3 expect;
1865   if (SEMI_RING == "PLUS_TIMES") {
1866      T3 temp;
1867      expect = intersection_size;
1868      temp = (sum - expect);
1869      if (temp < 0) temp = -temp ;
1870      result = (temp < (T3)1) ; //adjust formula for leading 0
1871   }
1872   else if (SEMI_RING == "MIN_PLUS") {
1873      expect = 2;
1874      result = (sum== expect) ;   //min is 2 from the (1,1) pair
1875   }
1876   else expect = (T3) 0;
1877 
1878   std::cout <<"test_spdotfactoryUM with "<<SEMI_RING<<" semi-ring= "
1879             << sum << " expected "<<intersection_size<< std::endl;
1880   cudaFree(x);
1881   cudaFree(xi);
1882   cudaFree(y);
1883   cudaFree(yi);
1884   cudaFree(output);
1885   return result;
1886 }
1887