// SPDX-License-Identifier: Apache-2.0

#include <cassert>
#include <cmath>
#include <limits>
#include <random>
#include <string>
#include <algorithm>
#include "jitFactory.hpp"
#include "../binary_search.h"
#include "GpuTimer.h"
#include "gtest/gtest.h"

// Operations for computing reference results on the CPU
template<typename T> T myOP_plus  ( T a, T b) { return a + b ;}
template<typename T> T myOP_min   ( T a, T b) { return a < b ? a : b ;}
template<typename T> T myOP_max   ( T a, T b) { return a > b ? a : b ;}
template<typename T> T myOP_first ( T a, T b) { return a ;}
template<typename T> T myOP_second( T a, T b) { return b ;}
template<typename T> T myOP_times ( T a, T b) { return a * b ;}

// C++14 variable templates holding host-side pointers to the operators above
template<typename T> T (*myOpPTR)(T a, T b);
template<typename T> T (*ADD_ptr)(T a, T b);
template<typename T> T (*MUL_ptr)(T a, T b);

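// Minimal usage sketch for the operator-pointer variable templates: a test
// binds them once per semiring, then applies them through an indirect call.
//
//   ADD_ptr<float> = myOP_plus<float>;
//   MUL_ptr<float> = myOP_times<float>;
//   float s = (*ADD_ptr<float>)( 1.0f, (*MUL_ptr<float>)( 2.0f, 3.0f) ); // s == 7.0f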

// Test generators using jitify
template <typename T>
bool test_reducefactoryUM( unsigned int N, std::string OP) ;

template <typename T1, typename T2, typename T3>
bool test_dndotfactoryUM( unsigned int N, std::string SEMI_RING) ;

template <typename T1, typename T2, typename T3>
bool test_spdotfactoryUM( unsigned int N, unsigned int xn, unsigned int yn, std::string SEMI_RING) ;

// AxB_dot3_phase1 kernels
template <typename T_C, typename T_M, typename T_A, typename T_B>
bool test_AxB_dot3_phase1_factory( int, int64_t, int64_t, int64_t) ;

// AxB_dot3_phase2 kernels
template <typename T_C>
bool test_AxB_dot3_phase2_factory( int, int64_t, int64_t, int64_t, int64_t) ;

template <typename T_C>
bool test_AxB_dot3_phase2end_factory( int, int64_t, int64_t, int64_t) ;

// AxB_dot3_phase3 kernels
template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_dndn_factory( int, int64_t, int64_t, int64_t, std::string&) ;

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vsvs_factory( int, int64_t, int64_t, int64_t, std::string&) ;

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_spdn_factory( int, int64_t, int64_t, int64_t, std::string&) ;

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vssp_factory( int, int64_t, int64_t, int64_t, std::string&) ;

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_mp_factory( int, int64_t, int64_t, int64_t, std::string&) ;

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_warp_factory( int, int64_t, int64_t, int64_t, std::string&) ;


// Fixture to generate valid inputs and hold them for tests
class AxB_dot3_Test : public ::testing::Test
{
  void SetUp() override
  {
  }

  void TearDown() override
  {
  }
};

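// Example usage -- a sketch of how these generators are typically driven from
// a test case. The concrete type list, bucket id, sizes, and semiring chosen
// here are illustrative assumptions, not values fixed by this file.
//
// TEST( AxB_dot3, vsvs_plus_times)
// {
//   std::string semiring = "PLUS_TIMES";
//   test_AxB_dot3_vsvs_factory<int32_t, int32_t, int32_t, int32_t,
//                              int32_t, int32_t, int32_t>
//                             ( 1, 256, 1024, 1024, semiring);
// }
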
// Test generator code, to allow parameterized tests
// Uses jitFactory, dataFactory and GB_jit
template <typename T_C, typename T_M, typename T_A, typename T_B>
bool test_AxB_dot3_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

  int gpuID;
  cudaGetDevice( &gpuID);

  std::cout<< "found device "<<gpuID<<std::endl;

  // note: "p1lF()" would declare a function (most vexing parse), not an object
  phase1launchFactory<T_C, T_M, T_A, T_B> p1lF;

  int testBucket = TB;

  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Annz = N*N;
  int64_t Bnnz = N*N;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Annz, Bnnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;

  int64_t *Bucket = G.getBucket();

  int nthrd = 32;
  int sz = 4;
  //int m = 256/sz;
  int nblck = Cnz;
  std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

  // nanobuckets holds the per-block bucket counters produced by phase1.
  // Sizing it as nblck * 12 buckets * nthrd entries is an assumption made so
  // this test is self-contained; the real pipeline owns this allocation.
  int64_t *nanobuckets;
  CHECK_CUDA( cudaMallocManaged((void**)&nanobuckets,
                                nblck*12*nthrd*sizeof(int64_t)) );

  GpuTimer kernTimer;
  kernTimer.Start();
  p1lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket,
                           C, M, A, B);

  kernTimer.Stop();
  std::cout<<"returned from phase1 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

  cudaFree(nanobuckets);
  return true;
}

template <typename T_C, typename T_M, typename T_A, typename T_B>
bool test_AxB_dot3_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

  int gpuID;
  cudaGetDevice( &gpuID);

  std::cout<< "found device "<<gpuID<<std::endl;

  phase2launchFactory<T_C, T_M, T_A, T_B> p2lF;  // not "p2lF()", which declares a function

  int testBucket = TB;

  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Annz = N*N;
  int64_t Bnnz = N*N;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Annz, Bnnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;

  int64_t *Bucket = G.getBucket();

  int nthrd = 32;
  int sz = 4;
  //int m = 256/sz;
  int nblck = Cnz;
  std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

  // nanobuckets is normally the phase1 output and bucketp the 12+1 bucket
  // offsets; allocating them here is an assumption made so this test is
  // self-contained.
  int64_t *nanobuckets, *bucketp;
  CHECK_CUDA( cudaMallocManaged((void**)&nanobuckets,
                                nblck*12*nthrd*sizeof(int64_t)) );
  CHECK_CUDA( cudaMallocManaged((void**)&bucketp, 13*sizeof(int64_t)) );

  GpuTimer kernTimer;
  kernTimer.Start();
  p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket, bucketp,
                           C);

  kernTimer.Stop();
  std::cout<<"returned from phase2 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

  cudaFree(nanobuckets);
  cudaFree(bucketp);
  return true;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_full_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Generates three randomized matrices, builds buckets and calls a kernel.
  // This is the full version as called in SuiteSparse:GraphBLAS

  phase1launchFactory<T_C, T_M, T_A, T_B> p1lF;   // "p1lF()" would declare a function
  phase2launchFactory<T_C> p2lF;
  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "dndn");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  int gpuID;
  cudaGetDevice( &gpuID);

  std::cout<< "found device "<<gpuID<<std::endl;

  T_Z MONOID_IDENTITY;
  if (SEMI_RING == "PLUS_TIMES") {
    std::cout << "Plus Times (+,*) semiring"<<std::endl;
    MONOID_IDENTITY = 0;
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    std::cout << "Min Plus (min,+) semiring"<<std::endl;
    MONOID_IDENTITY = std::numeric_limits<T_Z>::max();
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    std::cout << "Max Plus (max,+) semiring"<<std::endl;
    MONOID_IDENTITY = std::numeric_limits<T_Z>::min();
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
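  // Worked example of a non-default semiring: in MIN_PLUS, "multiply" is +
  // and "add" is min, so for A(:,i) = [1 2] and B(:,j) = [3 4] the dot
  // product is min(1+3, 2+4) = 4, and the identity is +inf
  // (numeric_limits::max), which annihilates under min.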

  // Generate test data and setup for using a jitify kernel with 'bucket' interface.
  // The testBucket arg tells the generator which bucket we want to exercise.
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G( testBucket);
  int64_t Annz = N*N;
  int64_t Bnnz = N*N;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Annz, Bnnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;

  // Clear the zombie count
  C->zombie_count = 0;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  // Phase 1
  int nthrd = 32;
  int sz = 4;
  //int m = 256/sz;
  int nblck = Cnz;
  std::cout<< nblck<< " blocks of "<<nthrd<<" threads"<<std::endl;

  // nanobuckets and bucketp are the phase1 outputs consumed by phase2; the
  // sizes below (12 buckets) are assumptions made so this test is self-contained.
  int64_t *nanobuckets, *bucketp;
  CHECK_CUDA( cudaMallocManaged((void**)&nanobuckets,
                                nblck*12*nthrd*sizeof(int64_t)) );
  CHECK_CUDA( cudaMallocManaged((void**)&bucketp, 13*sizeof(int64_t)) );

  GpuTimer kernTimer;
  kernTimer.Start();
  p1lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket,
                           C, M, A, B);

  kernTimer.Stop();
  std::cout<<"returned from phase1 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

  // Phase 2 (same grid and block sizes; the timer is reused)
  kernTimer.Start();
  p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, Bucket, bucketp,
                           C);

  kernTimer.Stop();
  std::cout<<"returned from phase2 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;


  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for dense-dense kernels
      int nthrd = 32;
      int sz = 4;
      //int m = 256/sz;
      int nblck = Cnz;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from phase3 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        //std::cout<<"Cx[i] = "<<Cx[i]<<std::endl;
        X_valid[i] = Cx[i];
        Cx[i] = 0;
        i_valid[i] = C->i[i];
      }
      G.loadCj();

      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = M->i[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (C->i [pC] >> 4) ;  // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        // A(:,i): indices in Ai [pA_start..pA_end-1], values in Ax [same range]
        int64_t pA_start = A->p [i] ;
        int64_t pA_end   = A->p [i+1] ;

        // B(:,j): indices in Bi [pB_start..pB_end-1], values in Bx [same range]
        int64_t pB_start = B->p [j] ;
        int64_t pB_end   = B->p [j+1] ;

        k = pA_start;
        int64_t l = pB_start;
        T_Z cij = MONOID_IDENTITY;
        // dense-dense case: A(:,i) and B(:,j) share the same full pattern,
        // so the two position counters advance in lockstep
        while ( k < pA_end && l < pB_end) {
          cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l]) ) ;
          k++;
          l++;
        }

        // output for this dot product: an identity result is treated as
        // "no entry", i.e. a zombie
        if (cij == MONOID_IDENTITY) {
          C->i [pC] = -1; //GB_FLIP (i)
          C->zombie_count++;
        }
        else {
          Cx [pC] = (T_C)cij;
          C->i [pC] = i;
        }
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          int64_t i = C->i[l];
          //std::cout<<i<<","<<j<<","<<l<<" Cx = "<<Cx[l]<<" x_val="<<X_valid[l]<<std::endl;
          if (i >= 0)
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  cudaFree(nanobuckets);
  cudaFree(bucketp);
  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_dndn_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is dense here, so Anz = Bnz = N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "dndn");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  int gpuID;
  cudaGetDevice( &gpuID);

  std::cout<< "found device "<<gpuID<<std::endl;

  T_Z MONOID_IDENTITY;
  if (SEMI_RING == "PLUS_TIMES") {
    std::cout << "Plus Times (+,*) semiring"<<std::endl;
    MONOID_IDENTITY = 0;
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    std::cout << "Min Plus (min,+) semiring"<<std::endl;
    MONOID_IDENTITY = std::numeric_limits<T_Z>::max();
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    std::cout << "Max Plus (max,+) semiring"<<std::endl;
    MONOID_IDENTITY = std::numeric_limits<T_Z>::min();
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Annz = N*N;
  int64_t Bnnz = N*N;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Annz, Bnnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;

  // Clear the zombie count
  C->zombie_count = 0;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for dense-dense kernels
      int nthrd = 32;
      int sz = 4;
      //int m = 256/sz;
      int nblck = Cnz;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        //std::cout<<"Cx[i] = "<<Cx[i]<<std::endl;
        X_valid[i] = Cx[i];
        Cx[i] = 0;
        i_valid[i] = C->i[i];
      }
      G.loadCj();

      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = M->i[pC] ;          // row index of C(i,j)

        // get C(i,j)
        int64_t k = (C->i [pC] >> 4) ;  // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        // A(:,i): indices in Ai [pA_start..pA_end-1], values in Ax [same range]
        int64_t pA_start = A->p [i] ;
        int64_t pA_end   = A->p [i+1] ;

        // B(:,j): indices in Bi [pB_start..pB_end-1], values in Bx [same range]
        int64_t pB_start = B->p [j] ;
        int64_t pB_end   = B->p [j+1] ;

        k = pA_start;
        int64_t l = pB_start;
        T_Z cij = MONOID_IDENTITY;
        // dense-dense case: both columns share the full pattern, so the two
        // position counters advance in lockstep
        while ( k < pA_end && l < pB_end) {
          cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l]) ) ;
          k++;
          l++;
        }

        // output for this dot product: an identity result counts as a zombie
        if (cij == MONOID_IDENTITY) {
          C->i [pC] = -1; //GB_FLIP (i)
          C->zombie_count++;
        }
        else {
          Cx [pC] = (T_C)cij;
          C->i [pC] = i;
        }
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          int64_t i = C->i[l];
          if (i >= 0)
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vsvs_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is controlled by Anz and Bnz vs N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "vsvs");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  int gpuID;
  cudaGetDevice( &gpuID);
  std::cout<< "found device "<<gpuID<<std::endl;

  // no monoid identity needed here: the sparse-sparse reference loop below
  // tracks whether any intersection was found instead
  if (SEMI_RING == "PLUS_TIMES") {
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Anz, Bnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;
  int64_t *Ci = C->i;
  int64_t *Mi = M->i;
  int64_t *Ai = A->i;
  int64_t *Bi = B->i;
  int64_t *Ap = A->p;
  int64_t *Bp = B->p;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for v.sparse-v.sparse kernels
      int nthrd = 32;
      int sz = Anz/N;              // average nnz per column of A
      int m = 256/sz;
      int nblck = (Cnz - 1 + m*nthrd)/(m*nthrd) ;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;
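      // Grid sizing note: nblck is ceil( Cnz / (m*nthrd) ), computed with the
      // usual (x + q - 1)/q integer trick. E.g. with Cnz = 1000 and sz = 4:
      // m = 256/4 = 64, each block covers m*nthrd = 2048 dots, and
      // nblck = (1000 - 1 + 2048)/2048 = 1.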

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        X_valid[i] = Cx[i];
        Cx[i] = 0;
        i_valid[i] = Ci[i];
      }
      G.loadCj();
      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;           // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;   // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        // A(:,i): indices in Ai [pA_start..pA_end-1], values in Ax [same range]
        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;

        // B(:,j): indices in Bi [pB_start..pB_end-1], values in Bx [same range]
        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;

        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        // standard two-pointer merge of the two sorted index lists
        while ( k < pA_end && l < pB_end) {
          if      ( Ai[k] < Bi[l]) ++k;
          else if ( Ai[k] > Bi[l]) ++l;
          else {
            if (cij_exists) {
              cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( Ax[k], Bx[l]) );
            }
            else {
              cij_exists = true;
              cij = (*MUL_ptr<T_Z>)( Ax[k], Bx[l]);
            }
            k++;
            l++;
          }
        }

        // output for this dot product
        if (cij_exists) {
          Ci [pC] = i;
          Cx [pC] = (T_C)cij;
        }
        else {
          Ci [pC] = -1; //GB_FLIP (i)
          C->zombie_count++;
        }
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          // live entries have row index >= 0 (zombies are negative); testing
          // "> 0" would wrongly skip valid row 0
          if (Ci[l] >= 0)
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_vssp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is controlled by Anz and Bnz vs N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "vssp");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  int gpuID;
  cudaGetDevice( &gpuID);
  std::cout<< "found device "<<gpuID<<std::endl;

  // no monoid identity needed: the reference loop tracks cij_exists instead
  if (SEMI_RING == "PLUS_TIMES") {
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;

  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Anz, Bnz, Cnzpercent );

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;
  int64_t *Ci = C->i;
  int64_t *Mi = M->i;
  int64_t *Ai = A->i;
  int64_t *Bi = B->i;
  int64_t *Ap = A->p;
  int64_t *Bp = B->p;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;
  int zc = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs == 0) continue;
    std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for v.sparse-sparse kernels
      int nthrd = 32;
      int sz = 4;
      //int m = 256/sz;
      int nblck = (Cnz - 1 + nthrd)/nthrd ;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        X_valid[i] = Cx[i];
        Cx[i] = 0;
        i_valid[i] = C->i[i];
      }
      G.loadCj();

      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;

        int64_t i = Mi[pC] ;           // row index of C(i,j)
        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;   // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        int64_t pA     = Ap[i];
        int64_t pA_end = Ap[i+1];
        int64_t nnzA   = pA_end - pA;

        int64_t pB     = Bp[j];
        int64_t pB_end = Bp[j+1];
        int64_t nnzB   = pB_end - pB;

        // Search for each nonzero in the smaller vector to find the intersection
        bool cij_exists = false;

        T_A aki;
        T_B bkj;
        T_Z cij;
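        // The two branches below implement an asymmetric merge: the sparser
        // column is walked linearly while the denser one is skipped ahead with
        // a trimmed binary search over a shrinking range. E.g. with
        // Ai = [5, 90] against 60 entries of Bi, two searches replace ~60
        // one-step merge advances.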

        if (nnzA <= nnzB) {
          //------------------------------------------------------------------
          // A(:,i) is very sparse compared to B(:,j)
          //------------------------------------------------------------------

          while (pA < pA_end && pB < pB_end)
          {
            int64_t ia = Ai [pA] ;
            int64_t ib = Bi [pB] ;
            if (ia < ib)
            {
              // A(ia,i) appears before B(ib,j)
              pA++ ;
            }
            else if (ib < ia)
            {
              // B(ib,j) appears before A(ia,i)
              // discard all entries B(ib:ia-1,j)
              int64_t pleft  = pB + 1 ;
              int64_t pright = pB_end - 1 ;
              GB_BINARY_TRIM_SEARCH (ia, Bi, pleft, pright) ;
              //ASSERT (pleft > pB) ;
              pB = pleft ;
            }
            else // ia == ib == k
            {
              // A(k,i) and B(k,j) are the next entries to merge
              #if defined ( GB_PHASE_1_OF_2 )
              cij_exists = true ;
              break ;
              #else
              GB_GETA (aki, Ax, pA) ;   /* aki = A(k,i) */
              GB_GETB (bkj, Bx, pB) ;   /* bkj = B(k,j) */
              if (cij_exists)
              {
                /* cij += aki * bkj */
                cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
              }
              else
              {
                /* cij = A(k,i) * B(k,j), and add to the pattern */
                cij_exists = true ;
                cij = (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) ;
              }
              //GB_DOT_TERMINAL (cij) ;  // break if cij == terminal
              pA++ ;
              pB++ ;
              #endif
            }
          }
        }
        else {
          //------------------------------------------------------------------
          // B(:,j) is very sparse compared to A(:,i)
          //------------------------------------------------------------------

          while (pA < pA_end && pB < pB_end)
          {
            int64_t ia = Ai [pA] ;
            int64_t ib = Bi [pB] ;
            if (ia < ib)
            {
              // A(ia,i) appears before B(ib,j)
              // discard all entries A(ia:ib-1,i)
              int64_t pleft  = pA + 1 ;
              int64_t pright = pA_end - 1 ;
              GB_BINARY_TRIM_SEARCH (ib, Ai, pleft, pright) ;
              //ASSERT (pleft > pA) ;
              pA = pleft ;
            }
            else if (ib < ia)
            {
              // B(ib,j) appears before A(ia,i)
              pB++ ;
            }
            else // ia == ib == k
            {
              // A(k,i) and B(k,j) are the next entries to merge
              #if defined ( GB_PHASE_1_OF_2 )
              cij_exists = true ;
              break ;
              #else
              GB_GETA (aki, Ax, pA) ;   /* aki = A(k,i) */
              GB_GETB (bkj, Bx, pB) ;   /* bkj = B(k,j) */
              if (cij_exists)
              {
                /* cij += aki * bkj */
                cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
              }
              else
              {
                /* cij = A(k,i) * B(k,j), and add to the pattern */
                cij_exists = true ;
                cij = (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) ;
              }
              //GB_DOT_TERMINAL (cij) ;  // break if cij == terminal
              pA++ ;
              pB++ ;
              #endif
            }
          }
        }
        // write the result at the bucket-resolved position pC, consistent
        // with the other reference loops in this file
        if (cij_exists) {
          Ci[pC] = i;
          Cx[pC] = (T_C)cij;
        }
        else {
          zc++;
          //printf(" %lld, %lld is zombie %d!\n",i,j,zc);
          Ci[pC] = GB_FLIP( i );
        }
      }
      C->zombie_count = zc;
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          int64_t i = Ci[l];
          if (i >= 0) { // not a zombie: zombie rows are flipped to negative
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
          }
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_spdn_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is controlled by Anz and Bnz vs N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "spdn");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  int gpuID;
  cudaGetDevice( &gpuID);
  std::cout<< "found device "<<gpuID<<std::endl;

  // no monoid identity needed: the sparse factor always contributes a first term
  if (SEMI_RING == "PLUS_TIMES") {
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;

  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  // spdn case means B should be dense -> Bnz = N*N
  G.init(N, Anz, N*N, Cnzpercent );

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;
  int64_t *Ci = C->i;
  int64_t *Mi = M->i;
  int64_t *Ai = A->i;
  int64_t *Bi = B->i;
  int64_t *Ap = A->p;
  int64_t *Bp = B->p;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs == 0) continue;
    std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for v.sparse-dense kernels
      int nthrd = 32;
      int sz = Anz/N;              // average nnz per column of A
      int m = 256/sz;
      int nblck = (Cnz - 1 + m*nthrd)/(m*nthrd) ;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        X_valid[i] = Cx[i];
        Cx[i] = 0;
        i_valid[i] = Ci[i];
      }
      G.loadCj();
      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;           // row index of C(i,j)

        // get C(i,j); computing j mirrors the other reference loops in this
        // file (the original left it commented out and used column i for B)
        int64_t kc = (Ci [pC] >> 4) ;  // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? kc : C->h [kc] ; // Mh has been copied into Ch

        int64_t pA     = Ap[i];
        int64_t pA_end = Ap[i+1];
        int64_t nnzA   = pA_end - pA;
        int64_t pB     = Bp[j];
        int64_t pB_end = Bp[j+1];
        int64_t nnzB   = pB_end - pB;
        T_A aki;
        T_B bkj;
        T_Z cij;

        if ( nnzA == A->vlen) // A is dense: index A(:,i) directly by B's row indices
        {
          int64_t k = Bi [pB] ;              // first row index of B(:,j)
          // cij = A(k,i) * B(k,j)
          GB_GETA (aki, Ax, pA+k) ;          // aki = A(k,i)
          GB_GETB (bkj, Bx, pB  ) ;          // bkj = B(k,j)
          cij = (*MUL_ptr<T_Z>)( aki, bkj) ; // cij = aki * bkj

          for (int64_t p = pB+1 ; p < pB_end ; p++)
          {
            //GB_DOT_TERMINAL (cij) ;        // break if cij == terminal
            int64_t k = Bi [p] ;             // next row index of B(:,j)
            // cij += A(k,i) * B(k,j)
            GB_GETA (aki, Ax, pA+k) ;        // aki = A(k,i)
            GB_GETB (bkj, Bx, p   ) ;        // bkj = B(k,j)
            cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
          }
        }
        else if ( nnzB == B->vlen) // B is dense: index B(:,j) directly by A's row indices
        {
          int64_t k = Ai [pA] ;              // first row index of A(:,i)
          // cij = A(k,i) * B(k,j)
          GB_GETA (aki, Ax, pA  ) ;          // aki = A(k,i)
          GB_GETB (bkj, Bx, pB+k) ;          // bkj = B(k,j)
          cij = (*MUL_ptr<T_Z>)( aki, bkj) ; // cij = aki * bkj

          for (int64_t p = pA+1 ; p < pA_end ; p++)
          {
            //GB_DOT_TERMINAL (cij) ;        // break if cij == terminal
            int64_t k = Ai [p] ;             // next row index of A(:,i)
            // cij += A(k,i) * B(k,j)
            GB_GETA (aki, Ax, p   ) ;        // aki = A(k,i)
            GB_GETB (bkj, Bx, pB+k) ;        // bkj = B(k,j)
            cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)aki, (T_Z)bkj) );
          }
        }

        Ci[pC] = i;
        Cx[pC] = (T_C)cij;
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          int64_t i = Ci[l];
          if (i >= 0)
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_mp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is controlled by Anz and Bnz vs N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "mp");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  //int gpuID;
  //cudaGetDevice( &gpuID);
  //std::cout<< "found device "<<gpuID<<std::endl;

  // no monoid identity needed: the reference loop tracks cij_exists instead
  if (SEMI_RING == "PLUS_TIMES") {
    std::cout << "Plus Times (+,*) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    std::cout << "Min Plus (min,+) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    std::cout << "Max Plus (max,+) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Annz = Anz;
  int64_t Bnnz = Bnz;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Annz, Bnnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;
  int64_t *Ci = C->i;
  int64_t *Mi = M->i;
  int64_t *Ai = A->i;
  int64_t *Bi = B->i;
  int64_t *Ap = A->p;
  int64_t *Bp = B->p;

  // Clear the zombie count
  C->zombie_count = 0;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for the merge-path kernel
      int nthrd = 32;
      int nblck = Cnz;
      int sz = 0;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        //std::cout<<"Cx[i] = "<<Cx[i]<<std::endl;
        X_valid[i] = Cx[i];
        i_valid[i] = C->i[i];
        // clear values for next test
        Cx[i] = 0;
      }
      G.loadCj();

      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;           // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;   // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;

        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;
        // NOTE: this test code is NOT doing merge-path. This is just a
        // single-threaded linear merge for correctness testing.
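        // (For reference: a merge-path scheme splits the merge of
        // Ai[pA_start..pA_end) and Bi[pB_start..pB_end) into equal-work
        // diagonal slices, binary-searching each diagonal for its split point
        // so every thread consumes the same number of merge steps. Whether the
        // "mp" kernel does exactly this is not shown in this file; the serial
        // two-pointer loop below produces the same set of matches either way.)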
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        while ( k < pA_end && l < pB_end) {
          if      ( Ai[k] < Bi[l] ) k += 1;
          else if ( Ai[k] > Bi[l] ) l += 1;
          else {
            if (cij_exists) {
              cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l]) ) ;
            }
            else {
              cij_exists = true;
              cij = (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l] ) ;
            }
            k++;
            l++;
          }
        }

        // output for this dot product
        if (cij_exists) {
          Cx [pC] = (T_C)cij;
          Ci [pC] = i;
        }
        else {
          C->i [pC] = -1; //GB_FLIP (i)
          C->zombie_count++;
        }
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          // zombies are negative; live entries (including row 0) are >= 0
          if (Ci[l] >= 0) {
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
          }
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T_C, typename T_M, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>
bool test_AxB_dot3_warp_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, std::string& SEMI_RING) {
  // Assumes all matrices are square so far, so only the N dimension is given.
  // Sparsity is controlled by Anz and Bnz vs N*N.
  // Generates three randomized matrices, builds buckets and calls a kernel.

  launchFactory<T_C, T_M, T_A, T_B, T_X, T_Z > lF(SEMI_RING, "warp");

  int testBucket = TB;

  //unsigned seed = 13372801;
  //std::mt19937 r; //random number generator Mersenne Twister
  //r.seed(seed);
  //int gpuID;
  //cudaGetDevice( &gpuID);
  //std::cout<< "found device "<<gpuID<<std::endl;

  // no monoid identity needed: the reference loop tracks cij_exists instead
  if (SEMI_RING == "PLUS_TIMES") {
    std::cout << "Plus Times (+,*) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_plus<T_Z>;
    MUL_ptr<T_Z> = myOP_times<T_Z>;
  }
  else if (SEMI_RING == "MIN_PLUS") {
    std::cout << "Min Plus (min,+) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_min<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }
  else if (SEMI_RING == "MAX_PLUS") {
    std::cout << "Max Plus (max,+) semiring"<<std::endl;
    ADD_ptr<T_Z> = myOP_max<T_Z>;
    MUL_ptr<T_Z> = myOP_plus<T_Z>;
  }

  // Generate test data and setup for using a jitify kernel with 'bucket' interface
  SpGEMM_problem_generator<T_C, T_M, T_A, T_B> G;
  int64_t Cnz = N;
  float Cnzpercent = (float) Cnz/(N*N);

  G.init(N, Anz, Bnz, Cnzpercent);

  G.fill_buckets( testBucket); // all elements go to testbucket= TB

  matrix<T_C>* C = G.getCptr();
  matrix<T_M>* M = G.getMptr();
  matrix<T_A>* A = G.getAptr();
  matrix<T_B>* B = G.getBptr();

  T_C *Cx = C->x;
  T_A *Ax = A->x;
  T_B *Bx = B->x;
  int64_t *Ci = C->i;
  int64_t *Mi = M->i;
  int64_t *Ai = A->i;
  int64_t *Bi = B->i;
  int64_t *Ap = A->p;
  int64_t *Bp = B->p;

  // Clear the zombie count
  C->zombie_count = 0;

  //std::cout<<"got all matrices"<<std::endl;
  int64_t *Bucket      = G.getBucket();
  int64_t *BucketStart = G.getBucketStart();

  int zc_valid = 0;

  bool result = false;

  for (int b = 0; b < 12; ++b) {  // loop on buckets

    int64_t b_start = BucketStart [b] ;
    int64_t b_end   = BucketStart [b+1] ;
    int64_t nvecs   = b_end - b_start ;
    if (nvecs > 0) std::cout<< "bucket "<<b<<" has "<<nvecs<<" dots to do"<<std::endl;

    T_C *X_valid     = (T_C*) malloc( Cnz*sizeof(T_C));
    int64_t *i_valid = (int64_t*)malloc( Cnz*sizeof(int64_t));
    if (b == TB) { // test cases for the warp-per-dot kernel
      int nthrd = 32;
      int nblck = (Cnz + nthrd - 1)/nthrd ;
      int sz = 0;
      std::cout<< nblck<< " blocks of "<<nthrd<<" threads, "<<b_start<<","<<b_end<<std::endl;

      GpuTimer kernTimer;
      kernTimer.Start();
      lF.jitGridBlockLaunch( nblck, nthrd, b_start, b_end, Bucket,
                             C, M, A, B, sz);

      kernTimer.Stop();
      std::cout<<"returned from kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

      zc_valid = C->zombie_count;
      C->zombie_count = 0;
      for (int i = 0 ; i < Cnz; ++i) {
        X_valid[i] = Cx[i];
        i_valid[i] = C->i[i];
        // clear values for next test
        Cx[i] = 0;
      }
      G.loadCj();

      for (int64_t pair = b_start ; pair < b_end ; pair++) {

        // get the kth entry in bucket b
        int64_t pC = (Bucket == nullptr) ? pair : Bucket [pair] ;
        int64_t i = Mi[pC] ;           // row index of C(i,j)

        // get C(i,j)
        int64_t k = (Ci [pC] >> 4) ;   // col index of C(i,j)
        //ASSERT ((C->i [pC] & 4) == b) ;
        int64_t j = (C->h == nullptr) ? k : C->h [k] ; // Mh has been copied into Ch

        int64_t pA_start = Ap [i] ;
        int64_t pA_end   = Ap [i+1] ;

        int64_t pB_start = Bp [j] ;
        int64_t pB_end   = Bp [j+1] ;
        // single-threaded linear merge for correctness testing (the GPU kernel
        // cooperates across a warp, but the set of matches is the same)
        k = pA_start;
        int64_t l = pB_start;
        T_Z cij ;
        bool cij_exists = false;
        while ( k < pA_end && l < pB_end) {
          if      ( Ai[k] < Bi[l] ) k += 1;
          else if ( Ai[k] > Bi[l] ) l += 1;
          else {
            if (cij_exists) {
              cij = (*ADD_ptr<T_Z>)( cij, (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l]) ) ;
            }
            else {
              cij_exists = true;
              cij = (*MUL_ptr<T_Z>)( (T_Z)Ax[k], (T_Z)Bx[l] ) ;
            }
            k++;
            l++;
          }
        }

        // output for this dot product
        if (cij_exists) {
          Cx [pC] = (T_C)cij;
          Ci [pC] = i;
        }
        else {
          C->i [pC] = -1; //GB_FLIP (i)
          C->zombie_count++;
        }
      }
      T_C err = 0;
      for (int j = 0 ; j < N; ++j) {
        for (int l = C->p[j]; l < C->p[j+1]; ++l) {
          // zombies are negative; live entries (including row 0) are >= 0
          if (Ci[l] >= 0) {
            err += ( X_valid[l] - Cx[l])*(X_valid[l] - Cx[l]);
          }
        }
      }
      std::cout<< " square of 2-norm of err ="<< err<<std::endl;
      std::cout<< " zombie count CPU = "<<C->get_zombie_count()<<" zGPU ="<<zc_valid<<std::endl;

      EXPECT_EQ( err, 0);
      EXPECT_EQ( zc_valid, C->get_zombie_count());

      free(X_valid);
      free(i_valid);
    }
  }

  G.del();

  return result;
}

template <typename T>
bool test_reducefactoryUM( unsigned int N, std::string OP) {

  reduceFactory<T> rF;

  int block(32);
  // each thread is assumed to reduce 8 inputs, so a block covers 8*block of them
  int nblock = (N + 8*block - 1)/(8*block);
  int grid(nblock);
  T* d_data;
  T* output;

  // d_data holds all N inputs; output holds one partial result per block.
  // (The original allocated only nblock entries for d_data, which would
  // overflow when filling N values.)
  CHECK_CUDA( cudaMallocManaged((void**) &d_data, N*sizeof(T)) );
  CHECK_CUDA( cudaMallocManaged((void**) &output, nblock*sizeof(T)) );

  fillvector_linear<T> ( N, d_data);

  // with inputs 0..N-1 the PLUS reduction yields the triangular sum N*(N-1)/2
  //for (unsigned int i =0; i < N; ++i) d_data[i] = i;

  T sum;
  std::cout << "Launching reduce"<<OP<<GET_TYPE_NAME(sum)<<" kernel..."<<std::endl;
  rF.jitGridBlockLaunch( grid, block, d_data, output, N, OP );

  for (int i = 0; i < nblock; ++i) std::cout<< output[i] <<" ";

  if (OP == "PLUS") {
    sum = (T) 0;
    myOpPTR<T> = myOP_plus<T>;
  }
  if (OP == "MIN") {
    sum = (T)std::numeric_limits<T>::max();
    myOpPTR<T> = myOP_min<T>;
  }
  if (OP == "MAX") {
    sum = (T)std::numeric_limits<T>::min();
    myOpPTR<T> = myOP_max<T>;
  }

  // fold the per-block partial results on the host
  for (int i = 0; i < nblock; ++i) sum = (*myOpPTR<T>)(sum, output[i]);

  T expect;
  bool result = false;
  if (OP == "PLUS") {
    expect = (T)(N*(N-1)/2);
    T temp = (sum - expect) ;
    if (temp < 0) temp = -temp ;
    // allow an absolute error of 1 for floating-point rounding
    EXPECT_LE(temp, 1);
  }
  else if (OP == "MAX") {
    expect = (T)(N-1);
    EXPECT_EQ( sum, (T)(N-1) );  // max is N-1
  }
  else if (OP == "MIN") {
    expect = (T)0;
    EXPECT_EQ( sum, (T)(0) );    // min is 0
  }
  else expect = (T) 0;
  std::cout <<std::endl<<"result of test_reducefactoryUM with "<< OP<< " operation ="<< sum
            <<" expected "<<expect << std::endl;

  cudaFree(d_data);
  cudaFree(output);
  return result;
}

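// Example invocation (illustrative values, not fixed by this file): reducing
// the inputs 0..4095 with PLUS should give 4096*4095/2 = 8386560.
//
//   test_reducefactoryUM<int64_t>( 4096, "PLUS");
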
template <typename T1, typename T2, typename T3>
bool test_dndotfactoryUM( unsigned int N, std::string SEMI_RING) {

  dotFactory<T1,T2,T3> dF;

  int block(512);
  // each thread is assumed to process 8 elements, so a block covers 8*block
  int nblock = (N + 8*block - 1)/(8*block);
  int grid(nblock);
  T1* x;
  T2* y;
  T3* output;
  CHECK_CUDA( cudaMallocManaged((void**)&x, N*sizeof(T1)) );
  CHECK_CUDA( cudaMallocManaged((void**)&y, N*sizeof(T2)) );
  CHECK_CUDA( cudaMallocManaged((void**)&output, nblock*sizeof(T3)) );

  // x = 0..N-1 and y = all ones, so the PLUS_TIMES dot product is the
  // triangular sum N*(N-1)/2
  fillvector_linear<T1> (N, x);
  fillvector_constant<T2> (N, y, T2(1));

  dF.jitGridBlockLaunch( grid, block, x, y, output, N, SEMI_RING );

  T3 sum;
  if (SEMI_RING == "PLUS_TIMES")
  {
    myOpPTR<T3> = myOP_plus<T3>;
    sum = (T3)0;
  }
  if (SEMI_RING == "MIN_PLUS")
  {
    sum = std::numeric_limits<T3>::max();
    myOpPTR<T3> = myOP_min<T3>;
  }

  // fold the per-block partial results on the host
  for (int i = 0; i < nblock; ++i) sum = (*myOpPTR<T3>)(sum, output[i]);

  bool result = false;
  T3 expect;
  if (SEMI_RING == "PLUS_TIMES") {
    expect = (T3)(N*(N-1)/2);
    T3 temp = (sum - expect) ;
    if (temp < 0) temp = -temp ;
    // allow an absolute error of 1 for floating-point rounding
    EXPECT_LE( temp, (T3)1 );
  }
  else if (SEMI_RING == "MIN_PLUS") {
    expect = (T3) 1;
    EXPECT_EQ( sum, expect);  // min is 1, from the (0,1) pair
  }
  else expect = (T3)0;
  std::cout <<"test_dndotfactoryUM with "<<SEMI_RING<<" semi-ring="<< sum
            <<" expected "<<expect << std::endl;

  cudaFree(x);
  cudaFree(y);
  cudaFree(output);
  return result;
}


template <typename T1, typename T2, typename T3>
bool test_spdotfactoryUM( unsigned int N, unsigned int xn, unsigned int yn, std::string SEMI_RING) {

#define INTMIN( A, B) ( ((A) < (B)) ? (A) : (B) )

  // N here is the index space that the sparse vectors are drawn from.
  // Indices in xi and yi are in the range (0,N-1).
  // We will generate a number of random values in this range for test data.
  std::cout<< " xn,yn = "<<xn<<','<<yn<<", min = "<< std::min( xn, yn) <<std::endl;
  int n_threads = std::min( xn, yn) / 4;
  std::cout<< "I think we need "<< n_threads<<" threads to do this."<<std::endl;
  // round the thread count up to a power of two
  int pad_threads = 2;
  while ( pad_threads < n_threads) {
    pad_threads *= 2;
  }
  int block = 32;
  int nblock = ( pad_threads + block - 1)/block;
  int grid(nblock);
  std::cout<<"N="<<N<<" xn ="<<xn<<", yn="<<yn<<" nblock="<<nblock<<" block ="<<block<<std::endl;
  unsigned int *xi;
  unsigned int *yi;
  T1* x;
  T2* y;
  T3* output;
  unsigned int intersection_size = 0; // filled in below; counted explicitly when xn != yn
  unsigned seed = 13372801;
  std::mt19937 r; // Mersenne Twister random number generator
  r.seed(seed);
  CHECK_CUDA( cudaMallocManaged((void**)&x, xn*sizeof(T1)) );
  CHECK_CUDA( cudaMallocManaged((void**)&xi, xn*sizeof(unsigned int)) );
  CHECK_CUDA( cudaMallocManaged((void**)&y, yn*sizeof(T2)) );
  CHECK_CUDA( cudaMallocManaged((void**)&yi, yn*sizeof(unsigned int)) );
  CHECK_CUDA( cudaMallocManaged((void**)&output, nblock*sizeof(T3)) );

  int inv_sparsity = N/std::max(xn,yn); // = index slots per stored value
  std::cout<<" Using inv_sparsity value of "<< inv_sparsity<<std::endl;
  fillvector_constant<T1> (xn, x, T1(1));
  fillvector_constant<T2> (yn, y, T2(1));
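  // Index generation below places entry i at inv_sparsity*i plus a random
  // offset in [0, inv_sparsity), so indices are strictly increasing and the
  // lists come out already sorted. E.g. N = 1000, xn = 100 gives
  // inv_sparsity = 10, and xi[3] is drawn from 30..39.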

  if ( xn == yn) { // test case: all values intersect, generate one random index for both
    intersection_size = xn;
    std::cout << " all-intersect case..."<<std::endl;
    for (unsigned int i = 0; i < xn; ++i) {
      unsigned int rand_i = inv_sparsity*i + r() % inv_sparsity;
      xi[i] = rand_i;
      yi[i] = rand_i;
    }
  }
  else { // generate two different sets of indices, no known intersection pattern
    for (unsigned int i = 0; i < xn; ++i) {
      xi[i] = inv_sparsity*i + r() % inv_sparsity;
    }
    for (unsigned int i = 0; i < yn; ++i) {
      yi[i] = inv_sparsity*i + r() % inv_sparsity;
    }
    // find the intersection size by a merge of the two sorted index lists
    unsigned int xp = 0;
    unsigned int yp = 0;
    while (1) {
      if      (xi[xp] < yi[yp]) xp++;
      else if (xi[xp] > yi[yp]) yp++;
      else {
        intersection_size++;
        xp++;
        yp++;
      }
      if ( ( xp == xn) || ( yp == yn) ) break;
    }
  }
  if ( xn < 128 ) {
    std::cout<< " xi = [";
    for (unsigned int i = 0 ; i < xn; ++i) {
      std::cout<< xi[i] << ",";
    }
    std::cout<< " ]" <<std::endl;
  }
  std::cout << " Launching sparseDot CUDA kernel xn = "<<xn<<" yn="<<yn<<std::endl;
  spdotFactory<T1,T2,T3> spdF;
  spdF.jitGridBlockLaunch( grid, block, xn, xi, x, yn, yi, y, output, SEMI_RING );

  cudaDeviceSynchronize ( ) ;

  T3 sum;
  if (SEMI_RING == "PLUS_TIMES")
  {
    myOpPTR<T3> = myOP_plus<T3>;
    sum = (T3)0;
  }
  if (SEMI_RING == "MIN_PLUS")
  {
    sum = std::numeric_limits<T3>::max();
    myOpPTR<T3> = myOP_min<T3>;
  }

  // fold the per-block partial results on the host
  for (int i = 0; i < nblock; ++i) sum = (*myOpPTR<T3>)(sum, output[i]);

  bool result = false;
  T3 expect;
  if (SEMI_RING == "PLUS_TIMES") {
    // with all values equal to 1, the dot product counts the intersection
    expect = intersection_size;
    T3 temp = (sum - expect);
    if (temp < 0) temp = -temp ;
    result = (temp < (T3)1) ;  // allow an absolute error of 1 for rounding
  }
  else if (SEMI_RING == "MIN_PLUS") {
    expect = 2;
    result = (sum == expect) ; // min is 2, from the (1,1) pair
  }
  else expect = (T3) 0;

  std::cout <<"test_spdotfactoryUM with "<<SEMI_RING<<" semi-ring= "
            << sum << " expected "<<expect<< std::endl;
  cudaFree(x);
  cudaFree(xi);
  cudaFree(y);
  cudaFree(yi);
  cudaFree(output);
  return result;
}