//------------------------------------------------------------------------------
// GB_AxB_dot3_cuda: compute C<M> = A'*B in parallel, on the GPU(s)
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved.
// http://suitesparse.com   See GraphBLAS/Doc/License.txt for license.

//------------------------------------------------------------------------------

// This function only computes C<M>=A'*B on the GPUs.  The mask must be
// present, and not complemented.  The mask is always applied.
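
// Summary of the dot3 method: the pattern of C is a subset of the pattern of
// the mask M.  For each entry M(i,j) present in the mask, the dot product
// C(i,j) = A(:,i)'*B(:,j) is computed; entries whose dot product produces no
// result (A(:,i) and B(:,j) have no entries in common) are left in C as
// zombies, to be pruned later.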

// The output matrix C always has a static header.  The input matrices M, A,
// and B may have static or dynamic headers.

extern "C"
{
    #include "GB_mxm.h"
    #include "GB_dynamic.h"
}
#include "GB_cuda.h"


#include "templates/GB_jit_AxB_dot3_phase1.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase2.cu.jit"
// the phase3 kernels, one per bucket strategy:
#include "templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_mp.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit"
#include "templates/reduceNonZombiesWarp.cu.jit"
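
// Each *.cu.jit file above is the corresponding CUDA kernel source,
// stringified into a char array (templates_GB_jit_AxB_dot3_phase1_cu, etc.)
// so it can be handed to the jit::launcher below and compiled at run time.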

#include "GB_jit_launcher.h"
#include "GB_cuda_stringifier.hpp"
#include "GB_cuda_global.h"

GB_cuda_stringifier *SR_callback_ptr;

const std::vector<std::string> header_names = {} ;

#define GB_FREE_WORK                                                        \
{                                                                           \
    /* free any dynamic headers allocated by GB_to_dynamic */              \
    if (M_input_static) GB_Matrix_free (&M) ;                               \
    if (A_input_static) GB_Matrix_free (&A) ;                               \
    if (B_input_static) GB_Matrix_free (&B) ;                               \
    GB_cuda_free (Nanobuckets) ;  Nanobuckets = NULL ;                      \
    GB_cuda_free (Blockbucket) ;  Blockbucket = NULL ;                      \
    GB_cuda_free (Bucket) ;       Bucket = NULL ;                           \
    GB_cuda_free (Bucketp) ;      Bucketp = NULL ;                          \
    GB_cuda_free (offset) ;       offset = NULL ;                           \
}

#define GB_FREE_ALL                                                         \
{                                                                           \
    GB_FREE_WORK ;                                                          \
    GB_Matrix_free (&C) ;                                                   \
}

GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
(
    GrB_Matrix C_static,            // output matrix, static header
    const GrB_Matrix M_input,       // mask matrix (may have a static header)
    const bool Mask_struct,         // if true, use only the structure of M
    const GrB_Matrix A_input,       // input matrix (may have a static header)
    const GrB_Matrix B_input,       // input matrix (may have a static header)
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;
    ASSERT (C_static != NULL && C_static->static_header) ;

    ASSERT_MATRIX_OK (M_input, "M for dot3 cuda A'*B", GB0) ;
    ASSERT_MATRIX_OK (A_input, "A for dot3 cuda A'*B", GB0) ;
    ASSERT_MATRIX_OK (B_input, "B for dot3 cuda A'*B", GB0) ;

    ASSERT (!GB_PENDING (M_input)) ;
    ASSERT (GB_JUMBLED_OK (M_input)) ;
    ASSERT (!GB_ZOMBIES (M_input)) ;

    ASSERT (!GB_PENDING (A_input)) ;
    ASSERT (!GB_JUMBLED (A_input)) ;
    ASSERT (!GB_ZOMBIES (A_input)) ;

    ASSERT (!GB_PENDING (B_input)) ;
    ASSERT (!GB_ZOMBIES (B_input)) ;
    ASSERT (!GB_JUMBLED (B_input)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;

    ASSERT (A_input->vlen == B_input->vlen) ;
    GBURBLE ("(GPU dot3) ") ;

    //--------------------------------------------------------------------------
    // initializations
    //--------------------------------------------------------------------------

    GrB_Matrix C = NULL, M = NULL, A = NULL, B = NULL ;
    int ntasks = 0, number_of_sms = 0 ;
    int64_t *Nanobuckets = NULL, *Blockbucket = NULL ;
    int64_t *Bucket = NULL ;
    int64_t *Bucketp = NULL ;
    int64_t *offset = NULL ;
    bool M_input_static = false ;
    bool A_input_static = false ;
    bool B_input_static = false ;

    // just in case M is jumbled and we don't handle it yet (TODO)
    GB_MATRIX_WAIT (M_input) ;
    ASSERT (!GB_JUMBLED (M_input)) ;

    // note that this appears after the wait on M_input:
    GB_OK (GB_to_dynamic (&M, &M_input_static, M_input, Context)) ;
    GB_OK (GB_to_dynamic (&A, &A_input_static, A_input, Context)) ;
    GB_OK (GB_to_dynamic (&B, &B_input_static, B_input, Context)) ;
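
    // GB_to_dynamic makes a shallow copy with a freshly allocated (dynamic)
    // header when the input matrix has a static header; M_input_static (etc.)
    // records whether this happened so GB_FREE_WORK can free the copy.  The
    // dynamic header is presumably needed so the header itself resides in
    // memory the GPU kernels can dereference.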

    int device = -1;

    cudaSetDevice( 0 ) ;

    cudaGetDevice(&device);

    //--------------------------------------------------------------------------
    // get M
    //--------------------------------------------------------------------------

    const int64_t *restrict Mp = M->p ;
    const int64_t *restrict Mh = M->h ;
    // const int64_t *restrict Mi = M->i ;
    // const GB_void *restrict Mx = M->x ;
    // const size_t msize = M->type->size ;
    const int64_t mvlen = M->vlen ;
    const int64_t mvdim = M->vdim ;
    const int64_t mnz = GB_NNZ (M) ;
    const int64_t mnvec = M->nvec ;
    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;

    const int64_t anz = GB_NNZ (A) ;
    const int64_t anvec = A->nvec ;

    const int64_t bnz = GB_NNZ (B) ;
    const int64_t bnvec = B->nvec ;

    //--------------------------------------------------------------------------
    // allocate C, the same size and # of entries as M
    //--------------------------------------------------------------------------

    // FUTURE: ctype need not be the op->ztype
    GrB_Type ctype = semiring->add->op->ztype ;
    int64_t cvlen = mvlen ;
    int64_t cvdim = mvdim ;
    int64_t cnz = mnz ;
    int64_t cnvec = mnvec ;

    // TODO tell GB_CREATE where to put the data: CPU or GPU (via
    // cudaMemAdvise), but this works as-is.
    int sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
    info = GB_new_bix (&C, false,   // sparse or hyper (from M), dynamic header
        ctype, cvlen, cvdim, GB_Ap_malloc, true,
        sparsity, false, M->hyper_switch, cnvec,
        cnz+1,  // add one to cnz for GB_cumsum of Cwork
        true, Context) ;

    if (info != GrB_SUCCESS)
    {
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    //int64_t *Citemp = C->i ;
    //auto *Cxtemp = C->x ;
    //cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) );
    //cudaMalloc ((void**) &(C->x), cnz * C->type->size );
    cudaMemAdvise( C->i, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise( C->x, cnz * C->type->size , cudaMemAdviseSetPreferredLocation, device);

    int64_t *restrict Cp = M->p ;
    int64_t *restrict Ch = M->h ;
    // int64_t *restrict Ci = C->i ;
    // use C->i as workspace

    //--------------------------------------------------------------------------
    // copy Mp and Mh into C
    //--------------------------------------------------------------------------

    //cudaMemcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), cudaMemcpyDefault) ;
    if (M_is_hyper)
    {
        //cudaMemcpy (Ch, Mh, cnvec * sizeof (int64_t), cudaMemcpyDefault) ;
    }
    C->magic = GB_MAGIC ;
    C->nvec_nonempty = M->nvec_nonempty ;
    C->nvec = M->nvec ;
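
    // C is created with the same dimensions, # of vectors, and # of entries
    // as the mask M, and it inherits M's nvec and nvec_nonempty counts above.
    // Its pattern is meant to mirror M's pattern (note that the explicit
    // copies of Mp/Mh into Cp/Ch are currently commented out above); entries
    // that turn out to have no dot-product result become zombies in phase3.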

    GBURBLE ("(GPU C created and copied from M) ") ;
    //--------------------------------------------------------------------------
    // stringify the semiring and the mask
    //--------------------------------------------------------------------------

    char semiring_name [GB_CUDA_STRLEN+2] ;
    char semiring_code [GB_CUDA_STRLEN+2] ;
    char mask_name [GB_CUDA_STRLEN+2] ;

    GB_cuda_stringifier mysemiring = GB_cuda_stringifier();
    uint64_t sr_code;

    mysemiring.enumify_semiring ( &sr_code, semiring, flipxy,
        ctype, A->type, B->type, M->type, Mask_struct,  // matrix types
        false, C->sparsity, M->sparsity, A->sparsity, B->sparsity) ;

    const char *header_name = "mySemiRing.h" ;
    mysemiring.load_string(header_name, semiring_code ) ;

    SR_callback_ptr = &mysemiring;
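
    // The stringifier encodes the semiring, types, and mask into sr_code and
    // a matching code string; load_string registers that string under the
    // name "mySemiRing.h".  SR_callback_ptr is the global that the
    // callback_wrapper passed to each jit::launcher below presumably uses to
    // hand this generated header to jitify when the kernel source includes it.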

    GBURBLE ("(GPU stringified) ") ;
    //--------------------------------------------------------------------------
    // construct the tasks for phase1 and phase2
    //--------------------------------------------------------------------------

    // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
    // on the GPU:

    // # of threads in phase1 and phase2 kernel launches must be the same
    #define chunksize 128
    #define SYMBOLIC_PHASE_NTHREADS 32
    #define NBUCKETS (GB_BUCKET_MERGEPATH + 1)

    number_of_sms = GB_Global_gpu_sm_get (0) ;
    // C and M have cnz entries, so create ...
    //ntasks = ( (mnvec +7)/8 + SYMBOLIC_PHASE_NTHREADS -1 )/SYMBOLIC_PHASE_NTHREADS;
    ntasks = ( mnz + chunksize - 1) / chunksize ;
    // Idea is to have each task work on a contiguous block of columns of C
    ntasks = GB_IMIN( ntasks, 128*number_of_sms) ;    // ntasks will be grid.x
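
    // Task count: one task per chunk of ~128 mask entries, capped at 128
    // thread blocks per streaming multiprocessor.  ntasks becomes grid.x for
    // the phase1 and phase2end launches, and each task owns a contiguous
    // range of the entries of C (and M).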

    GBURBLE ("(GPU mnz=%ld mnvec=%ld blockDim=32, nblock= %d) ", mnz, mnvec, ntasks ) ;

    std::cout << "ntasks, nthreads = " << ntasks << "," << SYMBOLIC_PHASE_NTHREADS << std::endl;
    //--------------------------------------------------------------------------
    // phase1 and phase2: place each C(i,j) in a bucket
    //--------------------------------------------------------------------------

    cudaMalloc ((void**) &Nanobuckets,
        NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks * sizeof (int64_t)) ;

    //Nanobuckets = (int64_t*)GB_cuda_malloc (
    //    NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks * sizeof (int64_t)) ;
    //cudaMemAdvise( Nanobuckets, NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks
    //    * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    cudaMalloc ((void**) &Blockbucket,
        NBUCKETS * ntasks * sizeof (int64_t) ) ;
    //Blockbucket = (int64_t*)GB_cuda_malloc ( NBUCKETS * ntasks* sizeof (int64_t) ) ;
    //cudaMemAdvise( Blockbucket, NBUCKETS * ntasks
    //    * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    cudaMalloc ((void**) &Bucket, cnz*sizeof(int64_t));
    //Bucket = (int64_t*)GB_cuda_malloc ( cnz*sizeof(int64_t) );
    //cudaMemAdvise( Bucket, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    //cudaMallocManaged ((void**) &Bucketp, (NBUCKETS+1)*sizeof(int64_t)) ;
    Bucketp = (int64_t*)GB_cuda_malloc( (NBUCKETS+1)*sizeof(int64_t) ) ;
    cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device);

    //cudaMallocManaged ((void**) &offset, (NBUCKETS)*sizeof(int64_t)) ;
    offset = (int64_t*)GB_cuda_malloc( (NBUCKETS)*sizeof(int64_t) ) ;
    cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device);

    memset( offset, 0, NBUCKETS * sizeof(int64_t) );
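
    // Placement strategy: Nanobuckets, Blockbucket, and Bucket live on the
    // device (plain cudaMalloc), since only the kernels touch them.  Bucketp
    // and offset come from GB_cuda_malloc (which appears to return managed
    // memory here) and are advised to prefer the CPU while remaining
    // accessible from the device, because the host reads offset and fills
    // Bucketp between the phase2 and phase2end kernels below.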

    /*
    if (Blockbucket == NULL || Nanobuckets == NULL || Bucket == NULL || Bucketp == NULL )
    {
        // out of memory
        GB_FREE_ALL ;
        return (GB_OUT_OF_MEMORY) ;
    }
    */


    //--------------------------------------------------------------------------
    // Pre-fetch arrays that will be used on the device
    //--------------------------------------------------------------------------


    //cudaMemPrefetchAsync( Nanobuckets, NBUCKETS * SYMBOLIC_PHASE_NTHREADS
    //    * ntasks * sizeof (int64_t), device, NULL) ;

    //cudaMemPrefetchAsync( Blockbucket, NBUCKETS * ntasks
    //    * sizeof (int64_t), device, NULL) ;

    //cudaMemPrefetchAsync( Bucket, cnz * sizeof (int64_t), device, NULL) ;


    /*
    //cudaStream_t stream_data;
    //cudaStreamCreate ( &stream_data);
    */
    /*
    cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetPreferredLocation, device) ;
    cudaMemAdvise( M->i, mnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetPreferredLocation, device) ;

    cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( M->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetReadMostly, device) ;
    */

    cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL ) ; //stream_data) ;
    /*
    cudaMemAdvise( C->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( C->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( C->x, mnz * C->type->size, cudaMemAdviseSetReadMostly, device) ;
    */
    //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( C->i, mnz * sizeof (int64_t), device, NULL ); //stream_data) ;
    cudaMemPrefetchAsync( C->x, mnz * C->type->size, device, NULL ); //stream_data) ;

    /*
    cudaMemAdvise( A->p, (anvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( A->i, anz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( A->x, anz * A->type->size, cudaMemAdviseSetReadMostly, device) ;
    */
    cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL); // stream_data) ;
    cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL ) ; //stream_data) ;

    /*
    cudaMemAdvise( B->p, (bnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( B->i, bnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( B->x, bnz * B->type->size, cudaMemAdviseSetReadMostly, device) ;
    */
    cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL ) ; //stream_data) ;


    // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears as the
    // pth entry in C.


    //cudaStream_t stream_AxB;
    //cudaStreamCreate ( &stream_AxB);
    //----------------------------------------------------------------------
    // phase1: assign each C(i,j) to a bucket, and count them
    //----------------------------------------------------------------------
    dim3 grid( ntasks) ;
    dim3 p2grid( (ntasks + SYMBOLIC_PHASE_NTHREADS - 1)
                 / (SYMBOLIC_PHASE_NTHREADS) ) ;
    dim3 block( SYMBOLIC_PHASE_NTHREADS ) ;
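
    // Launch geometry: phase1 and phase2end use one thread block of
    // SYMBOLIC_PHASE_NTHREADS (32) threads per task (grid.x = ntasks);
    // phase2 reduces the per-block counts and is launched with only
    // ceil(ntasks/32) blocks (p2grid) of the same block size.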

    std::string base_name = "GB_jit_AxB_dot3_";
    std::string Opname = "phase1_" ;

    jitify::experimental::KernelLauncher phase1Kernel =
        jit::launcher( base_name + Opname + mask_name,
                       templates_GB_jit_AxB_dot3_phase1_cu,
                       header_names,
                       compiler_flags,
                       callback_wrapper) //,
                       //stream_AxB)
                  .set_kernel_inst("GB_AxB_cuda_dot3_phase1",
                                   {M->type->name})
                  .configure(grid, block);

    //----------------------------------------------------------------------
    // phase2: cumsum across the blockbuckets, propagate to thread level
    //----------------------------------------------------------------------
    base_name = "GB_jit_AxB_dot3_";
    Opname = "phase2";
    jitify::experimental::KernelLauncher phase2Kernel =
        jit::launcher( base_name + Opname,
                       templates_GB_jit_AxB_dot3_phase2_cu,
                       header_names,
                       compiler_flags,
                       callback_wrapper) //,
                       //stream_AxB)
                  .set_kernel_inst("GB_AxB_dot3_phase2",
                                   {})
                  .configure(p2grid, block);

    base_name = "GB_jit_AxB_dot3_";
    Opname = "phase2";
    jitify::experimental::KernelLauncher phase2endKernel =
        jit::launcher( base_name + Opname,
                       templates_GB_jit_AxB_dot3_phase2_cu,
                       header_names,
                       compiler_flags,
                       callback_wrapper) //,
                       //stream_AxB)
                  .set_kernel_inst("GB_AxB_dot3_phase2end",
                                   {})
                  .configure(grid, block);


    phase1Kernel.launch(
                        Nanobuckets,    // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,    // bucket counts, of size NBUCKETS-by-gridDim.x
                        // input/output:
                        C,              // final output matrix
                        // inputs, not modified:
                        M,              // mask matrix
                        A,              // input matrix
                        B               // input matrix
                        );


    // cudaDeviceSynchronize();


    GBURBLE ("(GPU phase1 done) ") ;
    //for (int i = 0; i< cnz; i++){
    //    printf("C[%d] = %ld\n", i , Ci[i]);
    //}
    //----------------------------------------------------------------------
    // phase2: cumsum across the blockbuckets, propagate to thread level
    //----------------------------------------------------------------------
    int nblock = ntasks;

    phase2Kernel.launch(                // input
                        Nanobuckets,    // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,    // bucket counts, of size NBUCKETS-by-gridDim.x
                        // input/output:
                        Bucketp,        // global bucket cumsum, of size NBUCKETS+1
                        Bucket,         // global buckets, of size cnz (== mnz)
                        offset,
                        C,              // final output matrix
                        // inputs, not modified:
                        cnz,            // number of entries in mask and output matrix
                        nblock
                        );

    // the host reads offset [ ] below, so wait for phase2 to finish
    cudaDeviceSynchronize();
    //cudaMemPrefetchAsync( offset, (NBUCKETS) * sizeof (int64_t), cudaCpuDeviceId, NULL) ;

    // exclusive cumulative sum of the per-bucket counts in offset [0..NBUCKETS-1],
    // giving the start of each bucket in Bucketp [0..NBUCKETS]
    int64_t s = 0;
    for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
    {
        Bucketp[bucket] = s;
        if (bucket < NBUCKETS) s += offset[bucket];  // avoid reading past the end of offset
        //printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]);
    }
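
    // Illustrative example (hypothetical counts): if NBUCKETS were 3 and
    // phase2 left offset = {4, 2, 5}, the loop above produces
    // Bucketp = {0, 4, 6, 11}, so bucket k occupies
    // Bucket [Bucketp[k] .. Bucketp[k+1]-1].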

    GBURBLE ("(GPU phase2 done) ") ;

    phase2endKernel.launch(             // input
                        Nanobuckets,    // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,    // bucket counts, of size NBUCKETS-by-gridDim.x
                        // input/output:
                        Bucketp,        // global bucket cumsum, of size NBUCKETS+1
                        Bucket,         // global buckets, of size cnz (== mnz)
                        offset,
                        C,              // final output matrix
                        // inputs, not modified:
                        cnz             // number of entries in mask and output matrix
                        );

    cudaDeviceSynchronize();

    GBURBLE ("(GPU phase2end done) ") ;
    /*
    for (int i = 0; i< cnz; i++){
        printf("C[%d],Bucket = %ld,%ld\n", i , Ci[i], Bucket[i]);
    }
    */

    //----------------------------------------------------------------------
    // phase3: do the numerical work
    //----------------------------------------------------------------------

    base_name = "GB_jit_";
    std::string kernel_name = "AxB_dot3_phase3_";
    C->nzombies = Bucketp[1];  //set pre-zombie counts
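
    // Bucket 0 (GB_BUCKET_ZOMBIE) holds the entries that phase1 already
    // determined produce no result; they are counted as zombies here, and the
    // loop below starts at bucket 1, so no phase3 kernel is launched for them.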

    for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
    {
        std::string Opname = "";
        int sz = 0 ;

        const char* jit_template;

        int64_t start = Bucketp[bucket];
        int64_t end   = Bucketp[bucket+1];

        //if( (end-start>0) && (start == Bucketp[1]) ) start = Bucketp[0]; //add in zombie slots

        int64_t Cnz = end - start;

        int gridsz, blocksz;

        // nothing to do for this bucket; move on to the next one
        if ( Cnz == 0 ) continue;

        GBURBLE ("\n\n(GPU phase3 bucket,bucketsize= %d,%ld) ", bucket, Cnz) ;

        switch (bucket)
        {

            //--------------------------------------------------------------
            // not a bucket ... bring out your dead:
            //--------------------------------------------------------------

            case GB_BUCKET_ZOMBIE :     // C(i,j) is a zombie (not a bucket)
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: dndn, handles a single bucket:
            //--------------------------------------------------------------

            // both A(:,i) and B(:,j) are dense
            case GB_BUCKET_DNDN :
                Opname = "dndn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_dndn_cu;
                blocksz = 32;
                gridsz = ( Cnz - 1 + blocksz) / blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: spdn, handles 4 buckets:
            //--------------------------------------------------------------

            // A(:,i) is dense and B(:,j) is very sparse (< 256 entries)
            case GB_BUCKET_DNVS :
            // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense
            case GB_BUCKET_VSDN :
                sz = 64 ;
                Opname = "spdn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu;
                blocksz = 32;
                gridsz = ( Cnz - 1 + blocksz) / blocksz;
                break ;

            // A(:,i) is dense and B(:,j) is sparse (>= 256 entries)
            case GB_BUCKET_DNSP :
            // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense
            case GB_BUCKET_SPDN :
                sz = 256 ;
                Opname = "spdn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu;
                blocksz = 32;
                gridsz = ( Cnz - 1 + blocksz) / blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: vssp, handles 1 bucket, uses binary search:
            //--------------------------------------------------------------

            // A(:,i) is very sparse compared to B(:,j), or vice versa
            case GB_BUCKET_VSSP :
                Opname = "vssp" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_vssp_cu;
                blocksz = 32;
                gridsz = ( Cnz - 1 + blocksz) / blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: vsvs, handles 4 buckets:
            //--------------------------------------------------------------

            // let len = nnz (A (:,i)) + nnz (B (:,j)) ; then:

            // the fall-through here is intentional: sz accumulates so that
            // each bucket ends up with its upper bound on len
            case GB_BUCKET_VSVS_256 : sz += 256-64 ;
            case GB_BUCKET_VSVS_64 :  sz += 64-16 ;
            case GB_BUCKET_VSVS_16 :  sz += 16-4 ;
            case GB_BUCKET_VSVS_4 :   sz += 4 ;
                Opname = "vsvs" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_vsvs_cu;
                blocksz = 1024;
                gridsz = GB_IMIN( 1024*number_of_sms, ( Cnz + blocksz - 1 ) / blocksz);
                // note: the next assignment overrides the GB_IMIN cap above
                gridsz = ( Cnz + blocksz - 1 ) / blocksz;
                /*
                Opname = "warpix" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu;
                blocksz = 32;
                gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms);
                */
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: mp, use the merge-path method:
            //--------------------------------------------------------------

            case GB_BUCKET_MERGEPATH :
                Opname = "mp" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_mp_cu;
                blocksz = 32;
                gridsz = ( Cnz - 1 + blocksz) / blocksz;
                break ;

            case GB_BUCKET_WARP_IX : sz = 32 ;
                Opname = "warpix" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu;
                blocksz = 32;
                gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms);
                break ;

            default:
                break ;
        }

        dim3 grid(gridsz);
        dim3 block(blocksz);

        std::cout << "Kernel name = " << Opname << std::endl;
        GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",
            start, end, gridsz, blocksz) ;
        jit::launcher( base_name + kernel_name + Opname + "_" + semiring_name,
                       jit_template,
                       header_names,
                       compiler_flags,
                       callback_wrapper)
                  .set_kernel_inst(kernel_name + Opname,
                                   { ctype->name,
                                     A->type->name,
                                     B->type->name,
                                     semiring->multiply->xtype->name,
                                     semiring->multiply->ytype->name,
                                     semiring->multiply->ztype->name })
                  .configure(grid, block)  //if commented, use implicit 1D configure in launch
                  .launch(
                           start,       // start of this bucket's range in Bucket
                           end,         // end of this bucket's range in Bucket
                           Bucket,      // global buckets, of size cnz (== mnz)
                           C,           // final output matrix
                           // inputs, not modified:
                           M,           // Mi used for column index
                           A,           // A matrix
                           B,           // B matrix
                           sz           // only used for sparse-sparse cases
                          );

        cudaDeviceSynchronize();
    }
    GBURBLE ("(GPU phase3 done) ") ;

    // transplant C into C_static, and free C
    GB_to_static (C_static, &C, Context) ;
    ASSERT (C == NULL) ;

    std::string reduce_kernel_name = "reduceNonZombiesWarp";
    const char* jit_template;
    #define red_blocksz 1024
    jit_template = templates_reduceNonZombiesWarp_cu;
    int num_reduce_blocks = GB_IMIN( 32*number_of_sms, (cnz + red_blocksz - 1) / red_blocksz ) ;
    dim3 red_grid( num_reduce_blocks ) ;
    dim3 red_block( red_blocksz ) ;

    int32_t *block_sum;
    //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ;
    block_sum = (int32_t*)GB_cuda_malloc( (num_reduce_blocks)*sizeof(int32_t)) ;
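
    // Two-stage reduction: each block of the reduceNonZombiesWarp kernel adds
    // up the non-zombie entries of C (those with C->i >= 0) in its range and
    // writes one partial sum into block_sum; the host then adds the partial
    // sums below.  Note that block_sum is int32_t, so this debugging
    // reduction (used here to report a triangle count) assumes a 32-bit
    // integer result type.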

    GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", num_reduce_blocks, red_blocksz) ;
    jit::launcher( reduce_kernel_name + "_" + semiring_name,
                   jit_template,
                   header_names,
                   compiler_flags,
                   callback_wrapper)
              .set_kernel_inst( reduce_kernel_name , { ctype->name })
              .configure(red_grid, red_block)  //if commented, use implicit 1D configure in launch
              .launch(
                       C_static->i,         // index vector, only sum up values >= 0
                       C_static->x,         // input pointer to vector to reduce, with zombies
                       block_sum,           // block sums on return
                       (unsigned int)cnz    // length of vector to reduce to scalar
                      );

    cudaDeviceSynchronize();

    int32_t num_triangles = 0;
    for (int i = 0; i < num_reduce_blocks; i++)
    {
        //printf("block%d num_triangles = %d\n", i, block_sum[i] );
        num_triangles += block_sum[i] ;
    }
    printf("num_triangles = %d\n", num_triangles );

    GB_cuda_free( block_sum );
    //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; //stream_data ) ;
    //cudaMemPrefetchAsync( C->i, cnz * sizeof (int64_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ;
    //cudaMemPrefetchAsync( C->x, cnz * sizeof (int32_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ;
    /*
    cudaMemcpy( Citemp, C->i, cnz * sizeof( int64_t), cudaMemcpyDefault );
    cudaMemcpy( Cxtemp, C->x, cnz * C->type->size, cudaMemcpyDefault );
    GB_cuda_free( C->i);
    GB_cuda_free( C->x);
    C->i = Citemp;
    C->x = Cxtemp;
    */

    cudaDeviceSynchronize();

    GB_FREE_WORK ;
    return GrB_SUCCESS;
}
