//------------------------------------------------------------------------------
// GB_AxB_dot3_cuda: compute C<M> = A'*B in parallel, on the GPU(s)
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved.
// http://suitesparse.com   See GraphBLAS/Doc/License.txt for license.

//------------------------------------------------------------------------------

// This function only computes C<M>=A'*B on the GPUs.  The mask must be
// present, and not complemented.  The mask is always applied.

// The output matrix C always has a static header.  The input matrices M, A,
// and B may have static or dynamic headers.
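
// Overview of the method, as implemented below:
//
//  (1) phase1 assigns each entry C(i,j) in the sparsity pattern of M to one
//      of several "buckets", according to how A(:,i) and B(:,j) should be
//      multiplied (dense*dense, sparse*dense, very-sparse*very-sparse, ...),
//      and counts the bucket sizes per thread block.
//  (2) phase2 does a cumulative sum across the per-block bucket counts, so
//      each entry gets its final slot in the global Bucket array.
//  (3) phase3 launches one JIT-compiled CUDA kernel per non-empty bucket to
//      do the numerical dot products.
//
// A minimal call sketch (hypothetical caller; the variable names here are
// assumptions for illustration only, not part of this file):
//
//      GrB_Info info = GB_AxB_dot3_cuda (C_static, M, /* Mask_struct: */ true,
//          A, B, semiring, /* flipxy: */ false, Context) ;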

extern "C"
{
  #include "GB_mxm.h"
  #include "GB_dynamic.h"
}
#include "GB_cuda.h"


#include "templates/GB_jit_AxB_dot3_phase1.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase2.cu.jit"
// the phase3 kernels for the buckets, and the reduction kernel:
#include "templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_mp.cu.jit"
#include "templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit"
#include "templates/reduceNonZombiesWarp.cu.jit"

#include "GB_jit_launcher.h"
#include "GB_cuda_stringifier.hpp"
#include "GB_cuda_global.h"

GB_cuda_stringifier *SR_callback_ptr;

const std::vector<std::string> header_names ={};

#define GB_FREE_WORK                                                    \
{                                                                       \
    /* free any dynamic headers allocated by GB_to_dynamic */           \
    if (M_input_static) GB_Matrix_free (&M) ;                           \
    if (A_input_static) GB_Matrix_free (&A) ;                           \
    if (B_input_static) GB_Matrix_free (&B) ;                           \
    GB_cuda_free (Nanobuckets) ;    Nanobuckets = NULL ;                \
    GB_cuda_free (Blockbucket) ;    Blockbucket = NULL ;                \
    GB_cuda_free (Bucket);          Bucket      = NULL;                 \
    GB_cuda_free (Bucketp);         Bucketp     = NULL;                 \
    GB_cuda_free (offset);          offset      = NULL;                 \
}

#define GB_FREE_ALL                                                     \
{                                                                       \
    GB_FREE_WORK ;                                                      \
    GB_Matrix_free (&C) ;                                               \
}

GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
(
    GrB_Matrix C_static,            // output matrix, static header
    const GrB_Matrix M_input,       // mask matrix (may have a static header)
    const bool Mask_struct,         // if true, use only the structure of M
    const GrB_Matrix A_input,       // input matrix (may have a static header)
    const GrB_Matrix B_input,       // input matrix (may have a static header)
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;
    ASSERT (C_static != NULL && C_static->static_header) ;

    ASSERT_MATRIX_OK (M_input, "M for dot3 cuda A'*B", GB0) ;
    ASSERT_MATRIX_OK (A_input, "A for dot3 cuda A'*B", GB0) ;
    ASSERT_MATRIX_OK (B_input, "B for dot3 cuda A'*B", GB0) ;

    ASSERT (!GB_PENDING (M_input)) ;
    ASSERT (GB_JUMBLED_OK (M_input)) ;
    ASSERT (!GB_ZOMBIES (M_input)) ;

    ASSERT (!GB_PENDING (A_input)) ;
    ASSERT (!GB_JUMBLED (A_input)) ;
    ASSERT (!GB_ZOMBIES (A_input)) ;

    ASSERT (!GB_PENDING (B_input)) ;
    ASSERT (!GB_ZOMBIES (B_input)) ;
    ASSERT (!GB_JUMBLED (B_input)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ;

    ASSERT (A_input->vlen == B_input->vlen) ;
    GBURBLE ("(GPU dot3) ") ;

    //--------------------------------------------------------------------------
    // initializations
    //--------------------------------------------------------------------------

    GrB_Matrix C = NULL, M = NULL, A = NULL, B = NULL ;
    int ntasks = 0, number_of_sms = 0 ;
    int64_t *Nanobuckets = NULL, *Blockbucket = NULL ;
    int64_t *Bucket = NULL;
    int64_t *Bucketp = NULL;
    int64_t *offset = NULL;
    bool M_input_static = false ;
    bool A_input_static = false ;
    bool B_input_static = false ;

    // just in case M is jumbled and we don't handle it yet (TODO)
    GB_MATRIX_WAIT (M_input) ;
    ASSERT (!GB_JUMBLED (M_input)) ;

    // note that this appears after the wait on M_input:
    GB_OK (GB_to_dynamic (&M, &M_input_static, M_input, Context)) ;
    GB_OK (GB_to_dynamic (&A, &A_input_static, A_input, Context)) ;
    GB_OK (GB_to_dynamic (&B, &B_input_static, B_input, Context)) ;
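
    // When an input has a static header, GB_to_dynamic hands back a copy with
    // a dynamically allocated header; the *_input_static flags record which of
    // those copies GB_FREE_WORK must free on the way out.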

    int device = -1;

    cudaSetDevice( 0 ) ;

    cudaGetDevice(&device);
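
    // Note: the GPU is currently hard-coded to device 0; the id returned by
    // cudaGetDevice is kept in 'device' for the cudaMemAdvise and
    // cudaMemPrefetchAsync hints below.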

    //--------------------------------------------------------------------------
    // get M
    //--------------------------------------------------------------------------

    const int64_t *restrict Mp = M->p ;
    const int64_t *restrict Mh = M->h ;
    // const int64_t *restrict Mi = M->i ;
    // const GB_void *restrict Mx = M->x ;
    // const size_t msize = M->type->size ;
    const int64_t mvlen = M->vlen ;
    const int64_t mvdim = M->vdim ;
    const int64_t mnz = GB_NNZ (M) ;
    const int64_t mnvec = M->nvec ;
    const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ;

    const int64_t anz = GB_NNZ (A) ;
    const int64_t anvec = A->nvec ;

    const int64_t bnz = GB_NNZ (B) ;
    const int64_t bnvec = B->nvec ;

    //--------------------------------------------------------------------------
    // allocate C, the same size and # of entries as M
    //--------------------------------------------------------------------------

    // FUTURE: ctype need not be the op->ztype
    GrB_Type ctype = semiring->add->op->ztype ;
    int64_t cvlen = mvlen ;
    int64_t cvdim = mvdim ;
    int64_t cnz = mnz ;
    int64_t cnvec = mnvec ;

    // TODO tell GB_CREATE where to put the data: CPU or GPU (via
    // cudaMemAdvise), but this works as-is.
    int sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
    info = GB_new_bix (&C, false, // sparse or hyper (from M), dynamic header
        ctype, cvlen, cvdim, GB_Ap_malloc, true,
        sparsity, false, M->hyper_switch, cnvec,
        cnz+1,  // add one to cnz for GB_cumsum of Cwork
        true, Context) ;

    if (info != GrB_SUCCESS)
    {
        // out of memory
        GB_FREE_ALL ;
        return (info) ;
    }

    //int64_t *Citemp =  C->i ;
    //auto *Cxtemp = C->x ;
    //cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) );
    //cudaMalloc ((void**) &(C->x), cnz * C->type->size );
    cudaMemAdvise( C->i, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise( C->x, cnz * C->type->size , cudaMemAdviseSetPreferredLocation, device);
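
    // cudaMemAdviseSetPreferredLocation marks these unified-memory arrays as
    // preferring residency on the GPU, since all of the work that fills C->i
    // and C->x happens in the kernels below; the results migrate back to the
    // host on demand when they are read there.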

    int64_t *restrict Cp = M->p ;
    int64_t *restrict Ch = M->h ;
    // int64_t *restrict Ci = C->i ;
    // use C->i as workspace

    //--------------------------------------------------------------------------
    // copy Mp and Mh into C
    //--------------------------------------------------------------------------

    //cudaMemcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), cudaMemcpyDefault) ;
    if (M_is_hyper)
    {
        //cudaMemcpy (Ch, Mh, cnvec * sizeof (int64_t), cudaMemcpyDefault) ;
    }
    C->magic = GB_MAGIC ;
    C->nvec_nonempty = M->nvec_nonempty ;
    C->nvec = M->nvec ;

    GBURBLE ("(GPU C created and copied from M) ") ;
    //--------------------------------------------------------------------------
    // stringify the semiring and the mask
    //--------------------------------------------------------------------------

    char semiring_name [GB_CUDA_STRLEN+2] ;
    char semiring_code [GB_CUDA_STRLEN+2] ;
    char mask_name [GB_CUDA_STRLEN+2] ;

    GB_cuda_stringifier mysemiring = GB_cuda_stringifier();
    uint64_t sr_code;

    mysemiring.enumify_semiring ( &sr_code, semiring, flipxy,
        ctype, A->type, B->type, M->type, Mask_struct,  // matrix types
        false, C->sparsity, M->sparsity, A->sparsity, B->sparsity) ;

    const char *header_name = (const char *)"mySemiRing.h";
    mysemiring.load_string(header_name, semiring_code ) ;

    SR_callback_ptr = &mysemiring;
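
    // The stringifier turns the semiring, its types, and the mask into a
    // unique code (sr_code) and a generated header string (semiring_code).
    // SR_callback_ptr is presumably how callback_wrapper hands that generated
    // "mySemiRing.h" text to the JIT compiler when a kernel asks for it.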

    GBURBLE ("(GPU stringified) ") ;
    //--------------------------------------------------------------------------
    // construct the tasks for phase1 and phase2
    //--------------------------------------------------------------------------

    // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
    // on the GPU:

    // # of threads in phase1 and phase2 kernel launches must be the same
    #define chunksize 128
    #define SYMBOLIC_PHASE_NTHREADS 32
    #define NBUCKETS (GB_BUCKET_MERGEPATH + 1)

    number_of_sms = GB_Global_gpu_sm_get (0) ;
    // C and M have cnz entries, so create ...
    //ntasks = ( (mnvec +7)/8   + SYMBOLIC_PHASE_NTHREADS -1 )/SYMBOLIC_PHASE_NTHREADS;
    ntasks =  ( mnz +chunksize -1)/chunksize;
    // the idea is to have each task work on a contiguous block of columns of C
    ntasks = GB_IMIN( ntasks,  128*number_of_sms) ;    // ntasks will be grid.x
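
    // For illustration (numbers assumed, not from the code): with
    // mnz = 1,000,000 entries in M and chunksize = 128, the ceiling division
    // gives ntasks = (1000000 + 127)/128 = 7813 tasks; on a GPU with
    // number_of_sms = 80 this is then capped at 128*80 = 10240, so all
    // 7813 blocks are launched.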

    GBURBLE ("(GPU mnz=%ld mnvec=%ld blockDim=32, nblock= %d) ", mnz, mnvec, ntasks ) ;

    std::cout<< "ntasks, nthreads = " <<ntasks<<","<<SYMBOLIC_PHASE_NTHREADS<<std::endl;
    //--------------------------------------------------------------------------
    // phase1 and phase2: place each C(i,j) in a bucket
    //--------------------------------------------------------------------------

    cudaMalloc ((void**) &Nanobuckets,
        NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks * sizeof (int64_t)) ;

    //Nanobuckets = (int64_t*)GB_cuda_malloc (
    //    NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks * sizeof (int64_t)) ;
    //cudaMemAdvise( Nanobuckets, NBUCKETS * SYMBOLIC_PHASE_NTHREADS * ntasks
    //                           * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    cudaMalloc ((void**) &Blockbucket,
        NBUCKETS * ntasks* sizeof (int64_t) ) ;
    //Blockbucket = (int64_t*)GB_cuda_malloc ( NBUCKETS * ntasks* sizeof (int64_t) ) ;
    //cudaMemAdvise( Blockbucket, NBUCKETS * ntasks
    //                           * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    cudaMalloc ((void**) &Bucket, cnz*sizeof(int64_t));
    //Bucket = (int64_t*)GB_cuda_malloc ( cnz*sizeof(int64_t) );
    //cudaMemAdvise( Bucket, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);

    //cudaMallocManaged ((void**) &Bucketp, (NBUCKETS+1)*sizeof(int64_t)) ;
    Bucketp = (int64_t*)GB_cuda_malloc( (NBUCKETS+1)*sizeof(int64_t) ) ;
    cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device);

    //cudaMallocManaged ((void**) &offset, (NBUCKETS)*sizeof(int64_t)) ;
    offset = (int64_t*)GB_cuda_malloc( (NBUCKETS)*sizeof(int64_t) ) ;
    cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device);

    memset( offset, 0, NBUCKETS * sizeof(int64_t) );
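
    // Bucketp (the global cumulative bucket counts) and offset (the global
    // per-bucket totals produced by phase2) are kept in host-preferred unified
    // memory, since the host reads them back between the kernel launches
    // below; offset is zeroed here because phase2 accumulates into it.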

  /*
    if (Blockbucket == NULL || Nanobuckets == NULL || Bucket == NULL || Bucketp == NULL )
    {
        // out of memory
        GB_FREE_ALL ;
        return (GB_OUT_OF_MEMORY) ;
    }
    */


    //--------------------------------------------------------------------------
    // Pre-fetch arrays that will be used on the device
    //--------------------------------------------------------------------------


    //cudaMemPrefetchAsync( Nanobuckets, NBUCKETS * SYMBOLIC_PHASE_NTHREADS
    //                     * ntasks * sizeof (int64_t), device, NULL) ;

    //cudaMemPrefetchAsync( Blockbucket, NBUCKETS * ntasks
    //                        * sizeof (int64_t), device, NULL) ;

    //cudaMemPrefetchAsync( Bucket, cnz * sizeof (int64_t), device, NULL) ;


    /*
    //cudaStream_t stream_data;
    //cudaStreamCreate ( &stream_data);
    */
    /*
    cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetPreferredLocation, device) ;
    cudaMemAdvise( M->i, mnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetPreferredLocation,device) ;

    cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( M->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetReadMostly,device) ;
    */

    cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL ) ; //stream_data) ;
    /*
    cudaMemAdvise( C->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( C->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( C->x, mnz * C->type->size, cudaMemAdviseSetReadMostly,device) ;
    */
    //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( C->i, mnz * sizeof (int64_t), device, NULL ); //stream_data) ;
    cudaMemPrefetchAsync( C->x, mnz * C->type->size, device, NULL ); //stream_data) ;

    /*
    cudaMemAdvise( A->p, (anvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( A->i, anz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( A->x, anz * A->type->size, cudaMemAdviseSetReadMostly,device) ;
    */
    cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL); // stream_data) ;
    cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL ) ; //stream_data) ;

    /*
    cudaMemAdvise( B->p, (bnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( B->i, bnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ;
    cudaMemAdvise( B->x, bnz * B->type->size, cudaMemAdviseSetReadMostly, device) ;
    */
    cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ;
    cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL ) ; //stream_data) ;
    cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL ) ; //stream_data) ;



    // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears as
    // the pth entry in C.


    //cudaStream_t stream_AxB;
    //cudaStreamCreate ( &stream_AxB);
    //----------------------------------------------------------------------
    // phase1: assign each C(i,j) to a bucket, and count them
    //----------------------------------------------------------------------
    dim3 grid( ntasks) ;
    dim3 p2grid( (ntasks +  SYMBOLIC_PHASE_NTHREADS -1)
                          / (SYMBOLIC_PHASE_NTHREADS) ) ;
    dim3 block( SYMBOLIC_PHASE_NTHREADS ) ;
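
    // phase1 and phase2end launch one thread block per task (grid); phase2
    // works on the per-block bucket counts, so it only needs enough threads
    // to cover the ntasks blocks, i.e. ceil(ntasks/SYMBOLIC_PHASE_NTHREADS)
    // thread blocks (p2grid).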

    std::string base_name = "GB_jit_AxB_dot3_";
    std::string Opname = "phase1_" ;

    jitify::experimental::KernelLauncher phase1Kernel =
    jit::launcher( base_name + Opname + mask_name,
                   templates_GB_jit_AxB_dot3_phase1_cu,
                   header_names,
                   compiler_flags,
                   callback_wrapper) //,
                   //stream_AxB)
               .set_kernel_inst("GB_AxB_cuda_dot3_phase1",
                                {M->type->name})
               .configure(grid, block);

    //----------------------------------------------------------------------
    // phase2: cumsum across the blockbuckets, propagate to thread level
    //----------------------------------------------------------------------
    base_name = "GB_jit_AxB_dot3_";
    Opname = "phase2";
    jitify::experimental::KernelLauncher phase2Kernel =
    jit::launcher( base_name + Opname,
                   templates_GB_jit_AxB_dot3_phase2_cu,
                   header_names,
                   compiler_flags,
                   callback_wrapper) //,
                   //stream_AxB)
               .set_kernel_inst("GB_AxB_dot3_phase2",
                                {})
               .configure(p2grid, block);

    base_name = "GB_jit_AxB_dot3_";
    Opname = "phase2";
    jitify::experimental::KernelLauncher phase2endKernel =
    jit::launcher( base_name + Opname,
                   templates_GB_jit_AxB_dot3_phase2_cu,
                   header_names,
                   compiler_flags,
                   callback_wrapper) //,
                   //stream_AxB)
               .set_kernel_inst("GB_AxB_dot3_phase2end",
                                {})
               .configure(grid, block);


    phase1Kernel.launch(
                        Nanobuckets,       // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,       // bucket counts, of size NBUCKETS-by-gridDim.x
                                           // input/output:
                        C,                 // final output matrix
                                           // inputs, not modified:
                        M,                 // mask matrix
                        A,                 // input matrix
                        B                  // input matrix
                    );


    // cudaDeviceSynchronize();


    GBURBLE ("(GPU phase1 done) ") ;
    //for (int i = 0; i< cnz; i++){
    //  printf("C[%d] = %ld\n", i , Ci[i]);
    //}
    //----------------------------------------------------------------------
    // phase2: cumsum across the blockbuckets, propagate to thread level
    //----------------------------------------------------------------------
    int nblock = ntasks;

    phase2Kernel.launch(                    // input
                        Nanobuckets,       // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,       // bucket counts, of size NBUCKETS-by-gridDim.x
                                           // input/output:
                        Bucketp,           // global bucket cumsum, of size NBUCKETS+1
                        Bucket,            // global buckets, of size cnz (== mnz)
                        offset,
                        C,                 // final output matrix
                                           // inputs, not modified:
                        cnz,               // number of entries in mask and output matrix
                        nblock
                    );

    cudaDeviceSynchronize();
    //cudaMemPrefetchAsync( offset, (NBUCKETS) * sizeof (int64_t), cudaCpuDeviceId, NULL) ;
    // exclusive prefix sum of the per-bucket totals: Bucketp [bucket] is the
    // start of each bucket in the global Bucket array.  offset has only
    // NBUCKETS entries, so it is read only for bucket < NBUCKETS.
    int64_t s = 0;
    for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
    {
       Bucketp[bucket] = s;
       if (bucket < NBUCKETS) s += offset[bucket];
       //printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]);
    }
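
    // For example (values assumed for illustration): if offset = {2,0,5,3,...}
    // then Bucketp = {0,2,2,7,10,...}, so bucket k occupies
    // Bucket [Bucketp [k] ... Bucketp [k+1]-1].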

    GBURBLE ("(GPU phase2 done) ") ;

    phase2endKernel.launch(                    // input
                        Nanobuckets,       // array of size NBUCKETS-blockDim.x-by-gridDim.x
                        Blockbucket,       // bucket counts, of size NBUCKETS-by-gridDim.x
                                           // input/output:
                        Bucketp,           // global bucket cumsum, of size NBUCKETS+1
                        Bucket,            // global buckets, of size cnz (== mnz)
                        offset,
                        C,                 // final output matrix
                                           // inputs, not modified:
                        cnz                // number of entries in mask and output matrix
                    );

    cudaDeviceSynchronize();

    GBURBLE ("(GPU phase2end done) ") ;
    /*
    for (int i = 0; i< cnz; i++){
      printf("C[%d],Bucket = %ld,%ld\n", i , Ci[i], Bucket[i]);
    }
    */

    //----------------------------------------------------------------------
    // phase3: do the numerical work
    //----------------------------------------------------------------------

    base_name = "GB_jit_";
    std::string kernel_name = "AxB_dot3_phase3_";
    C->nzombies = Bucketp[1];  //set pre-zombie counts
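
    // Bucket 0 (GB_BUCKET_ZOMBIE) holds the entries of C<M> that are already
    // known to be empty (for example, the mask value is false), so Bucketp[1]
    // is the number of zombies before the numeric phase; the loop below
    // therefore starts at bucket 1.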

    for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
    {
        std::string Opname = "";
        int sz = 0 ;

        const char*  jit_template;

        int64_t start = Bucketp[bucket];
        int64_t end = Bucketp[bucket+1];

        //if( (end-start>0)  && (start == Bucketp[1]) ) start = Bucketp[0]; //add in zombie slots

        int64_t Cnz = end - start;

        int gridsz, blocksz;

        // nothing to do in this bucket; move on to the next one
        if ( Cnz == 0 ) continue;

        GBURBLE ("\n\n(GPU phase3 bucket,bucketsize= %d,%ld) ",bucket,Cnz) ;

        switch (bucket)
        {

            //--------------------------------------------------------------
            // not a bucket ... bring out your dead:
            //--------------------------------------------------------------

            case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket)
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: dndn, handles a single bucket:
            //--------------------------------------------------------------

            // both A(:,i) and B(:,j) are dense
            case GB_BUCKET_DNDN :
                Opname = "dndn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_dndn_cu;
                blocksz = 32;
                gridsz = ( Cnz -1 + blocksz)/blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: spdn, handles 4 buckets:
            //--------------------------------------------------------------

            // A(:,i) is dense and B(:,j) is very sparse (< 256 entries)
            case GB_BUCKET_DNVS :
            // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense
            case GB_BUCKET_VSDN :
                sz = 64 ;
                Opname = "spdn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu;
                blocksz = 32;
                gridsz = ( Cnz -1 + blocksz)/blocksz;
                break ;

            // A(:,i) is dense and B(:,j) is sparse (>= 256 entries)
            case GB_BUCKET_DNSP :
            // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense
            case GB_BUCKET_SPDN :
                sz = 256 ;
                Opname = "spdn" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu;
                blocksz = 32;
                gridsz = ( Cnz -1 + blocksz)/blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: vssp, handles 1 bucket, uses binary search:
            //--------------------------------------------------------------

            // A(:,i) is very sparse compared to B(:,j), or vice versa
            case GB_BUCKET_VSSP :
                Opname = "vssp" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_vssp_cu;
                blocksz = 32;
                gridsz = ( Cnz -1 + blocksz)/blocksz;
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: vsvs, handles 4 buckets:
            //--------------------------------------------------------------

            // let len = nnz (A (:,i)) + nnz (B (:,j)), then:

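            // The four VSVS cases below fall through on purpose: sz
            // accumulates to the upper bound on len for that bucket
            // (256, 64, 16, or 4).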
            case GB_BUCKET_VSVS_256 : sz += 256-64 ;
            case GB_BUCKET_VSVS_64 :  sz += 64-16  ;
            case GB_BUCKET_VSVS_16 :  sz += 16-4   ;
            case GB_BUCKET_VSVS_4 :   sz += 4      ;
                Opname = "vsvs" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_vsvs_cu;
                blocksz = 1024;
                gridsz = GB_IMIN( 1024*number_of_sms, ( Cnz  + blocksz -1 )/blocksz);
                gridsz =  ( Cnz  + blocksz -1 )/blocksz;
                /*
                Opname = "warpix" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu;
                blocksz = 32;
                gridsz =  GB_IMIN( (mnvec+15)/16, 256*number_of_sms);
                */
                break ;

            //--------------------------------------------------------------
            // CUDA kernel: mp, use the merge-path method:
            //--------------------------------------------------------------

            case GB_BUCKET_MERGEPATH :
                Opname = "mp" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_mp_cu;
                blocksz = 32;
                gridsz = ( Cnz -1 + blocksz)/blocksz;
                break ;

            case GB_BUCKET_WARP_IX :   sz = 32      ;
                Opname = "warpix" ;
                jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu;
                blocksz = 32;
                gridsz =  GB_IMIN( (mnvec+15)/16, 256*number_of_sms);
                break ;

            default:
                break ;
        }

        dim3 grid(gridsz);
        dim3 block(blocksz);

        std::cout<< "Kernel name =" <<Opname<<std::endl;
        GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",start,end,gridsz,blocksz) ;
        jit::launcher( base_name + kernel_name + Opname + "_" + semiring_name,
                       jit_template,
                       header_names,
                       compiler_flags,
                       callback_wrapper)
                   .set_kernel_inst(kernel_name + Opname,
                                    { ctype->name,
                                      A->type->name,
                                      B->type->name,
                                      semiring->multiply->xtype->name,
                                      semiring->multiply->ytype->name,
                                      semiring->multiply->ztype->name  })
                   .configure(grid, block) //if commented, use implicit 1D configure in launch
                   .launch(
                            start,             // start of this bucket in Bucket
                            end,               // one past the end of this bucket
                            Bucket,            // global buckets, of size cnz (== mnz)
                            C,                 // final output matrix
                                               // inputs, not modified:
                            M,                 // Mi used for column index
                            A,                 // A matrix
                            B,                 // B matrix
                            sz                 // only used for sparse-sparse cases

                        );

        cudaDeviceSynchronize();
    }
    GBURBLE ("(GPU phase3 done) ") ;

    // transplant C into C_static, and free C
    GB_to_static (C_static, &C, Context) ;
    ASSERT (C == NULL) ;

    std::string reduce_kernel_name = "reduceNonZombiesWarp";
    const char*  jit_template;
    #define red_blocksz 1024
    jit_template = templates_reduceNonZombiesWarp_cu;
    int num_reduce_blocks = GB_IMIN( 32*number_of_sms, (cnz + red_blocksz -1)/ red_blocksz  ) ;
    dim3 red_grid( num_reduce_blocks ) ;
    dim3 red_block( red_blocksz ) ;
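
    // The reduction produces one partial result per thread block (block_sum);
    // the grid is capped at 32*number_of_sms blocks, and the host finishes the
    // sum of the per-block partial results below.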

    int32_t *block_sum;
    //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ;
    block_sum = (int32_t*)GB_cuda_malloc( (num_reduce_blocks)*sizeof(int32_t)) ;

    GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", num_reduce_blocks, red_blocksz) ;
    jit::launcher( reduce_kernel_name + "_" + semiring_name,
                   jit_template,
                   header_names,
                   compiler_flags,
                   callback_wrapper)
                   .set_kernel_inst( reduce_kernel_name , { ctype->name })
                   .configure(red_grid, red_block) //if commented, use implicit 1D configure in launch
                   .launch(
                            C_static->i,           // index vector, only sum up values >= 0
                            C_static->x,           // input pointer to vector to reduce, with zombies
                            block_sum,             // block sums on return
                            (unsigned int)cnz      // length of vector to reduce to scalar

                        );

    cudaDeviceSynchronize();

    int32_t num_triangles = 0;
    for (int i = 0; i< num_reduce_blocks; i++){
       //printf("block%d num_triangles = %d\n", i, block_sum[i] );
       num_triangles += block_sum[i] ;
    }
    printf("num_triangles = %d\n",  num_triangles );
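
    // This reduction sums only the non-zombie entries of C (the kernel skips
    // entries whose index in C_static->i is negative).  The result is printed
    // as a triangle count, which assumes the caller is using this method for
    // triangle counting with an int32 semiring.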

    GB_cuda_free( block_sum );
    //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; //stream_data ) ;
    //cudaMemPrefetchAsync( C->i, cnz * sizeof (int64_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ;
    //cudaMemPrefetchAsync( C->x, cnz * sizeof (int32_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ;
    /*
    cudaMemcpy( Citemp, C->i, cnz * sizeof( int64_t), cudaMemcpyDefault );
    cudaMemcpy( Cxtemp, C->x, cnz * C->type->size, cudaMemcpyDefault );
    GB_cuda_free( C->i);
    GB_cuda_free( C->x);
    C->i = Citemp;
    C->x = Cxtemp;
    */

    cudaDeviceSynchronize();

    GB_FREE_WORK ;
    return GrB_SUCCESS;
}
