1 // SPDX-License-Identifier: Apache-2.0
2 
3 //------------------------------------------------------------------------------
// GB_cuda_buckets.h: definitions for buckets used for dot3
5 //------------------------------------------------------------------------------
6 
7 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved.
8 // http://suitesparse.com   See GraphBLAS/Doc/License.txt for license.
9 
10 //------------------------------------------------------------------------------
11 
12 // This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files.
13 
14 #ifndef GB_CUDA_BUCKETS_H
15 #define GB_CUDA_BUCKETS_H
16 
// nvcc chokes on the 'restrict' keyword, so define it to the empty string
// for compiling the *.cu files.  Any earlier definition of the macro is
// discarded first, so that it always expands to nothing here and no
// macro-redefinition diagnostic is issued.
#undef  restrict
#define restrict

// nvcc also chokes on fpclassify (an ANSI C11 construct that does not appear
// in C++11, it seems).  It also issues spurious warnings about compiler
// pragmas.  Source/GB.h avoids these constructs if GB_NVCC is defined.
// GB_NVCC might already be defined (with -DGB_NVCC on the compiler command
// line, for example), so it is defined here only if not yet defined.
#ifndef GB_NVCC
#define GB_NVCC
#endif
25 
26 
// GB_bucket_code: the bucket assigned to an entry C(i,j) for dot3.
//
// 12 buckets, computed by up to 11 kernel launches (zombies need no work),
// using 5 different kernels (each with a different configuration depending on
// the bucket).
//
// dot3:  C<M>=A'B, where M is sparse or hyper, and C is sparse or hyper.
// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x M is (sparse/hyper)
typedef enum
{

    // bring out your dead: C(i,j) is a zombie (this is not a bucket)
    GB_BUCKET_ZOMBIE    = 0,

    //--------------------------------------------------------------------------
    // a full/full kernel
    //--------------------------------------------------------------------------

    // CUDA kernel: dndn, handles a single bucket.

    // A(:,i) and B(:,j) are both dense
    GB_BUCKET_DNDN      = 1,

    //--------------------------------------------------------------------------
    // two full/(sparse,hyper) kernels
    //--------------------------------------------------------------------------

    // CUDA kernel: spdn, handles 4 buckets.

    // A(:,i) is dense, and B(:,j) is very sparse (< 256 entries)
    GB_BUCKET_DNVS      = 2,

    // A(:,i) is dense, and B(:,j) is sparse (>= 256 entries)
    GB_BUCKET_DNSP      = 3,

    //--------------------------------------------------------------------------
    // a sparse/full kernel
    //--------------------------------------------------------------------------

    // A(:,i) is very sparse (< 256 entries), and B(:,j) is dense
    GB_BUCKET_VSDN      = 4,

    // A(:,i) is sparse (>= 256 entries), and B(:,j) is dense
    GB_BUCKET_SPDN      = 5,

    //--------------------------------------------------------------------------
    // a sparse/bitmap kernel, a bitmap/bitmap kernel, a bitmap/sparse kernel,
    // ...
    //--------------------------------------------------------------------------

    //--------------------------------------------------------------------------
    // sparse/sparse
    //--------------------------------------------------------------------------

    // CUDA kernel: vssp, handles 1 bucket, and uses binary search.
    // A(:,i) is very sparse compared to B(:,j), or vice versa.
    GB_BUCKET_VSSP      = 6,

    // CUDA kernel: vsvs, handles 4 buckets.
    // With len = nnz (A (:,i)) + nnz (B (:,j)), the buckets are:
    GB_BUCKET_VSVS_4    = 7,        // len <= 4
    GB_BUCKET_VSVS_16   = 8,        // len <= 16
    GB_BUCKET_VSVS_64   = 9,        // len <= 64
    GB_BUCKET_VSVS_256  = 10,       // len <= 256

    // CUDA kernel: mp, which uses the merge-path method
    GB_BUCKET_MERGEPATH = 11,

    // CUDA kernel: warpix, which uses the warp-intersect method (unused so far)
    GB_BUCKET_WARP_IX   = 12

}
GB_bucket_code ;
81 
82 #endif
83