const char* const templates_GB_jit_AxB_dot3_phase3_mp_cu = "templates/GB_jit_AxB_dot3_phase3_mp.cu\n"
"//------------------------------------------------------------------------------\n"
"// AxB_dot3_phase3_mp.cu\n"
"//------------------------------------------------------------------------------\n"
"\n"
6"// This CUDA kernel produces the semi-ring product of two\n"
7"// sparse matrices of types T_A and T_B and common index space size n, to a  \n"
8"// output matrix of type T_C. The matrices are sparse, with different numbers\n"
9"// of non-zeros and different sparsity patterns. \n"
10"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n"
11"\n"
12"// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are \n"
13"// relatively close in size, neither is very spare nor dense, for any size of N.\n"
14"// Handles arbitrary sparsity patterns with guaranteed load balance.\n"
15"\n"
16"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n"
17"// threadblock, and the # of threadblocks is grid.x\n"
18"\n"
19"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n"
20"// of active threads = min( min(g_xnz, g_ynz), 32) \n"
21"\n"
22"// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi.  Its job\n"
23"// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot\n"
24"// product on those items in the intersection, and finally reduce this data to a scalar, \n"
25"// on exit write it to g_odata [b].\n"
26"\n"
27"//  int64_t start          <- start of vector pairs for this kernel\n"
28"//  int64_t end            <- end of vector pairs for this kernel\n"
29"//  int64_t *Bucket        <- array of pair indices for all kernels \n"
30"//  matrix<T_C> *C         <- result matrix \n"
31"//  matrix<T_M> *M         <- mask matrix\n"
32"//  matrix<T_A> *A         <- input matrix A\n"
33"//  matrix<T_B> *B         <- input matrix B\n"
34"#include <limits>\n"
35"#include <cstdint>\n"
36"#include <cooperative_groups.h>\n"
37"#include \"mySemiRing.h\"\n"
38"#include \"matrix.h\"\n"
39"\n"
40"// Using tile size fixed at compile time, we don't need shared memory\n"
41"#define tile_sz 32 \n"
42"\n"
43"using namespace cooperative_groups;\n"
44"\n"
45"template< typename T, int warp_sz>\n"
46"__device__ __inline__ \n"
47"T GB_reduce_sum(thread_block_tile<warp_sz> g, T val)\n"
48"{\n"
49"    // Each iteration halves the number of active threads\n"
50"    // Each thread adds its partial sum[i] to sum[lane+i]\n"
51"    for (int i = g.size() / 2; i > 0; i /= 2)\n"
52"    {\n"
53"        T next = g.shfl_down( val, i);\n"
54"        val = GB_ADD( val, next ) ;\n"
55"    }\n"
56"    return val;\n"
57"}\n"
58"\n"
59"template< typename T, int warp_sz>\n"
60"__device__ __inline__ \n"
61"T reduce_plus(thread_block_tile<warp_sz> g, T val)\n"
62"{\n"
63"    // Each iteration halves the number of active threads\n"
64"    // Each thread adds its partial sum[i] to sum[lane+i]\n"
65"    for (int i = g.size() / 2; i > 0; i /= 2)\n"
66"    {\n"
67"        val += g.shfl_down( val, i) ;\n"
68"    }\n"
69"    return val; // note: only thread 0 will return full sum and flag value\n"
70"}\n"
71"\n"
72"#define intersects_per_thread 8\n"
73"\n"
74"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>  \n"
75"__global__ void AxB_dot3_phase3_mp\n"
76"(\n"
77"    int64_t start,\n"
78"    int64_t end,\n"
79"    int64_t *Bucket,\n"
80"    GrB_Matrix C,\n"
81"    GrB_Matrix M,\n"
82"    GrB_Matrix A,\n"
83"    GrB_Matrix B,\n"
84"    int sz\n"
85")\n"
86"{\n"
87"\n"
88"    T_A *Ax = (T_A*)A->x;\n"
89"    T_B *Bx = (T_B*)B->x;\n"
90"    T_C *Cx = (T_C*)C->x;\n"
91"    int64_t *Ci = C->i;\n"
92"    int64_t *Mi = M->i;\n"
93"    int64_t *Ai = A->i;\n"
94"    int64_t *Bi = B->i;\n"
95"    int64_t *Ap = A->p;\n"
96"    int64_t *Bp = B->p;\n"
97"\n"
98"\n"
99"    // zombie count\n"
100"    int zc = 0;\n"
101"\n"
102"    int64_t pair_id;\n"
103"\n"
104"    // set thread ID\n"
105"    int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n"
106"    int tid = threadIdx.x;\n"
107"\n"
108"    int b = blockIdx.x ;\n"
109"\n"
110"    // total items to be inspected\n"
111"    int64_t nnzA = 0;\n"
112"    int64_t nnzB = 0;\n"
113"    int64_t n_intersect = 0;\n"
114"\n"
115"    thread_block_tile<tile_sz> tile = tiled_partition<tile_sz>( this_thread_block());\n"
116"\n"
117"    int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread; \n"
118"\n"
119"    // int has_zombies = 0 ;\n"
120"\n"
121"    // Main loop over pairs \n"
122"    for (pair_id = start+ blockIdx.x; //warp per pair \n"
123"         pair_id < end;  \n"
124"         pair_id += gridDim.x )\n"
125"    {\n"
126"\n"
127"         int64_t i = Mi[pair_id];\n"
128"         int64_t j = Ci[pair_id] >> 4;\n"
129"\n"
130"         int64_t xstart = Ap[i];\n"
131"         int64_t xend   = Ap[i+1];\n"
132"         nnzA = xend - xstart;\n"
133"\n"
134"         int64_t ystart = Bp[j]; \n"
135"         int64_t yend   = Bp[j+1]; \n"
136"         nnzB = yend - ystart;\n"
137"\n"
138"         n_intersect = GB_IMIN( xend -xstart, yend -ystart); \n"
139"    /* \n"
140"    if (threadIdx.x ==0 ) {\n"
141"      printf(\"block %d  doing dot %lld  i,j= %lld,%lld\\n\", blockIdx.x, pair_id, i, j);\n"
142"    }\n"
143"    */\n"
144"    //we want more than one intersection per thread\n"
145"    int64_t nxy = nnzA + nnzB;\n"
146"\n"
147"    int work_per_thread = (nxy +parts -1)/parts;\n"
148"    int diag = GB_IMIN( work_per_thread*tid, nxy);\n"
149"    int diag_end = GB_IMIN( diag + work_per_thread, nxy);\n"
150"    //printf(\" thd%d parts = %u wpt = %u diag, diag_end  = %u,%u\\n\",tid, parts, work_per_thread, diag, diag_end); \n"
151"\n"
152"    int x_min = GB_IMAX( (int)(diag - nnzB), 0);\n"
153"    int x_max = GB_IMIN( diag, nnzA);\n"
154"\n"
155"    //printf(\"start thd%u x_min = %u x_max = %u\\n\", tid_global, x_min,x_max);\n"
156"    while ( x_min < x_max) { //binary search for correct diag break\n"
157"      int pivot = (x_min +x_max)/2;\n"
158"      if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) {\n"
159"         x_min = pivot +1;\n"
160"      }\n"
161"      else {\n"
162"         x_max = pivot;\n"
163"      }\n"
164"    }\n"
165"    int xcoord = x_min;\n"
166"    int ycoord = diag -x_min -1;\n"
167"    if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { \n"
168"       diag--; //adjust for intersection incrementing both pointers \n"
169"    }\n"
170"    // two start points are known now\n"
171"    int tx_start = xcoord +xstart;\n"
172"    int ty_start = diag -xcoord +ystart; \n"
173"\n"
174"    //if (x_start != y_start)\n"
175"    //   printf(\"start thd%u  xs,ys = %i,%i\\n\", tid_global, x_start, y_start);\n"
176"\n"
177"    x_min = GB_IMAX( (int)(diag_end - nnzB), 0);\n"
178"    x_max = GB_IMIN( diag_end, nnzA);\n"
179"\n"
180"    while ( x_min < x_max) {\n"
181"       int pivot = (x_min +x_max)/2;\n"
182"       //printf(\"thd%u pre_sw piv=%u diag_e = %u  xmin,xmax=%u,%u\\n\", tid_global, pivot, diag_end,x_min, x_max);\n"
183"       if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) {\n"
184"          x_min = pivot +1;\n"
185"       }\n"
186"       else {\n"
187"          x_max = pivot;\n"
188"       }\n"
189"       //printf(\"thd%u piv=%u xmin,xmax = %u,%u\\n\", tid_global, pivot, x_min, x_max);\n"
190"    }\n"
191"    xcoord = x_min;\n"
192"    ycoord = diag_end -x_min -1;\n"
193"    if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { \n"
194"        diag--; //adjust for intersection incrementing both pointers  \n"
195"    }\n"
196"    // two end points are known now\n"
197"    int tx_end = xcoord +xstart; \n"
198"    int ty_end = diag_end - xcoord + ystart; \n"
199"\n"
200"    T_A aki;\n"
201"    T_B bkj;\n"
202"    T_Z cij = GB_IDENTITY ;\n"
203"\n"
204"    // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists.\n"
205"    // just check if cij > 0\n"
206"\n"
207"    int cij_exists  = 0 ;\n"
208"    //printf(\" thd%u has init value %f\\n\",tid, cij);\n"
209"\n"
210"    //merge-path dot product\n"
211"    int k = tx_start;\n"
212"    int l = ty_start;\n"
213"    while ( k < tx_end && l < ty_end )\n"
214"    {\n"
215"       if (Ai [k] == Bi [l])\n"
216"       {\n"
217"          GB_GETA ( aki=(T_Z)Ax[k] ) ;\n"
218"          GB_GETB ( bkj=(T_Z)Bx[l] ) ;\n"
219"          if (cij_exists)\n"
220"          {\n"
221"            T_Z t = GB_MULT( (T_Z)aki, (T_Z)bkj );\n"
222"            GB_ADD_F (cij, t ) ;\n"
223"          //printf(\"  thd%d ix at %lld   cij += %d * %d \\n\", tid_global, Ai[k], aki, bkj);\n"
224"          }\n"
225"          else\n"
226"          {\n"
227"            cij_exists = 1 ;\n"
228"            cij = GB_MULT ( (T_Z)aki, (T_Z)bkj ) ;\n"
229"          //printf(\"  thd%d ix at %lld   cij = %d * %d \\n\", tid_global, Ai[k], Ax[k], Bx[l]);\n"
230"          }\n"
231"          // TODO check terminal condition\n"
232"          k+= 1;\n"
233"          l+= 1;\n"
234"          //printf(\" block%u work value = %d, exists = %d\\n\", b, cij, cij_exists);\n"
235"       }\n"
236"       else\n"
237"       {\n"
238"            k += ( Ai[k] < Bi[l] ) ;\n"
239"            l += ( Ai[k] > Bi[l] ) ;\n"
240"       }\n"
241"    }\n"
242"\n"
243"    //tile.sync( ) ;\n"
244"    //--------------------------------------------------------------------------\n"
245"    // reduce sum per-thread values to a single scalar, get OR of flag\n"
246"    //--------------------------------------------------------------------------\n"
247"    /*\n"
248"    if (tid == 0)\n"
249"    {\n"
250"        printf (\"reduce %d : %d exists = %d\\n\", b,  cij, cij_exists) ;\n"
251"    }\n"
252"    __syncthreads();\n"
253"    */\n"
254"\n"
255"    // Do vote here for control.\n"
256"    cij_exists  = tile.any( cij_exists);\n"
257"    //tile.sync();\n"
258"\n"
259"    if (cij_exists)\n"
260"    {\n"
261"       cij = GB_reduce_sum<T_Z, tile_sz>( tile, cij );\n"
262"       \n"
263"    }\n"
264"    // else has_zombies = 1;\n"
265"\n"
266"\n"
267"    //__syncthreads();\n"
268"    //tile.sync( );\n"
269"    // write result for this block to global mem\n"
270"    if (tid == 0)\n"
271"    {\n"
272"        //printf (\"final %d : %d exists = %d\\n\", b,  cij, cij_exists) ;\n"
273"        if (cij_exists)\n"
274"        {\n"
275"           //printf(\" cij = %d\\n\", cij);\n"
276"           GB_PUTC ( Cx[pair_id]=(T_C)cij ) ;\n"
277"           GB_PUTC ( Ci[pair_id]=i ) ;\n"
278"        }\n"
279"        else\n"
280"        {\n"
281"           //printf(\" dot %d is a zombie\\n\", pair_id);\n"
282"           zc++;\n"
283"           GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ;\n"
284"        }\n"
285"    }\n"
286"    //__syncthreads(); \n"
287"  }\n"
288"\n"
289"//--------------------------------------------------------------------------\n"
290"\n"
291"  if( tid ==0 && zc > 0)\n"
292"  {\n"
293"      //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n"
294"      atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n"
295"      //printf(\" Czombie = %lld\\n\",C->zombie_count);\n"
296"  }\n"
297"\n"
298"  //__syncthreads();\n"
299"\n"
300"}\n"
301"\n"
302;