/* ========================================================================== */
/* ========================= CHOLMOD CUDA/C kernels ========================= */
/* ========================================================================== */

/* -----------------------------------------------------------------------------
 * CHOLMOD/GPU Module.  Copyright (C) 2005-2006, Timothy A. Davis
 * http://www.suitesparse.com
 * -------------------------------------------------------------------------- */

#include <stdio.h>
#include "SuiteSparse_config.h"

/* 64-bit version only */
#define Int SuiteSparse_long

extern "C" {

/* ------------------------------------------------------------------------ */
/* kernelCreateMap: build the inverse row map for a supernode.              */
/* d_Ls[psi+i] is the matrix row held in supernode row i; this kernel sets  */
/* d_Map[matrix row] = i so later kernels can scatter by matrix row.        */
/* Launch: 1-D grid, one thread per supernode row (nsrow threads).          */
/* ------------------------------------------------------------------------ */
__global__ void kernelCreateMap ( Int *d_Map, Int *d_Ls,
                                  Int psi, Int nsrow )
/*
    Ls [supernode row] = Matrix Row
*/
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if ( tid < nsrow )
    {
        d_Map [d_Ls [psi + tid]] = ((Int) (tid)) ;
    }
}

/* ------------------------------------------------------------------------ */
/* kernelCreateRelativeMap: for each of the ndrow rows of a descendant      */
/* supernode (listed in d_Ls starting at pdi1), look up its position in the */
/* current supernode via d_Map and record it in d_RelativeMap.              */
/* Launch: 1-D grid, one thread per descendant row (ndrow threads).         */
/* ------------------------------------------------------------------------ */
__global__ void kernelCreateRelativeMap ( Int *d_Map, Int *d_Ls,
                                          Int *d_RelativeMap,
                                          Int pdi1, Int ndrow )
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if ( tid < ndrow )
    {
        d_RelativeMap [tid] = d_Map [d_Ls [pdi1 + tid]] ;
    }
}

/* ------------------------------------------------------------------------ */
/* kernelAddUpdate: scatter-add the ndrow2-by-ndrow1 dense update C (column */
/* -major, leading dimension ndrow2) into the current supernode's front A   */
/* (column-major, leading dimension nsrow), remapping both row and column   */
/* indices through d_RelativeMap.                                           */
/* Launch: 2-D grid; x indexes rows of C (ndrow2), y indexes cols (ndrow1). */
/* ------------------------------------------------------------------------ */
__global__ void kernelAddUpdate ( double *d_A, double *devPtrC,
                                  Int *d_RelativeMap,
                                  Int ndrow1, Int ndrow2,
                                  Int nsrow )
{
    int idrow = blockIdx.x * blockDim.x + threadIdx.x;
    int idcol = blockIdx.y * blockDim.y + threadIdx.y;
    if ( idrow < ndrow2 && idcol < ndrow1 )
    {
        Int idx = d_RelativeMap [idrow] + d_RelativeMap [idcol] * nsrow ;
        d_A [idx] += devPtrC [idrow + ndrow2*idcol] ;
    }
}

/* ------------------------------------------------------------------------ */
/* kernelAddComplexUpdate: complex variant of kernelAddUpdate.  Each entry  */
/* is an interleaved (real, imaginary) pair of doubles, hence the *2 and    */
/* *2+1 indexing; both parts are accumulated.                               */
/* Launch: 2-D grid; x indexes rows of C (ndrow2), y indexes cols (ndrow1). */
/* ------------------------------------------------------------------------ */
__global__ void kernelAddComplexUpdate ( double *d_A, double *devPtrC,
                                         Int *d_RelativeMap,
                                         Int ndrow1, Int ndrow2,
                                         Int nsrow )
{
    int idrow = blockIdx.x * blockDim.x + threadIdx.x;
    int idcol = blockIdx.y * blockDim.y + threadIdx.y;
    if ( idrow < ndrow2 && idcol < ndrow1 )
    {
        Int idx = d_RelativeMap [idrow] + d_RelativeMap [idcol] * nsrow ;
        d_A [idx*2  ] += devPtrC [(idrow + ndrow2*idcol)*2  ] ;
        d_A [idx*2+1] += devPtrC [(idrow + ndrow2*idcol)*2+1] ;
    }
}

/* ------------------------------------------------------------------------ */
/* kernelSumA: a1 += alpha * a2 over an nsrow-by-nscol column-major array.  */
/* Launch: 2-D grid; x indexes rows (nsrow), y indexes columns (nscol).     */
/* ------------------------------------------------------------------------ */
__global__ void kernelSumA ( double *a1, double *a2, const double alpha,
                             int nsrow, int nscol )
{
    int isrow = blockIdx.x * blockDim.x + threadIdx.x;
    int iscol = blockIdx.y * blockDim.y + threadIdx.y;
    if ( isrow < nsrow && iscol < nscol )
    {
        Int idx = iscol*nsrow + isrow ;
        a1 [idx] += alpha * a2 [idx] ;
    }
}

/* ------------------------------------------------------------------------ */
/* kernelSumComplexA: complex variant of kernelSumA; entries are interleaved */
/* (real, imaginary) double pairs and the real scalar alpha scales both.    */
/* Launch: 2-D grid; x indexes rows (nsrow), y indexes columns (nscol).     */
/* ------------------------------------------------------------------------ */
__global__ void kernelSumComplexA ( double *a1, double *a2,
                                    const double alpha, int nsrow,
                                    int nscol )
{
    int isrow = blockIdx.x * blockDim.x + threadIdx.x;
    int iscol = blockIdx.y * blockDim.y + threadIdx.y;
    if ( isrow < nsrow && iscol < nscol )
    {
        Int idx = iscol*nsrow + isrow ;
        a1 [idx*2  ] += alpha * a2 [idx*2  ] ;
        a1 [idx*2+1] += alpha * a2 [idx*2+1] ;
    }
}

/* ======================================================================== */
/* createMapOnDevice:                                                       */
/* using Ls and Lpi data already on the device, construct Map               */
/* (launches kernelCreateMap on the default stream; 32-thread blocks).      */
/* Always returns 0; launch errors surface via cudaGetLastError later.      */
/* ======================================================================== */
int createMapOnDevice ( Int *d_Map, Int *d_Ls,
                        Int psi, Int nsrow )
{
    unsigned int kgrid  = (nsrow+31) / 32 ;     /* ceil (nsrow/32) blocks */
    unsigned int kblock = 32 ;
    kernelCreateMap <<<kgrid, kblock>>> ( d_Map, d_Ls, psi, nsrow ) ;
    return 0 ;
}

/* ======================================================================== */
/* createRelativeMapOnDevice:                                               */
/* launch kernelCreateRelativeMap on *astream (256-thread blocks).          */
/* Always returns 0.                                                        */
/* ======================================================================== */
int createRelativeMapOnDevice ( Int *d_Map, Int *d_Ls,
                                Int *d_RelativeMap, Int pdi1,
                                Int ndrow, cudaStream_t* astream )
{
    unsigned int kgrid  = (ndrow+255) / 256 ;   /* ceil (ndrow/256) blocks */
    unsigned int kblock = 256 ;
    kernelCreateRelativeMap <<<kgrid, kblock, 0, *astream>>>
        ( d_Map, d_Ls, d_RelativeMap, pdi1, ndrow ) ;
    return 0 ;
}

/* ======================================================================== */
int addUpdateOnDevice ( double *d_A, double *devPtrC,
                        Int *d_RelativeMap, Int ndrow1,
                        Int ndrow2, Int nsrow,
                        cudaStream_t* astream )
/* ======================================================================== */
/* Assemble the Schur complement from a descendant supernode into the       */
/* current supernode.  Launches kernelAddUpdate on *astream with 16x16      */
/* blocks tiling the ndrow2-by-ndrow1 update.  Always returns 0.            */
/* ======================================================================== */
{
    dim3 grids ;
    dim3 blocks ;

    blocks.x = 16 ;
    blocks.y = 16 ;
    blocks.z = 1 ;

    grids.x = (ndrow2+15) / 16 ;
    grids.y = (ndrow1+15) / 16 ;

    kernelAddUpdate <<<grids, blocks, 0, *astream>>>
        ( d_A, devPtrC, d_RelativeMap, ndrow1, ndrow2, nsrow ) ;

    return 0 ;
}

/* ======================================================================== */
int addComplexUpdateOnDevice ( double *d_A, double *devPtrC,
                               Int *d_RelativeMap, Int ndrow1,
                               Int ndrow2, Int nsrow,
                               cudaStream_t* astream )
/* ======================================================================== */
/* Assemble the Schur complement from a descendant supernode into the       */
/* current supernode (complex case).  Launches kernelAddComplexUpdate on    */
/* *astream with 16x16 blocks.  Always returns 0.                           */
/* ======================================================================== */
{
    dim3 grids ;
    dim3 blocks ;

    blocks.x = 16 ;
    blocks.y = 16 ;
    blocks.z = 1 ;

    grids.x = (ndrow2+15) / 16 ;
    grids.y = (ndrow1+15) / 16 ;

    kernelAddComplexUpdate <<<grids, blocks, 0, *astream>>>
        ( d_A, devPtrC, d_RelativeMap, ndrow1, ndrow2, nsrow ) ;

    return 0 ;
}

/* ======================================================================== */
/* sumAOnDevice: a1 += alpha*a2 on an nsrow-by-nscol array; launches        */
/* kernelSumA on the default stream with 16x16 blocks.  Always returns 0.   */
/* ======================================================================== */
int sumAOnDevice ( double *a1, double *a2, const double alpha,
                   int nsrow, int nscol )
{
    dim3 grids ;
    dim3 blocks ;
    blocks.x = 16 ;
    blocks.y = 16 ;
    blocks.z = 1 ;
    grids.x = (nsrow+15) / 16 ;
    grids.y = (nscol+15) / 16 ;
    kernelSumA <<<grids, blocks, 0, 0>>> ( a1, a2, alpha, nsrow, nscol ) ;
    return 0 ;
}

/* ======================================================================== */
/* sumComplexAOnDevice: complex variant of sumAOnDevice; launches           */
/* kernelSumComplexA on the default stream with 16x16 blocks.  Returns 0.   */
/* ======================================================================== */
int sumComplexAOnDevice ( double *a1, double *a2, const double alpha,
                          int nsrow, int nscol )
{
    dim3 grids ;
    dim3 blocks ;
    blocks.x = 16 ;
    blocks.y = 16 ;
    blocks.z = 1 ;
    grids.x = (nsrow+15) / 16 ;
    grids.y = (nscol+15) / 16 ;
    kernelSumComplexA <<<grids, blocks, 0, 0>>> ( a1, a2, alpha, nsrow, nscol ) ;
    return 0 ;
}

}