1 /* ========================================================================== */
2 /* ========================= CHOLMOD CUDA/C kernels ========================= */
3 /* ========================================================================== */
4 
5 /* -----------------------------------------------------------------------------
6  * CHOLMOD/GPU Module.  Copyright (C) 2005-2006, Timothy A. Davis
7  * http://www.suitesparse.com
8  * -------------------------------------------------------------------------- */
9 
10 #include <stdio.h>
11 #include "SuiteSparse_config.h"
12 /* 64-bit version only */
13 #define Int SuiteSparse_long
14 
15 extern "C" {
16 
kernelCreateMap(Int * d_Map,Int * d_Ls,Int psi,Int nsrow)17   __global__ void kernelCreateMap ( Int *d_Map, Int *d_Ls,
18 				    Int psi, Int nsrow )
19   /*
20     Ls[supernode row] = Matrix Row
21   */
22   {
23     int tid = blockIdx.x * blockDim.x + threadIdx.x;
24     if ( tid < nsrow ) {
25       d_Map[d_Ls[psi+tid]] = ((Int) (tid));
26     }
27   }
28 
kernelCreateRelativeMap(Int * d_Map,Int * d_Ls,Int * d_RelativeMap,Int pdi1,Int ndrow)29   __global__ void kernelCreateRelativeMap ( Int *d_Map, Int *d_Ls,
30 					    Int *d_RelativeMap,
31 					    Int pdi1, Int ndrow )
32   {
33     int tid = blockIdx.x * blockDim.x + threadIdx.x;
34     if ( tid < ndrow ) {
35       d_RelativeMap[tid] = d_Map[d_Ls[pdi1+tid]];
36     }
37   }
38 
kernelAddUpdate(double * d_A,double * devPtrC,Int * d_RelativeMap,Int ndrow1,Int ndrow2,Int nsrow)39   __global__ void kernelAddUpdate ( double *d_A, double *devPtrC,
40 				    Int *d_RelativeMap,
41 				    Int ndrow1, Int ndrow2,
42 				    Int nsrow )
43   {
44     int idrow = blockIdx.x * blockDim.x + threadIdx.x;
45     int idcol = blockIdx.y * blockDim.y + threadIdx.y;
46     if ( idrow < ndrow2  && idcol < ndrow1 ) {
47       Int idx = d_RelativeMap[idrow] + d_RelativeMap[idcol] * nsrow;
48       d_A[idx] += devPtrC[idrow+ndrow2*idcol];
49     }
50   }
51 
kernelAddComplexUpdate(double * d_A,double * devPtrC,Int * d_RelativeMap,Int ndrow1,Int ndrow2,Int nsrow)52   __global__ void kernelAddComplexUpdate ( double *d_A, double *devPtrC,
53 					   Int *d_RelativeMap,
54 					   Int ndrow1, Int ndrow2,
55 					   Int nsrow )
56   {
57     int idrow = blockIdx.x * blockDim.x + threadIdx.x;
58     int idcol = blockIdx.y * blockDim.y + threadIdx.y;
59     if ( idrow < ndrow2  && idcol < ndrow1 ) {
60       Int idx = d_RelativeMap[idrow] + d_RelativeMap[idcol] * nsrow;
61       d_A[idx*2] += devPtrC[(idrow+ndrow2*idcol)*2];
62       d_A[idx*2+1] += devPtrC[(idrow+ndrow2*idcol)*2+1];
63     }
64   }
65 
kernelSumA(double * a1,double * a2,const double alpha,int nsrow,int nscol)66   __global__ void kernelSumA ( double *a1, double *a2, const double alpha,
67 			       int nsrow, int nscol ) {
68     int isrow = blockIdx.x * blockDim.x + threadIdx.x;
69     int iscol = blockIdx.y * blockDim.y + threadIdx.y;
70     if ( isrow < nsrow && iscol < nscol ) {
71       Int idx = iscol*nsrow + isrow;
72       a1[idx] += alpha * a2[idx];
73     }
74   }
75 
kernelSumComplexA(double * a1,double * a2,const double alpha,int nsrow,int nscol)76   __global__ void kernelSumComplexA ( double *a1, double *a2,
77 				      const double alpha, int nsrow,
78 				      int nscol ) {
79     int isrow = blockIdx.x * blockDim.x + threadIdx.x;
80     int iscol = blockIdx.y * blockDim.y + threadIdx.y;
81     if ( isrow < nsrow && iscol < nscol ) {
82       Int idx = iscol*nsrow + isrow;
83       a1[idx*2] += alpha * a2[idx*2];
84       a1[idx*2+1] += alpha * a2[idx*2+1];
85     }
86   }
87 
88   /* ======================================================================== */
89   /* using Ls and Lpi data already on the device, construct Map */
90   /* ======================================================================== */
createMapOnDevice(Int * d_Map,Int * d_Ls,Int psi,Int nsrow)91   int createMapOnDevice ( Int *d_Map, Int *d_Ls,
92 			  Int  psi, Int nsrow )
93   {
94     unsigned int kgrid = (nsrow+31)/32;
95     unsigned int kblock = 32;
96     kernelCreateMap <<<kgrid, kblock>>> ( d_Map, d_Ls, psi, nsrow );
97     return 0;
98   }
99 
100 
createRelativeMapOnDevice(Int * d_Map,Int * d_Ls,Int * d_RelativeMap,Int pdi1,Int ndrow,cudaStream_t * astream)101   int createRelativeMapOnDevice ( Int *d_Map, Int *d_Ls,
102 				  Int *d_RelativeMap,Int  pdi1,
103 				  Int ndrow, cudaStream_t* astream )
104   {
105     unsigned int kgrid = (ndrow+255)/256;
106     unsigned int kblock = 256;
107     kernelCreateRelativeMap <<<kgrid, kblock, 0, *astream>>>
108       ( d_Map, d_Ls, d_RelativeMap, pdi1, ndrow);
109     return 0;
110   }
111 
112 
113   /* ======================================================================== */
addUpdateOnDevice(double * d_A,double * devPtrC,Int * d_RelativeMap,Int ndrow1,Int ndrow2,Int nsrow,cudaStream_t * astream)114   int addUpdateOnDevice ( double *d_A, double *devPtrC,
115 			  Int *d_RelativeMap, Int ndrow1,
116 			  Int ndrow2, Int nsrow,
117 			  cudaStream_t* astream )
118   /* ======================================================================== */
119   /* Assemble the Schur complment from a descendant supernode into the current
120      supernode */
121   /* ======================================================================== */
122 {
123   dim3 grids;
124   dim3 blocks;
125 
126   blocks.x = 16;
127   blocks.y = 16;
128   blocks.z = 1;
129 
130   grids.x = (ndrow2+15)/16;
131   grids.y = (ndrow1+15)/16;
132 
133   kernelAddUpdate <<<grids, blocks, 0, *astream>>>
134     ( d_A, devPtrC, d_RelativeMap, ndrow1, ndrow2, nsrow );
135 
136   return 0;
137 }
138 
139   /* ======================================================================== */
addComplexUpdateOnDevice(double * d_A,double * devPtrC,Int * d_RelativeMap,Int ndrow1,Int ndrow2,Int nsrow,cudaStream_t * astream)140   int addComplexUpdateOnDevice ( double *d_A, double *devPtrC,
141 				 Int *d_RelativeMap, Int ndrow1,
142 				 Int ndrow2, Int nsrow,
143 				 cudaStream_t* astream )
144   /* ======================================================================== */
145   /* Assemble the Schur complment from a descendant supernode into the current
146      supernode */
147   /* ======================================================================== */
148 {
149   dim3 grids;
150   dim3 blocks;
151 
152   blocks.x = 16;
153   blocks.y = 16;
154   blocks.z = 1;
155 
156   grids.x = (ndrow2+15)/16;
157   grids.y = (ndrow1+15)/16;
158 
159   kernelAddComplexUpdate <<<grids, blocks, 0, *astream>>>
160     ( d_A, devPtrC, d_RelativeMap, ndrow1, ndrow2, nsrow );
161 
162   return 0;
163 }
164 
sumAOnDevice(double * a1,double * a2,const double alpha,int nsrow,int nscol)165   int sumAOnDevice ( double *a1, double *a2, const double alpha,
166 		     int nsrow, int nscol )
167   {
168     dim3 grids;
169     dim3 blocks;
170     blocks.x = 16;
171     blocks.y = 16;
172     blocks.z = 1;
173     grids.x = (nsrow+15)/16;
174     grids.y = (nscol+15)/16;
175     kernelSumA <<<grids, blocks, 0, 0>>> ( a1, a2, alpha, nsrow, nscol );
176     return 0;
177   }
178 
sumComplexAOnDevice(double * a1,double * a2,const double alpha,int nsrow,int nscol)179   int sumComplexAOnDevice ( double *a1, double *a2, const double alpha,
180 			    int nsrow, int nscol )
181   {
182     dim3 grids;
183     dim3 blocks;
184     blocks.x = 16;
185     blocks.y = 16;
186     blocks.z = 1;
187     grids.x = (nsrow+15)/16;
188     grids.y = (nscol+15)/16;
189     kernelSumComplexA <<<grids, blocks, 0, 0>>> ( a1, a2, alpha, nsrow, nscol );
190     return 0;
191   }
192 
193 }
194