1 /*
2 Implements the sequential CUDA vectors.
3 */
4
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7
8 #include <petscconf.h>
9 #include <petsc/private/vecimpl.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/cudavecimpl.h>
12
13 #include <cuda_runtime.h>
14 #include <thrust/device_ptr.h>
15 #include <thrust/transform.h>
16 #include <thrust/functional.h>
17
18 /*
19 Allocates space for the vector array on the GPU if it does not exist.
20 Does NOT change the offload mask (v->offloadmask) of the vector
21 Does NOT zero the CUDA array
22
23 */
24 PetscErrorCode VecCUDAAllocateCheck(Vec v)
25 {
26 PetscErrorCode ierr;
27 cudaError_t err;
28 Vec_CUDA *veccuda;
29 PetscBool option_set;
30
31 PetscFunctionBegin;
32 if (!v->spptr) {
33 PetscReal pinned_memory_min;
34 ierr = PetscMalloc(sizeof(Vec_CUDA),&v->spptr);CHKERRQ(ierr);
35 veccuda = (Vec_CUDA*)v->spptr;
36 err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37 veccuda->GPUarray = veccuda->GPUarray_allocated;
38 veccuda->stream = 0; /* using default stream */
39 if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40 if (v->data && ((Vec_Seq*)v->data)->array) {
41 v->offloadmask = PETSC_OFFLOAD_CPU;
42 } else {
43 v->offloadmask = PETSC_OFFLOAD_GPU;
44 }
45 }
46 pinned_memory_min = 0;
47
48 /* Need to parse the command line here for the minimum allocation size at which to use pinned memory on the host.
49 Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
50 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");CHKERRQ(ierr);
51 ierr = PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);CHKERRQ(ierr);
52 if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53 ierr = PetscOptionsEnd();CHKERRQ(ierr);
54 }
55 PetscFunctionReturn(0);
56 }
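/*
   A minimal sketch of how the pinned-memory threshold parsed above is set in
   practice ("app" is a placeholder executable name; VecSetPinnedMemoryMin() is
   the functional equivalent listed in the VECSEQCUDA manual page below):

     ./app -vec_pinned_memory_min 4096                     // command-line form, size in bytes

     ierr = VecSetPinnedMemoryMin(v,4096);CHKERRQ(ierr);   // programmatic form
*/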
57
58 /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59 PetscErrorCode VecCUDACopyToGPU(Vec v)
60 {
61 PetscErrorCode ierr;
62 cudaError_t err;
63 Vec_CUDA *veccuda;
64 PetscScalar *varray;
65
66 PetscFunctionBegin;
67 PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
68 ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr);
69 if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70 ierr = PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);CHKERRQ(ierr);
71 veccuda = (Vec_CUDA*)v->spptr;
72 varray = veccuda->GPUarray;
73 err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74 ierr = PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));CHKERRQ(ierr);
75 ierr = PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);CHKERRQ(ierr);
76 v->offloadmask = PETSC_OFFLOAD_BOTH;
77 }
78 PetscFunctionReturn(0);
79 }
80
81 PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
82 {
83 PetscScalar *varray;
84 PetscErrorCode ierr;
85 cudaError_t err;
86 PetscScalar *cpuPtr, *gpuPtr;
87 Vec_Seq *s;
88 VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
89 PetscInt lowestIndex,n;
90
91 PetscFunctionBegin;
92 PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
93 ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr);
94 if (v->offloadmask == PETSC_OFFLOAD_CPU) {
95 s = (Vec_Seq*)v->data;
96 if (mode & SCATTER_REVERSE) {
97 lowestIndex = ptop_scatter->sendLowestIndex;
98 n = ptop_scatter->ns;
99 } else {
100 lowestIndex = ptop_scatter->recvLowestIndex;
101 n = ptop_scatter->nr;
102 }
103
104 ierr = PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
105 varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106 gpuPtr = varray + lowestIndex;
107 cpuPtr = s->array + lowestIndex;
108
109 /* Note: this code copies the smallest contiguous chunk of data
110 containing ALL of the indices */
111 err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
113
114 /* Set the buffer states */
115 v->offloadmask = PETSC_OFFLOAD_BOTH;
116 ierr = PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
117 }
118 PetscFunctionReturn(0);
119 }
120
121 /*
122 VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
123 */
124 PetscErrorCode VecCUDACopyFromGPU(Vec v)
125 {
126 PetscErrorCode ierr;
127 cudaError_t err;
128 Vec_CUDA *veccuda;
129 PetscScalar *varray;
130
131 PetscFunctionBegin;
132 PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
133 ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
134 if (v->offloadmask == PETSC_OFFLOAD_GPU) {
135 ierr = PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
136 veccuda = (Vec_CUDA*)v->spptr;
137 varray = veccuda->GPUarray;
138 err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
139 ierr = PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));CHKERRQ(ierr);
140 ierr = PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
141 v->offloadmask = PETSC_OFFLOAD_BOTH;
142 }
143 PetscFunctionReturn(0);
144 }
145
146 /* Note that this function copies only *some* of the values from the GPU to the CPU,
147 which means that we need to recombine the data at some point before using any of the standard functions.
148 We could add a few more flag types to keep track of this, or treat these calls like VecGetArray()/VecRestoreArray(),
149 which must always be called in pairs.
150 */
151 PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
152 {
153 const PetscScalar *varray, *gpuPtr;
154 PetscErrorCode ierr;
155 cudaError_t err;
156 PetscScalar *cpuPtr;
157 Vec_Seq *s;
158 VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
159 PetscInt lowestIndex,n;
160
161 PetscFunctionBegin;
162 PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
163 ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
164 if (v->offloadmask == PETSC_OFFLOAD_GPU) {
165 ierr = PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
166 if (mode & SCATTER_REVERSE) {
167 lowestIndex = ptop_scatter->recvLowestIndex;
168 n = ptop_scatter->nr;
169 } else {
170 lowestIndex = ptop_scatter->sendLowestIndex;
171 n = ptop_scatter->ns;
172 }
173
174 varray=((Vec_CUDA*)v->spptr)->GPUarray;
175 s = (Vec_Seq*)v->data;
176 gpuPtr = varray + lowestIndex;
177 cpuPtr = s->array + lowestIndex;
178
179 /* Note: this code copies the smallest contiguous chunk of data
180 containing ALL of the indices */
181 err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
182 ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
183
184 ierr = VecCUDARestoreArrayRead(v,&varray);CHKERRQ(ierr);
185 ierr = PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
186 }
187 PetscFunctionReturn(0);
188 }
189
190 /*MC
191 VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
192
193 Options Database Keys:
194 . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
195
196 Level: beginner
197
198 .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
199 M*/
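/*
   A minimal usage sketch for VECSEQCUDA, assuming a CUDA-enabled PETSc build
   (kept as a comment; it is not compiled as part of this file):

     Vec x;
     ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);  // or -vec_type seqcuda with VecSetFromOptions()
     ierr = VecSet(x,1.0);CHKERRQ(ierr);             // dispatches to VecSet_SeqCUDA() below
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/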
200
201 PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
202 {
203 const PetscScalar *xarray;
204 PetscScalar *yarray;
205 PetscErrorCode ierr;
206 PetscBLASInt one = 1,bn = 0;
207 PetscScalar sone = 1.0;
208 cublasHandle_t cublasv2handle;
209 cublasStatus_t cberr;
210 cudaError_t err;
211
212 PetscFunctionBegin;
213 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
214 ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
215 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
216 ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
217 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
218 if (alpha == (PetscScalar)0.0) {
219 err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
220 } else if (alpha == (PetscScalar)1.0) {
221 cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
222 ierr = PetscLogGpuFlops(1.0*yin->map->n);CHKERRQ(ierr);
223 } else {
224 cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
225 cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
226 ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
227 }
228 err = WaitForCUDA();CHKERRCUDA(err);
229 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
230 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
231 ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
232 PetscFunctionReturn(0);
233 }
234
235 PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
236 {
237 const PetscScalar *xarray;
238 PetscScalar *yarray;
239 PetscErrorCode ierr;
240 PetscBLASInt one = 1,bn = 0;
241 cublasHandle_t cublasv2handle;
242 cublasStatus_t cberr;
243 PetscBool yiscuda,xiscuda;
244 cudaError_t err;
245
246 PetscFunctionBegin;
247 if (alpha == (PetscScalar)0.0) PetscFunctionReturn(0);
248 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
249 ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
250 ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
251 if (xiscuda && yiscuda) {
252 ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
253 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
254 ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
255 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256 cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
257 err = WaitForCUDA();CHKERRCUDA(err);
258 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
259 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
260 ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
261 ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
262 } else {
263 ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
264 }
265 PetscFunctionReturn(0);
266 }
267
268 PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
269 {
270 PetscInt n = xin->map->n;
271 const PetscScalar *xarray=NULL,*yarray=NULL;
272 PetscScalar *warray=NULL;
273 thrust::device_ptr<const PetscScalar> xptr,yptr;
274 thrust::device_ptr<PetscScalar> wptr;
275 PetscErrorCode ierr;
276 cudaError_t err;
277
278 PetscFunctionBegin;
279 ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
280 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
281 ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
282 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
283 try {
284 wptr = thrust::device_pointer_cast(warray);
285 xptr = thrust::device_pointer_cast(xarray);
286 yptr = thrust::device_pointer_cast(yarray);
287 thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
288 err = WaitForCUDA();CHKERRCUDA(err);
289 } catch (char *ex) {
290 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
291 }
292 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293 ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
294 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
295 ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
296 ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
297 PetscFunctionReturn(0);
298 }
299
300 PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
301 {
302 const PetscScalar *xarray=NULL,*yarray=NULL;
303 PetscScalar *warray=NULL;
304 PetscErrorCode ierr;
305 PetscBLASInt one = 1,bn = 0;
306 cublasHandle_t cublasv2handle;
307 cublasStatus_t cberr;
308 cudaError_t err;
309
310 PetscFunctionBegin;
311 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
312 ierr = PetscBLASIntCast(win->map->n,&bn);CHKERRQ(ierr);
313 if (alpha == (PetscScalar)0.0) {
314 ierr = VecCopy_SeqCUDA(yin,win);CHKERRQ(ierr);
315 } else {
316 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
317 ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
318 ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
319 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
320 err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
321 cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
322 err = WaitForCUDA();CHKERRCUDA(err);
323 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324 ierr = PetscLogGpuFlops(2*win->map->n);CHKERRQ(ierr);
325 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
326 ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
327 ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
328 }
329 PetscFunctionReturn(0);
330 }
331
332 PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
333 {
334 PetscErrorCode ierr;
335 cudaError_t err;
336 PetscInt n = xin->map->n,j,j_rem;
337 PetscScalar alpha0,alpha1,alpha2,alpha3;
338
339 PetscFunctionBegin;
340 ierr = PetscLogGpuFlops(nv*2.0*n);CHKERRQ(ierr);
341 switch (j_rem=nv&0x3) {
342 case 3:
343 alpha0 = alpha[0];
344 alpha1 = alpha[1];
345 alpha2 = alpha[2];
346 alpha += 3;
347 ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
348 ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
349 ierr = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
350 y += 3;
351 break;
352 case 2:
353 alpha0 = alpha[0];
354 alpha1 = alpha[1];
355 alpha +=2;
356 ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
357 ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
358 y +=2;
359 break;
360 case 1:
361 alpha0 = *alpha++;
362 ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
363 y +=1;
364 break;
365 }
366 for (j=j_rem; j<nv; j+=4) {
367 alpha0 = alpha[0];
368 alpha1 = alpha[1];
369 alpha2 = alpha[2];
370 alpha3 = alpha[3];
371 alpha += 4;
372 ierr = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
373 ierr = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
374 ierr = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
375 ierr = VecAXPY_SeqCUDA(xin,alpha3,y[3]);CHKERRQ(ierr);
376 y += 4;
377 }
378 err = WaitForCUDA();CHKERRCUDA(err);
379 PetscFunctionReturn(0);
380 }
381
382 PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
383 {
384 const PetscScalar *xarray,*yarray;
385 PetscErrorCode ierr;
386 PetscBLASInt one = 1,bn = 0;
387 cublasHandle_t cublasv2handle;
388 cublasStatus_t cerr;
389
390 PetscFunctionBegin;
391 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
392 ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
393 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
394 ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
395 /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
396 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397 cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
398 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
399 if (xin->map->n >0) {
400 ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
401 }
402 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
403 ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
404 PetscFunctionReturn(0);
405 }
406
407 //
408 // CUDA kernels for MDot to follow
409 //
410
411 // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
412 #define MDOT_WORKGROUP_SIZE 128
413 #define MDOT_WORKGROUP_NUM 128
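// The kernels below use a two-stage reduction: block b accumulates a partial
// dot product for each right-hand-side vector k and stores it at
// group_results[b + k*gridDim.x]; the host finishes the sum with a loop of the
// form (a sketch of what VecMDot_SeqCUDA() does further down; M = number of y vectors):
//
//   for (j=0; j<M; ++j) {
//     z[j] = 0;
//     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
//   }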
414
415 #if !defined(PETSC_USE_COMPLEX)
416 // M = 2:
417 __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
418 PetscInt size, PetscScalar *group_results)
419 {
420 __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
421 PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
422 entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
423 PetscInt vec_start_index = blockIdx.x * entries_per_group;
424 PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
425
426 PetscScalar entry_x = 0;
427 PetscScalar group_sum0 = 0;
428 PetscScalar group_sum1 = 0;
429 for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
430 entry_x = x[i]; // load only once from global memory!
431 group_sum0 += entry_x * y0[i];
432 group_sum1 += entry_x * y1[i];
433 }
434 tmp_buffer[threadIdx.x] = group_sum0;
435 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
436
437 // parallel reduction
438 for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
439 __syncthreads();
440 if (threadIdx.x < stride) {
441 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
442 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
443 }
444 }
445
446 // write result of group to group_results
447 if (threadIdx.x == 0) {
448 group_results[blockIdx.x] = tmp_buffer[0];
449 group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
450 }
451 }
452
453 // M = 3:
454 __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
455 PetscInt size, PetscScalar *group_results)
456 {
457 __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
458 PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
459 entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
460 PetscInt vec_start_index = blockIdx.x * entries_per_group;
461 PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
462
463 PetscScalar entry_x = 0;
464 PetscScalar group_sum0 = 0;
465 PetscScalar group_sum1 = 0;
466 PetscScalar group_sum2 = 0;
467 for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
468 entry_x = x[i]; // load only once from global memory!
469 group_sum0 += entry_x * y0[i];
470 group_sum1 += entry_x * y1[i];
471 group_sum2 += entry_x * y2[i];
472 }
473 tmp_buffer[threadIdx.x] = group_sum0;
474 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
475 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
476
477 // parallel reduction
478 for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479 __syncthreads();
480 if (threadIdx.x < stride) {
481 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
482 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
483 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484 }
485 }
486
487 // write result of group to group_results
488 if (threadIdx.x == 0) {
489 group_results[blockIdx.x ] = tmp_buffer[0];
490 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
491 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
492 }
493 }
494
495 // M = 4:
496 __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
497 PetscInt size, PetscScalar *group_results)
498 {
499 __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
500 PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
501 entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
502 PetscInt vec_start_index = blockIdx.x * entries_per_group;
503 PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
504
505 PetscScalar entry_x = 0;
506 PetscScalar group_sum0 = 0;
507 PetscScalar group_sum1 = 0;
508 PetscScalar group_sum2 = 0;
509 PetscScalar group_sum3 = 0;
510 for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
511 entry_x = x[i]; // load only once from global memory!
512 group_sum0 += entry_x * y0[i];
513 group_sum1 += entry_x * y1[i];
514 group_sum2 += entry_x * y2[i];
515 group_sum3 += entry_x * y3[i];
516 }
517 tmp_buffer[threadIdx.x] = group_sum0;
518 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
519 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
520 tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
521
522 // parallel reduction
523 for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
524 __syncthreads();
525 if (threadIdx.x < stride) {
526 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
527 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
528 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
529 tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
530 }
531 }
532
533 // write result of group to group_results
534 if (threadIdx.x == 0) {
535 group_results[blockIdx.x ] = tmp_buffer[0];
536 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
537 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
538 group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
539 }
540 }
541
542 // M = 8:
543 __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
544 const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
545 PetscInt size, PetscScalar *group_results)
546 {
547 __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
548 PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
549 entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
550 PetscInt vec_start_index = blockIdx.x * entries_per_group;
551 PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
552
553 PetscScalar entry_x = 0;
554 PetscScalar group_sum0 = 0;
555 PetscScalar group_sum1 = 0;
556 PetscScalar group_sum2 = 0;
557 PetscScalar group_sum3 = 0;
558 PetscScalar group_sum4 = 0;
559 PetscScalar group_sum5 = 0;
560 PetscScalar group_sum6 = 0;
561 PetscScalar group_sum7 = 0;
562 for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
563 entry_x = x[i]; // load only once from global memory!
564 group_sum0 += entry_x * y0[i];
565 group_sum1 += entry_x * y1[i];
566 group_sum2 += entry_x * y2[i];
567 group_sum3 += entry_x * y3[i];
568 group_sum4 += entry_x * y4[i];
569 group_sum5 += entry_x * y5[i];
570 group_sum6 += entry_x * y6[i];
571 group_sum7 += entry_x * y7[i];
572 }
573 tmp_buffer[threadIdx.x] = group_sum0;
574 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
575 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
576 tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
577 tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
578 tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
579 tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
580 tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
581
582 // parallel reduction
583 for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
584 __syncthreads();
585 if (threadIdx.x < stride) {
586 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
587 tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
588 tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
589 tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
590 tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
591 tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
592 tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
593 tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
594 }
595 }
596
597 // write result of group to group_results
598 if (threadIdx.x == 0) {
599 group_results[blockIdx.x ] = tmp_buffer[0];
600 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
601 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
602 group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
603 group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
604 group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
605 group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
606 group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
607 }
608 }
609 #endif /* !defined(PETSC_USE_COMPLEX) */
610
611 PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
612 {
613 PetscErrorCode ierr;
614 PetscInt i,n = xin->map->n,current_y_index = 0;
615 const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
616 PetscScalar *group_results_gpu;
617 #if !defined(PETSC_USE_COMPLEX)
618 PetscInt j;
619 PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
620 #endif
621 cudaError_t cuda_ierr;
622 PetscBLASInt one = 1,bn = 0;
623 cublasHandle_t cublasv2handle;
624 cublasStatus_t cberr;
625
626 PetscFunctionBegin;
627 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
628 ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
629 if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
630 /* Handle the case of local size zero first */
631 if (!xin->map->n) {
632 for (i=0; i<nv; ++i) z[i] = 0;
633 PetscFunctionReturn(0);
634 }
635
636 // allocate scratchpad memory for the results of individual work groups:
637 cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
638
639 ierr = VecCUDAGetArrayRead(xin,&xptr);CHKERRQ(ierr);
640
641 while (current_y_index < nv)
642 {
643 switch (nv - current_y_index) {
644
645 case 7:
646 case 6:
647 case 5:
648 case 4:
649 ierr = VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
650 ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
651 ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
652 ierr = VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
653 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
654 #if defined(PETSC_USE_COMPLEX)
655 cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
656 cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
657 cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
658 cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
659 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
660 #else
661 // run kernel:
662 VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
663 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
664
665 // copy results back to the host
666 cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
667
668 // sum group results into z:
669 for (j=0; j<4; ++j) {
670 z[current_y_index + j] = 0;
671 for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
672 }
673 #endif
674 ierr = VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
675 ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
676 ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
677 ierr = VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
678 current_y_index += 4;
679 break;
680
681 case 3:
682 ierr = VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
683 ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
684 ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
685
686 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
687 #if defined(PETSC_USE_COMPLEX)
688 cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
689 cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
690 cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
691 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
692 #else
693 // run kernel:
694 VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
695 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
696
697 // copy results back to the host
698 cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
699
700 // sum group results into z:
701 for (j=0; j<3; ++j) {
702 z[current_y_index + j] = 0;
703 for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
704 }
705 #endif
706 ierr = VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
707 ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
708 ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
709 current_y_index += 3;
710 break;
711
712 case 2:
713 ierr = VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
714 ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
715 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
716 #if defined(PETSC_USE_COMPLEX)
717 cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
718 cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
719 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
720 #else
721 // run kernel:
722 VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
723 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
724
725 // copy results back to the host
726 cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
727
728 // sum group results into z:
729 for (j=0; j<2; ++j) {
730 z[current_y_index + j] = 0;
731 for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
732 }
733 #endif
734 ierr = VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
735 ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
736 current_y_index += 2;
737 break;
738
739 case 1:
740 ierr = VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
741 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
742 cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
743 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
744 ierr = VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
745 current_y_index += 1;
746 break;
747
748 default: // 8 or more vectors left
749 ierr = VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
750 ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
751 ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
752 ierr = VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
753 ierr = VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);CHKERRQ(ierr);
754 ierr = VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);CHKERRQ(ierr);
755 ierr = VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);CHKERRQ(ierr);
756 ierr = VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);CHKERRQ(ierr);
757 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
758 #if defined(PETSC_USE_COMPLEX)
759 cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
760 cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
761 cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
762 cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
763 cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
764 cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
765 cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
766 cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
767 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
768 #else
769 // run kernel:
770 VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
771 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
772
773 // copy results back to the host
774 cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
775
776 // sum group results into z:
777 for (j=0; j<8; ++j) {
778 z[current_y_index + j] = 0;
779 for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
780 }
781 #endif
782 ierr = VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);CHKERRQ(ierr);
783 ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
784 ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
785 ierr = VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
786 ierr = VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);CHKERRQ(ierr);
787 ierr = VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);CHKERRQ(ierr);
788 ierr = VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);CHKERRQ(ierr);
789 ierr = VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);CHKERRQ(ierr);
790 current_y_index += 8;
791 break;
792 }
793 }
794 ierr = VecCUDARestoreArrayRead(xin,&xptr);CHKERRQ(ierr);
795
796 cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
797 ierr = PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
798 PetscFunctionReturn(0);
799 }
800
801 #undef MDOT_WORKGROUP_SIZE
802 #undef MDOT_WORKGROUP_NUM
803
804 PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
805 {
806 PetscInt n = xin->map->n;
807 PetscScalar *xarray = NULL;
808 thrust::device_ptr<PetscScalar> xptr;
809 PetscErrorCode ierr;
810 cudaError_t err;
811
812 PetscFunctionBegin;
813 ierr = VecCUDAGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
814 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
815 if (alpha == (PetscScalar)0.0) {
816 err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
817 } else {
818 try {
819 xptr = thrust::device_pointer_cast(xarray);
820 thrust::fill(xptr,xptr+n,alpha);
821 } catch (char *ex) {
822 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
823 }
824 }
825 err = WaitForCUDA();CHKERRCUDA(err);
826 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
827 ierr = VecCUDARestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
828 PetscFunctionReturn(0);
829 }
830
831 PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
832 {
833 PetscScalar *xarray;
834 PetscErrorCode ierr;
835 PetscBLASInt one = 1,bn = 0;
836 cublasHandle_t cublasv2handle;
837 cublasStatus_t cberr;
838 cudaError_t err;
839
840 PetscFunctionBegin;
841 if (alpha == (PetscScalar)0.0) {
842 ierr = VecSet_SeqCUDA(xin,alpha);CHKERRQ(ierr);
843 err = WaitForCUDA();CHKERRCUDA(err);
844 } else if (alpha != (PetscScalar)1.0) {
845 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
846 ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
847 ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
848 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
849 cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
850 ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
851 err = WaitForCUDA();CHKERRCUDA(err);
852 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
853 }
854 ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
855 PetscFunctionReturn(0);
856 }
857
858 PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
859 {
860 const PetscScalar *xarray,*yarray;
861 PetscErrorCode ierr;
862 PetscBLASInt one = 1,bn = 0;
863 cublasHandle_t cublasv2handle;
864 cublasStatus_t cerr;
865
866 PetscFunctionBegin;
867 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
868 ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
869 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
870 ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
871 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
872 cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
873 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
874 if (xin->map->n > 0) {
875 ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
876 }
877 ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
878 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
879 PetscFunctionReturn(0);
880 }
881
882 PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
883 {
884 const PetscScalar *xarray;
885 PetscScalar *yarray;
886 PetscErrorCode ierr;
887 cudaError_t err;
888
889 PetscFunctionBegin;
890 if (xin != yin) {
891 if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
892 PetscBool yiscuda;
893
894 ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
895 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
896 if (yiscuda) {
897 ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
898 } else {
899 ierr = VecGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
900 }
901 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
902 if (yiscuda) {
903 err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
904 } else {
905 err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
906 }
907 err = WaitForCUDA();CHKERRCUDA(err);
908 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
909 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
910 if (yiscuda) {
911 ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
912 } else {
913 ierr = VecRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
914 }
915 } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
916 /* copy on the CPU since we are on the CPU */
917 ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
918 } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
919 /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
920 if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
921 /* copy on the CPU */
922 ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
923 } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
924 /* copy on the GPU */
925 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
926 ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
927 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
928 err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
929 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
930 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
931 ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
932 } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
933 /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
934 default to copy on the GPU (this is an arbitrary choice) */
935 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
936 ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
937 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
938 err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
939 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
940 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
941 ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
942 } else {
943 ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
944 }
945 }
946 }
947 PetscFunctionReturn(0);
948 }
949
950 PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
951 {
952 PetscErrorCode ierr;
953 PetscBLASInt one = 1,bn = 0;
954 PetscScalar *xarray,*yarray;
955 cublasHandle_t cublasv2handle;
956 cublasStatus_t cberr;
957 cudaError_t err;
958
959 PetscFunctionBegin;
960 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
961 ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
962 if (xin != yin) {
963 ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
964 ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
965 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
966 cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
967 err = WaitForCUDA();CHKERRCUDA(err);
968 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
969 ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
970 ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
971 }
972 PetscFunctionReturn(0);
973 }
974
975 PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
976 {
977 PetscErrorCode ierr;
978 PetscScalar a = alpha,b = beta;
979 const PetscScalar *xarray;
980 PetscScalar *yarray;
981 PetscBLASInt one = 1, bn = 0;
982 cublasHandle_t cublasv2handle;
983 cublasStatus_t cberr;
984 cudaError_t err;
985
986 PetscFunctionBegin;
987 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
988 ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
989 if (a == (PetscScalar)0.0) {
990 ierr = VecScale_SeqCUDA(yin,beta);CHKERRQ(ierr);
991 } else if (b == (PetscScalar)1.0) {
992 ierr = VecAXPY_SeqCUDA(yin,alpha,xin);CHKERRQ(ierr);
993 } else if (a == (PetscScalar)1.0) {
994 ierr = VecAYPX_SeqCUDA(yin,beta,xin);CHKERRQ(ierr);
995 } else if (b == (PetscScalar)0.0) {
996 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
997 ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
998 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
999 err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
1000 cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
1001 err = WaitForCUDA();CHKERRCUDA(err);
1002 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1003 ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
1004 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1005 ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
1006 } else {
1007 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
1008 ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
1009 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1010 cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1011 cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1012 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1013 ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
1014 err = WaitForCUDA();CHKERRCUDA(err);
1015 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1016 ierr = PetscLogGpuFlops(3.0*xin->map->n);CHKERRQ(ierr);
1017 }
1018 PetscFunctionReturn(0);
1019 }
1020
1021 PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1022 {
1023 PetscErrorCode ierr;
1024 cudaError_t err;
1025 PetscInt n = zin->map->n;
1026
1027 PetscFunctionBegin;
1028 if (gamma == (PetscScalar)1.0) {
1029 /* z = a*x + b*y + z */
1030 ierr = VecAXPY_SeqCUDA(zin,alpha,xin);CHKERRQ(ierr);
1031 ierr = VecAXPY_SeqCUDA(zin,beta,yin);CHKERRQ(ierr);
1032 ierr = PetscLogGpuFlops(4.0*n);CHKERRQ(ierr);
1033 } else {
1034 /* z = a*x + b*y + c*z */
1035 ierr = VecScale_SeqCUDA(zin,gamma);CHKERRQ(ierr);
1036 ierr = VecAXPY_SeqCUDA(zin,alpha,xin);CHKERRQ(ierr);
1037 ierr = VecAXPY_SeqCUDA(zin,beta,yin);CHKERRQ(ierr);
1038 ierr = PetscLogGpuFlops(5.0*n);CHKERRQ(ierr);
1039 }
1040 err = WaitForCUDA();CHKERRCUDA(err);
1041 PetscFunctionReturn(0);
1042 }
1043
1044 PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1045 {
1046 PetscInt n = win->map->n;
1047 const PetscScalar *xarray,*yarray;
1048 PetscScalar *warray;
1049 thrust::device_ptr<const PetscScalar> xptr,yptr;
1050 thrust::device_ptr<PetscScalar> wptr;
1051 PetscErrorCode ierr;
1052 cudaError_t err;
1053
1054 PetscFunctionBegin;
1055 ierr = VecCUDAGetArray(win,&warray);CHKERRQ(ierr);
1056 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
1057 ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
1058 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1059 try {
1060 wptr = thrust::device_pointer_cast(warray);
1061 xptr = thrust::device_pointer_cast(xarray);
1062 yptr = thrust::device_pointer_cast(yarray);
1063 thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1064 err = WaitForCUDA();CHKERRCUDA(err);
1065 } catch (char *ex) {
1066 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1067 }
1068 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1069 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1070 ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
1071 ierr = VecCUDARestoreArray(win,&warray);CHKERRQ(ierr);
1072 ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
1073 PetscFunctionReturn(0);
1074 }
1075
1076 /* TODO: compute the infinity norm entirely in CUDA; cublasIXamax() below only locates the index of the largest entry, so one element is still copied back to the host */
1077
1078 PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1079 {
1080 PetscErrorCode ierr;
1081 PetscInt n = xin->map->n;
1082 PetscBLASInt one = 1, bn = 0;
1083 const PetscScalar *xarray;
1084 cublasHandle_t cublasv2handle;
1085 cublasStatus_t cberr;
1086 cudaError_t err;
1087
1088 PetscFunctionBegin;
1089 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1090 ierr = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
1091 if (type == NORM_2 || type == NORM_FROBENIUS) {
1092 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
1093 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1094 cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1095 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1096 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1097 ierr = PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
1098 } else if (type == NORM_INFINITY) {
1099 int i;
1100 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
1101 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1102 cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1103 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1104 if (bn) {
1105 PetscScalar zs;
1106 err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1107 *z = PetscAbsScalar(zs);
1108 } else *z = 0.0;
1109 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1110 } else if (type == NORM_1) {
1111 ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
1112 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1113 cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1114 ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
1115 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1116 ierr = PetscLogGpuFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
1117 } else if (type == NORM_1_AND_2) {
1118 ierr = VecNorm_SeqCUDA(xin,NORM_1,z);CHKERRQ(ierr);
1119 ierr = VecNorm_SeqCUDA(xin,NORM_2,z+1);CHKERRQ(ierr);
1120 }
1121 PetscFunctionReturn(0);
1122 }
1123
1124 PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1125 {
1126 PetscErrorCode ierr;
1127 cudaError_t err;
1128 PetscReal n = s->map->n;
1129 const PetscScalar *sarray,*tarray;
1130
1131 PetscFunctionBegin;
1132 ierr = VecCUDAGetArrayRead(s,&sarray);CHKERRQ(ierr);
1133 ierr = VecCUDAGetArrayRead(t,&tarray);CHKERRQ(ierr);
1134 ierr = VecDot_SeqCUDA(s,t,dp);CHKERRQ(ierr);
1135 ierr = VecDot_SeqCUDA(t,t,nm);CHKERRQ(ierr);
1136 ierr = VecCUDARestoreArrayRead(s,&sarray);CHKERRQ(ierr);
1137 ierr = VecCUDARestoreArrayRead(t,&tarray);CHKERRQ(ierr);
1138 err = WaitForCUDA();CHKERRCUDA(err);
1139 ierr = PetscLogGpuFlops(4.0*n);CHKERRQ(ierr);
1140 PetscFunctionReturn(0);
1141 }
1142
1143 PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1144 {
1145 PetscErrorCode ierr;
1146 cudaError_t err;
1147
1148 PetscFunctionBegin;
1149 if (v->spptr) {
1150 if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1151 err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1152 ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1153 }
1154 if (((Vec_CUDA*)v->spptr)->stream) {
1155 err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1156 }
1157 }
1158 ierr = VecDestroy_SeqCUDA_Private(v);CHKERRQ(ierr);
1159 ierr = PetscFree(v->spptr);CHKERRQ(ierr);
1160 PetscFunctionReturn(0);
1161 }
1162
1163 #if defined(PETSC_USE_COMPLEX)
1164 struct conjugate
1165 {
1166 __host__ __device__
1167 PetscScalar operator()(PetscScalar x)
1168 {
1169 return PetscConj(x);
1170 }
1171 };
1172 #endif
1173
1174 PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1175 {
1176 #if defined(PETSC_USE_COMPLEX)
1177 PetscScalar *xarray;
1178 PetscErrorCode ierr;
1179 PetscInt n = xin->map->n;
1180 thrust::device_ptr<PetscScalar> xptr;
1181 cudaError_t err;
1182
1183 PetscFunctionBegin;
1184 ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
1185 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1186 try {
1187 xptr = thrust::device_pointer_cast(xarray);
1188 thrust::transform(xptr,xptr+n,xptr,conjugate());
1189 err = WaitForCUDA();CHKERRCUDA(err);
1190 } catch (char *ex) {
1191 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1192 }
1193 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1194 ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
1195 #else
1196 PetscFunctionBegin;
1197 #endif
1198 PetscFunctionReturn(0);
1199 }
1200
1201 PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1202 {
1203 PetscErrorCode ierr;
1204 cudaError_t err;
1205
1206 PetscFunctionBegin;
1207 PetscValidHeaderSpecific(v,VEC_CLASSID,1);
1208 PetscValidHeaderSpecific(w,VEC_CLASSID,2);
1209 PetscCheckTypeName(w,VECSEQCUDA);
1210
1211 if (w->data) {
1212 if (((Vec_Seq*)w->data)->array_allocated) {
1213 if (w->pinned_memory) {
1214 ierr = PetscMallocSetCUDAHost();CHKERRQ(ierr);
1215 }
1216 ierr = PetscFree(((Vec_Seq*)w->data)->array_allocated);CHKERRQ(ierr);
1217 if (w->pinned_memory) {
1218 ierr = PetscMallocResetCUDAHost();CHKERRQ(ierr);
1219 w->pinned_memory = PETSC_FALSE;
1220 }
1221 }
1222 ((Vec_Seq*)w->data)->array = NULL;
1223 ((Vec_Seq*)w->data)->unplacedarray = NULL;
1224 }
1225 if (w->spptr) {
1226 if (((Vec_CUDA*)w->spptr)->GPUarray) {
1227 err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1228 ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1229 }
1230 if (((Vec_CUDA*)w->spptr)->stream) { /* guard on w, the vector whose stream is destroyed */
1231 err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1232 }
1233 ierr = PetscFree(w->spptr);CHKERRQ(ierr);
1234 }
1235
1236 if (v->petscnative) {
1237 ierr = PetscFree(w->data);CHKERRQ(ierr);
1238 w->data = v->data;
1239 w->offloadmask = v->offloadmask;
1240 w->pinned_memory = v->pinned_memory;
1241 w->spptr = v->spptr;
1242 ierr = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
1243 } else {
1244 ierr = VecGetArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
1245 w->offloadmask = PETSC_OFFLOAD_CPU;
1246 ierr = VecCUDAAllocateCheck(w);CHKERRQ(ierr);
1247 }
1248 PetscFunctionReturn(0);
1249 }
1250
1251 PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1252 {
1253 PetscErrorCode ierr;
1254 cudaError_t err;
1255
1256 PetscFunctionBegin;
1257 PetscValidHeaderSpecific(v,VEC_CLASSID,1);
1258 PetscValidHeaderSpecific(w,VEC_CLASSID,2);
1259 PetscCheckTypeName(w,VECSEQCUDA);
1260
1261 if (v->petscnative) {
1262 v->data = w->data;
1263 v->offloadmask = w->offloadmask;
1264 v->pinned_memory = w->pinned_memory;
1265 v->spptr = w->spptr;
1266 ierr = VecCUDACopyFromGPU(v);CHKERRQ(ierr);
1267 ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
1268 w->data = 0;
1269 w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1270 w->spptr = 0;
1271 } else {
1272 ierr = VecRestoreArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
1273 if (w->spptr) {
1274 err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1275 ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1276 if (((Vec_CUDA*)w->spptr)->stream) { /* guard on w, the vector whose stream is destroyed */
1277 err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1278 }
1279 ierr = PetscFree(w->spptr);CHKERRQ(ierr);
1280 }
1281 }
1282 PetscFunctionReturn(0);
1283 }
1284