/*
   Implements the sequential CUDA vectors.
*/

#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/cudavecimpl.h>

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

/*
    Allocates space for the vector array on the GPU if it does not exist.
    Does NOT change the PetscCUDAFlag for the vector.
    Does NOT zero the CUDA array.

 */
PetscErrorCode VecCUDAAllocateCheck(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscBool      option_set;

  PetscFunctionBegin;
  if (!v->spptr) {
    PetscReal pinned_memory_min;
    ierr = PetscMalloc(sizeof(Vec_CUDA),&v->spptr);CHKERRQ(ierr);
    veccuda = (Vec_CUDA*)v->spptr;
    err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
    veccuda->GPUarray = veccuda->GPUarray_allocated;
    veccuda->stream = 0;  /* using default stream */
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq*)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse the command line here for the minimum size at which to use pinned memory for host allocations.
       Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");CHKERRQ(ierr);
    ierr = PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);CHKERRQ(ierr);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecCUDACopyToGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscScalar    *varray;

  PetscFunctionBegin;
  PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
  ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr               = PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);CHKERRQ(ierr);
    veccuda            = (Vec_CUDA*)v->spptr;
    varray             = veccuda->GPUarray;
    err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr               = PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr               = PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);CHKERRQ(ierr);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
{
  PetscScalar                *varray;
  PetscErrorCode             ierr;
  cudaError_t                err;
  PetscScalar                *cpuPtr, *gpuPtr;
  Vec_Seq                    *s;
  VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
  PetscInt                   lowestIndex,n;

  PetscFunctionBegin;
  PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
  ierr = VecCUDAAllocateCheck(v);CHKERRQ(ierr);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    s = (Vec_Seq*)v->data;
    if (mode & SCATTER_REVERSE) {
      lowestIndex = ptop_scatter->sendLowestIndex;
      n           = ptop_scatter->ns;
    } else {
      lowestIndex = ptop_scatter->recvLowestIndex;
      n           = ptop_scatter->nr;
    }

    ierr   = PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
    varray = ((Vec_CUDA*)v->spptr)->GPUarray;
    gpuPtr = varray + lowestIndex;
    cpuPtr = s->array + lowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);

    /* Set the buffer states */
    v->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
     VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecCUDACopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUDA       *veccuda;
  PetscScalar    *varray;

  PetscFunctionBegin;
  PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
  ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    ierr               = PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    veccuda            = (Vec_CUDA*)v->spptr;
    varray             = veccuda->GPUarray;
    err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr               = PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr               = PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    v->offloadmask     = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Note that this function only copies *some* of the values up from the GPU to the CPU,
   which means that we need to recombine the data at some point before using any of the standard functions.
   We could add another few flag-types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
   which must always be called in pairs.
*/
PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
{
  const PetscScalar *varray, *gpuPtr;
  PetscErrorCode    ierr;
  cudaError_t       err;
  PetscScalar       *cpuPtr;
  Vec_Seq           *s;
  VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
  PetscInt          lowestIndex,n;

  PetscFunctionBegin;
  PetscCheckTypeNames(v,VECSEQCUDA,VECMPICUDA);
  ierr = VecCUDAAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    ierr   = PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
    if (mode & SCATTER_REVERSE) {
      lowestIndex = ptop_scatter->recvLowestIndex;
      n           = ptop_scatter->nr;
    } else {
      lowestIndex = ptop_scatter->sendLowestIndex;
      n           = ptop_scatter->ns;
    }

    varray = ((Vec_CUDA*)v->spptr)->GPUarray;
    s = (Vec_Seq*)v->data;
    gpuPtr = varray + lowestIndex;
    cpuPtr = s->array + lowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr);

    ierr = VecCUDARestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

   Options Database Keys:
. -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

  Level: beginner

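   Example Usage:
   A minimal sketch (illustrative only; assumes a local length n and omits the usual error checking with CHKERRQ() for brevity):
.vb
     Vec x;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,n);
     VecSetType(x,VECSEQCUDA);
     VecSet(x,1.0);
     VecDestroy(&x);
.ve
   Alternatively, call VecSetFromOptions(x) instead of VecSetType() and select the type at run time with -vec_type seqcuda.
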
.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
M*/

PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  PetscScalar       sone = 1.0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  } else if (alpha == (PetscScalar)1.0) {
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*yin->map->n);CHKERRQ(ierr);
  } else {
    cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
    cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  PetscBool         yiscuda,xiscuda;
  cudaError_t       err;

  PetscFunctionBegin;
  if (alpha == (PetscScalar)0.0) PetscFunctionReturn(0);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
  ierr = PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
  if (xiscuda && yiscuda) {
    ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*yin->map->n);CHKERRQ(ierr);
  } else {
    ierr = VecAXPY_Seq(yin,alpha,xin);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
{
  PetscInt                              n = xin->map->n;
  const PetscScalar                     *xarray=NULL,*yarray=NULL;
  PetscScalar                           *warray=NULL;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;
  PetscErrorCode                        ierr;
  cudaError_t                           err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
    err  = WaitForCUDA();CHKERRCUDA(err);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(win->map->n,&bn);CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    ierr = VecCopy_SeqCUDA(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(win,&warray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2*win->map->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;

  PetscFunctionBegin;
  ierr = PetscLogGpuFlops(nv*2.0*n);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
    case 3:
      alpha0 = alpha[0];
      alpha1 = alpha[1];
      alpha2 = alpha[2];
      alpha += 3;
      ierr   = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      ierr   = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
      ierr   = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
      y   += 3;
      break;
    case 2:
      alpha0 = alpha[0];
      alpha1 = alpha[1];
      alpha +=2;
      ierr   = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      ierr   = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
      y +=2;
      break;
    case 1:
      alpha0 = *alpha++;
      ierr   = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
      y     +=1;
      break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    ierr   = VecAXPY_SeqCUDA(xin,alpha0,y[0]);CHKERRQ(ierr);
    ierr   = VecAXPY_SeqCUDA(xin,alpha1,y[1]);CHKERRQ(ierr);
    ierr   = VecAXPY_SeqCUDA(xin,alpha2,y[2]);CHKERRQ(ierr);
    ierr   = VecAXPY_SeqCUDA(xin,alpha3,y[3]);CHKERRQ(ierr);
    y   += 4;
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  PetscFunctionReturn(0);
}

PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cerr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
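  /* Concretely (assuming cublasXdot maps to the conjugating dot product, cublasXdotc, for complex scalars):
     cublasXdot(handle,n,a,1,b,1,&z) computes z = sum_i conj(a[i])*b[i], so passing (yarray,xarray) yields
     sum_i conj(y[i])*x[i], which matches PETSc's VecDot(x,y). */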
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

//
// CUDA kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128

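// The kernels below use a two-stage reduction: each of the MDOT_WORKGROUP_NUM blocks strides over its portion of the
// vectors, accumulates one partial sum per dot product in shared memory, reduces those partials across the block, and
// writes a single PetscScalar per dot product to group_results (laid out as [dot index][block index]).
// VecMDot_SeqCUDA() then copies group_results back to the host and sums the MDOT_WORKGROUP_NUM partials for each z[i].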
#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */

PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode    ierr;
  PetscInt          i,n = xin->map->n,current_y_index = 0;
  const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
  PetscScalar       *group_results_gpu;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt          j;
  PetscScalar       group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
#endif
  cudaError_t       cuda_ierr;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    PetscFunctionReturn(0);
  }

  // allocate scratchpad memory for the results of individual work groups:
  cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);

  ierr = VecCUDAGetArrayRead(xin,&xptr);CHKERRQ(ierr);

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

      case 7:
      case 6:
      case 5:
      case 4:
        ierr = VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
        cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
        ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#else
        // run kernel:
        VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        // copy results back to host:
        cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

        // sum group results into z:
        for (j=0; j<4; ++j) {
          z[current_y_index + j] = 0;
          for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
        }
#endif
        ierr = VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
        current_y_index += 4;
        break;

      case 3:
        ierr = VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
        cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
        ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#else
        // run kernel:
        VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        // copy results back to host:
        cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

        // sum group results into z:
        for (j=0; j<3; ++j) {
          z[current_y_index + j] = 0;
          for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
        }
#endif
        ierr = VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
        current_y_index += 3;
        break;

      case 2:
        ierr = VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
        cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
        ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#else
        // run kernel:
        VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        // copy results back to host:
        cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

        // sum group results into z:
        for (j=0; j<2; ++j) {
          z[current_y_index + j] = 0;
          for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
        }
#endif
        ierr = VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        current_y_index += 2;
        break;

      case 1:
        ierr = VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);CHKERRQ(ierr);
        current_y_index += 1;
        break;

      default: // 8 or more vectors left
        ierr = VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
        cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
        cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
        ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#else
        // run kernel:
        VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        // copy results back to host:
        cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

        // sum group results into z:
        for (j=0; j<8; ++j) {
          z[current_y_index + j] = 0;
          for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
        }
#endif
        ierr = VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);CHKERRQ(ierr);
        current_y_index += 8;
        break;
    }
  }
  ierr = VecCUDARestoreArrayRead(xin,&xptr);CHKERRQ(ierr);

  cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
  ierr = PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                     *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;
  PetscErrorCode                  ierr;
  cudaError_t                     err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr,xptr+n,alpha);
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
{
  PetscScalar    *xarray;
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn = 0;
  cublasHandle_t cublasv2handle;
  cublasStatus_t cberr;
  cudaError_t    err;

  PetscFunctionBegin;
  if (alpha == (PetscScalar)0.0) {
    ierr = VecSet_SeqCUDA(xin,alpha);CHKERRQ(ierr);
    err  = WaitForCUDA();CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
    ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cerr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogGpuFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  cudaError_t       err;

  PetscFunctionBegin;
  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiscuda;

      ierr = PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");CHKERRQ(ierr);
      ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
      if (yiscuda) {
        ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else {
        ierr = VecGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
      }
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      if (yiscuda) {
        err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      } else {
        err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
      if (yiscuda) {
        ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else {
        ierr = VecRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy on the CPU if we are on the CPU */
      ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy on the CPU */
        ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy on the GPU */
        ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copying on the GPU (this is an arbitrary choice) */
        ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUDAGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        err  = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUDARestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else {
        ierr = VecCopy_SeqCUDA_Private(xin,yin);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn = 0;
  PetscScalar    *xarray,*yarray;
  cublasHandle_t cublasv2handle;
  cublasStatus_t cberr;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode    ierr;
  PetscScalar       a = alpha,b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1, bn = 0;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(yin->map->n,&bn);CHKERRQ(ierr);
  if (a == (PetscScalar)0.0) {
    ierr = VecScale_SeqCUDA(yin,beta);CHKERRQ(ierr);
  } else if (b == (PetscScalar)1.0) {
    ierr = VecAXPY_SeqCUDA(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == (PetscScalar)1.0) {
    ierr = VecAYPX_SeqCUDA(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == (PetscScalar)0.0) {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(xin->map->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
    cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(yin,&yarray);CHKERRQ(ierr);
    err  = WaitForCUDA();CHKERRCUDA(err);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscInt       n = zin->map->n;

  PetscFunctionBegin;
  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    ierr = VecAXPY_SeqCUDA(zin,alpha,xin);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(zin,beta,yin);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(4.0*n);CHKERRQ(ierr);
  } else {
    /* z = a*x + b*y + c*z */
    ierr = VecScale_SeqCUDA(zin,gamma);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(zin,alpha,xin);CHKERRQ(ierr);
    ierr = VecAXPY_SeqCUDA(zin,beta,yin);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(5.0*n);CHKERRQ(ierr);
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  PetscFunctionReturn(0);
}

PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
{
  PetscInt                              n = win->map->n;
  const PetscScalar                     *xarray,*yarray;
  PetscScalar                           *warray;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;
  PetscErrorCode                        ierr;
  cudaError_t                           err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(win,&warray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
    err  = WaitForCUDA();CHKERRCUDA(err);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(win,&warray);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* should do the infinity norm in CUDA */

PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
{
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = 0;
  const PetscScalar *xarray;
  cublasHandle_t    cublasv2handle;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    int  i;
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    if (bn) {
      PetscScalar zs;
      err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  } else if (type == NORM_1) {
    ierr = VecCUDAGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
    ierr = VecCUDARestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    ierr = VecNorm_SeqCUDA(xin,NORM_1,z);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUDA(xin,NORM_2,z+1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode    ierr;
  cudaError_t       err;
  PetscReal         n = s->map->n;
  const PetscScalar *sarray,*tarray;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(t,&tarray);CHKERRQ(ierr);
  ierr = VecDot_SeqCUDA(s,t,dp);CHKERRQ(ierr);
  ierr = VecDot_SeqCUDA(t,t,nm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(t,&tarray);CHKERRQ(ierr);
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuFlops(4.0*n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode VecDestroy_SeqCUDA(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  if (v->spptr) {
    if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
      err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
      ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
    }
    if (((Vec_CUDA*)v->spptr)->stream) {
      err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
    }
  }
  ierr = VecDestroy_SeqCUDA_Private(v);CHKERRQ(ierr);
  ierr = PetscFree(v->spptr);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};
#endif

PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                     *xarray;
  PetscErrorCode                  ierr;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;
  cudaError_t                     err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(xin,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(xptr,xptr+n,xptr,conjugate());
    err  = WaitForCUDA();CHKERRCUDA(err);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(xin,&xarray);CHKERRQ(ierr);
#else
  PetscFunctionBegin;
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(v,VEC_CLASSID,1);
  PetscValidHeaderSpecific(w,VEC_CLASSID,2);
  PetscCheckTypeName(w,VECSEQCUDA);

  if (w->data) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      if (w->pinned_memory) {
        ierr = PetscMallocSetCUDAHost();CHKERRQ(ierr);
      }
      ierr = PetscFree(((Vec_Seq*)w->data)->array_allocated);CHKERRQ(ierr);
      if (w->pinned_memory) {
        ierr = PetscMallocResetCUDAHost();CHKERRQ(ierr);
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq*)w->data)->array = NULL;
    ((Vec_Seq*)w->data)->unplacedarray = NULL;
  }
  if (w->spptr) {
    if (((Vec_CUDA*)w->spptr)->GPUarray) {
      err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
      ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_CUDA*)w->spptr)->stream) {
      err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
    }
    ierr = PetscFree(w->spptr);CHKERRQ(ierr);
  }

  if (v->petscnative) {
    ierr = PetscFree(w->data);CHKERRQ(ierr);
    w->data = v->data;
    w->offloadmask = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr = v->spptr;
    ierr = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
  } else {
    ierr = VecGetArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
    w->offloadmask = PETSC_OFFLOAD_CPU;
    ierr = VecCUDAAllocateCheck(w);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(v,VEC_CLASSID,1);
  PetscValidHeaderSpecific(w,VEC_CLASSID,2);
  PetscCheckTypeName(w,VECSEQCUDA);

  if (v->petscnative) {
    v->data = w->data;
    v->offloadmask = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr = w->spptr;
    ierr = VecCUDACopyFromGPU(v);CHKERRQ(ierr);
    ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
    w->data = 0;
    w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr = 0;
  } else {
    ierr = VecRestoreArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
    if (w->spptr) {
      err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
      ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
      if (((Vec_CUDA*)w->spptr)->stream) {
        err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
      }
      ierr = PetscFree(w->spptr);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}