/* ========================================================================== */
/* === GPU/t_cholmod_gpu ==================================================== */
/* ========================================================================== */

/* -----------------------------------------------------------------------------
 * CHOLMOD/GPU Module.  Copyright (C) 2005-2012, Timothy A. Davis
 * http://www.suitesparse.com
 * -------------------------------------------------------------------------- */

/* GPU BLAS template routine for cholmod_super_numeric. */

/* ========================================================================== */
/* === include files and definitions ======================================== */
/* ========================================================================== */

#ifdef GPU_BLAS

#include <string.h>
#include "cholmod_template.h"
#include "cholmod_gpu_kernels.h"
#include <fenv.h>
#include <cuda.h>
#include <cuda_runtime.h>

#undef L_ENTRY
#ifdef REAL
#define L_ENTRY 1
#else
#define L_ENTRY 2
#endif


/* ========================================================================== */
/* === gpu_clear_memory ===================================================== */
/* ========================================================================== */
/*
 * Ensure that Lx is zeroed before forming the factor.  This is a significant
 * cost in the GPU case, so this parallel memset code is used for efficiency.
 */

void TEMPLATE2 (CHOLMOD (gpu_clear_memory))
(
    double* buff,
    size_t size,
    int num_threads
)
{
    int chunk_multiplier = 5;
    int num_chunks = chunk_multiplier * num_threads;
    size_t chunksize = size / num_chunks;
    size_t i;

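    /* Zero the buffer in num_chunks pieces.  Using several chunks per thread
     * with a dynamic schedule is intended to balance the work across threads;
     * the final chunk absorbs any remainder when size is not evenly divisible
     * by num_chunks. */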
#pragma omp parallel for num_threads(num_threads) private(i) schedule(dynamic)
    for(i = 0; i < num_chunks; i++) {
        size_t chunkoffset = i * chunksize;
        if(i == num_chunks - 1) {
            memset(buff + chunkoffset, 0, (size - chunksize*(num_chunks - 1)) *
                   sizeof(double));
        }
        else {
            memset(buff + chunkoffset, 0, chunksize * sizeof(double));
        }
    }
}

/* ========================================================================== */
/* === gpu_init ============================================================= */
/* ========================================================================== */
/*
 * Performs the required initialization for GPU computing.
 *
 * Returns 0 if there is an error, so the intended use is
 *
 *     useGPU = CHOLMOD(gpu_init)
 *
 * which locally turns off GPU processing if the initialization failed.
 */

int TEMPLATE2 (CHOLMOD (gpu_init))
(
    void *Cwork,
    cholmod_factor *L,
    cholmod_common *Common,
    Int nsuper,
    Int n,
    Int nls,
    cholmod_gpu_pointers *gpu_p
)
{
    Int i, k, maxSize ;
    cublasStatus_t cublasError ;
    cudaError_t cudaErr ;
    size_t maxBytesSize, HostPinnedSize ;

    feenableexcept (FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );

    maxSize = L->maxcsize;

    /* #define PAGE_SIZE (4*1024) */
    CHOLMOD_GPU_PRINTF (("gpu_init : %p\n",
        (void *) ((size_t) Cwork & ~(4*1024-1)))) ;

    /* make sure the assumed buffer sizes are large enough */
    if ( (nls+2*n+4)*sizeof(Int) > Common->devBuffSize ) {
        ERROR (CHOLMOD_GPU_PROBLEM,"\n\n"
               "GPU Memory allocation error.  Ls, Map and RelativeMap exceed\n"
               "devBuffSize.  It is not clear if this is due to insufficient\n"
               "device or host memory or both.  You can try:\n"
               "     1) increasing the amount of GPU memory requested\n"
               "     2) reducing CHOLMOD_NUM_HOST_BUFFERS\n"
               "     3) using a GPU & host with more memory\n"
               "This issue is a known limitation and should be fixed in a \n"
               "future release of CHOLMOD.\n") ;
        return (0) ;
    }

    /* divvy up the memory in dev_mempool */
    gpu_p->d_Lx[0] = Common->dev_mempool;
    gpu_p->d_Lx[1] = Common->dev_mempool + Common->devBuffSize;
    gpu_p->d_C = Common->dev_mempool + 2*Common->devBuffSize;
    gpu_p->d_A[0] = Common->dev_mempool + 3*Common->devBuffSize;
    gpu_p->d_A[1] = Common->dev_mempool + 4*Common->devBuffSize;
    gpu_p->d_Ls = Common->dev_mempool + 5*Common->devBuffSize;
    gpu_p->d_Map = gpu_p->d_Ls + (nls+1)*sizeof(Int) ;
    gpu_p->d_RelativeMap = gpu_p->d_Map + (n+1)*sizeof(Int) ;
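    /* The layout of dev_mempool set up above is six devBuffSize-sized regions:
     *
     *   [ d_Lx[0] | d_Lx[1] | d_C | d_A[0] | d_A[1] | d_Ls, d_Map, d_RelativeMap ]
     *
     * with Ls, Map and RelativeMap packed into the sixth region; the size
     * check above guarantees that all three fit in a single devBuffSize. */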

    /* Copy all of the Ls and Lpi data to the device.  If any supernodes are
     * to be computed on the device then this will be needed, so might as
     * well do it now.   */

    cudaErr = cudaMemcpy ( gpu_p->d_Ls, L->s, nls*sizeof(Int),
                           cudaMemcpyHostToDevice );
    CHOLMOD_HANDLE_CUDA_ERROR(cudaErr,"cudaMemcpy(d_Ls)");

    if (!(Common->gpuStream[0])) {

        /* ------------------------------------------------------------------ */
        /* create each CUDA stream */
        /* ------------------------------------------------------------------ */

        for ( i=0; i<CHOLMOD_HOST_SUPERNODE_BUFFERS; i++ ) {
            cudaErr = cudaStreamCreate ( &(Common->gpuStream[i]) );
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA stream") ;
                return (0) ;
            }
        }

        /* ------------------------------------------------------------------ */
        /* create each CUDA event */
        /* ------------------------------------------------------------------ */

        for (i = 0 ; i < 3 ; i++) {
            cudaErr = cudaEventCreateWithFlags
                (&(Common->cublasEventPotrf [i]), cudaEventDisableTiming) ;
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
                return (0) ;
            }
        }

        for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++) {
            cudaErr = cudaEventCreateWithFlags
                (&(Common->updateCBuffersFree[i]), cudaEventDisableTiming) ;
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
                return (0) ;
            }
        }

        cudaErr = cudaEventCreateWithFlags ( &(Common->updateCKernelsComplete),
                                             cudaEventDisableTiming );
        if (cudaErr != cudaSuccess) {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA updateCKernelsComplete event") ;
            return (0) ;
        }

    }

    gpu_p->h_Lx[0] = (double*)(Common->host_pinned_mempool);
    for ( k=1; k<CHOLMOD_HOST_SUPERNODE_BUFFERS; k++ ) {
        gpu_p->h_Lx[k] = (double*)((char *)(Common->host_pinned_mempool) +
                                   k*Common->devBuffSize);
    }

    return (1);  /* initialization successful, useGPU = 1 */

}


/* ========================================================================== */
/* === gpu_reorder_descendants ============================================== */
/* ========================================================================== */
/* Reorder the descendant supernodes as:
 *    1st - descendant supernodes eligible for processing on the GPU
 *          in increasing (by flops) order
 *    2nd - supernodes whose processing is to remain on the CPU
 *          in any order
 *
 * All of the GPU-eligible supernodes will be scheduled first.  All
 * CPU-eligible descendants will overlap with the last (largest)
 * CHOLMOD_HOST_SUPERNODE_BUFFERS GPU-eligible descendants.
 */

void TEMPLATE2 (CHOLMOD (gpu_reorder_descendants))
(
    cholmod_common *Common,
    Int *Super,
    Int *locals,
    Int *Lpi,
    Int *Lpos,
    Int *Head,
    Int *Next,
    Int *Previous,
    Int *ndescendants,
    Int *tail,
    Int *mapCreatedOnGpu,
    cholmod_gpu_pointers *gpu_p
)
{

    Int prevd, nextd, firstcpu, d, k, kd1, kd2, ndcol, pdi, pdend, pdi1;
    Int dnext, ndrow2, p;
    Int n_descendant = 0;
    double score;

    /* use h_Lx[0] to buffer the GPU-eligible descendants */
    struct cholmod_descendant_score_t* scores =
        (struct cholmod_descendant_score_t*) gpu_p->h_Lx[0];

    double cpuref = 0.0;

    int nreverse = 1;
    int previousd;


    d = Head[*locals];
    prevd = -1;
    firstcpu = -1;
    *mapCreatedOnGpu = 0;

    while ( d != EMPTY )
    {

        /* Get the parameters for the current descendant supernode */
        kd1 = Super [d] ;       /* d contains cols kd1 to kd2-1 of L */
        kd2 = Super [d+1] ;
        ndcol = kd2 - kd1 ;     /* # of columns in all of d */
        pdi = Lpi [d] ;         /* pointer to first row of d in Ls */
        pdend = Lpi [d+1] ;     /* pointer just past last row of d in Ls */
        p = Lpos [d] ;          /* offset of 1st row of d affecting s */
        pdi1 = pdi + p ;        /* ptr to 1st row of d affecting s in Ls */
        ndrow2 = pdend - pdi1;

        nextd = Next[d];

        /* compute a rough flops 'score' for this descendant supernode */
        score = ndrow2 * ndcol;
        if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT &&
             ndcol*L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) {
            score += Common->devBuffSize;
        }
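        /* the devBuffSize bonus added above biases the sort below so that the
           GPU-eligible descendants are kept apart from those that will be
           processed on the CPU */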

        /* place in sort buffer */
        scores[n_descendant].score = score;
        scores[n_descendant].d = d;
        n_descendant++;

        d = nextd;

    }

    /* Sort the GPU-eligible supernodes */
    qsort ( scores, n_descendant, sizeof(struct cholmod_descendant_score_t),
            (__compar_fn_t) CHOLMOD(score_comp) );

    /* Place sorted data back in descendant supernode linked list */
    if ( n_descendant > 0 ) {
        Head[*locals] = scores[0].d;
        if ( n_descendant > 1 ) {

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    if (n_descendant > 64)

            for ( k=1; k<n_descendant; k++ ) {
                Next[scores[k-1].d] = scores[k].d;
            }
        }
        Next[scores[n_descendant-1].d] = firstcpu;
    }

    /* reverse the first CHOLMOD_HOST_SUPERNODE_BUFFERS to better hide PCIe
       communications */

    if ( Head[*locals] != EMPTY && Next[Head[*locals]] != EMPTY ) {
        previousd = Head[*locals];
        d = Next[Head[*locals]];
        while ( d!=EMPTY && nreverse < CHOLMOD_HOST_SUPERNODE_BUFFERS ) {

            kd1 = Super [d] ;       /* d contains cols kd1 to kd2-1 of L */
            kd2 = Super [d+1] ;
            ndcol = kd2 - kd1 ;     /* # of columns in all of d */
            pdi = Lpi [d] ;         /* pointer to first row of d in Ls */
            pdend = Lpi [d+1] ;     /* pointer just past last row of d in Ls */
            p = Lpos [d] ;          /* offset of 1st row of d affecting s */
            pdi1 = pdi + p ;        /* ptr to 1st row of d affecting s in Ls */
            ndrow2 = pdend - pdi1;

            nextd = Next[d];

            nreverse++;

            if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT && ndcol*L_ENTRY >=
                 CHOLMOD_ND_COL_LIMIT ) {
                /* place this supernode at the front of the list */
                Next[previousd] = Next[d];
                Next[d] = Head[*locals];
                Head[*locals] = d;
            }
            else {
                previousd = d;
            }
            d = nextd;
        }
    }

    /* create a 'previous' list so we can traverse backwards */
    *ndescendants = 0;
    if ( Head[*locals] != EMPTY ) {
        Previous[Head[*locals]] = EMPTY;
        for (d = Head [*locals] ; d != EMPTY ; d = dnext) {
            (*ndescendants)++;
            dnext = Next[d];
            if ( dnext != EMPTY ) {
                Previous[dnext] = d;
            }
            else {
                *tail = d;
            }
        }
    }

    return;

}

/* ========================================================================== */
/* === gpu_initialize_supernode ============================================= */
/* ========================================================================== */

/* Prepare the device to assemble the current supernode: clear the device
 * assembly buffer d_A[0] and construct the Map for this supernode on the
 * device. */

void TEMPLATE2 (CHOLMOD (gpu_initialize_supernode))
(
    cholmod_common *Common,
    Int nscol,
    Int nsrow,
    Int psi,
    cholmod_gpu_pointers *gpu_p
)
{

    cudaError_t cuErr;

    /* initialize the device supernode assembly memory to zero */
    cuErr = cudaMemset ( gpu_p->d_A[0], 0, nscol*nsrow*L_ENTRY*sizeof(double) );
    CHOLMOD_HANDLE_CUDA_ERROR(cuErr,"cudaMemset(d_A)");

    /* Create the Map on the device */
    createMapOnDevice ( (Int *)(gpu_p->d_Map),
                        (Int *)(gpu_p->d_Ls), psi, nsrow );

    return;

}


/* ========================================================================== */
/* === gpu_updateC ========================================================== */
/* ========================================================================== */

/* C = L (k1:n-1, kd1:kd2-1) * L (k1:k2-1, kd1:kd2-1)', except that k1:n-1
 * refers to all of the rows in L, but many of the rows are all zero.
 * Supernode d holds columns kd1 to kd2-1 of L.  Nonzero rows in the range
 * k1:k2-1 are in the list Ls [pdi1 ... pdi2-1], of size ndrow1.  Nonzero rows
 * in the range k2:n-1 are in the list Ls [pdi2 ... pdend], of size ndrow2.
 * Let L1 = L (Ls [pdi1 ... pdi2-1], kd1:kd2-1), and let L2 = L (Ls [pdi2 ...
 * pdend],  kd1:kd2-1).  C is ndrow2-by-ndrow1.  Let C1 be the first ndrow1
 * rows of C and let C2 be the last ndrow2-ndrow1 rows of C.  Only the lower
 * triangular part of C1 needs to be computed since C1 is symmetric.
 *
 * UpdateC is completely asynchronous w.r.t. the GPU.  Once the input buffer
 * d_Lx[] has been filled, all of the device operations are issued, and the
 * host can continue with filling the next input buffer / or start processing
 * all of the descendant supernodes which are not eligible for processing on
 * the device (since they are too small - will not fill the device).
 */

int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    Int ndrow1,         /* C is ndrow2-by-ndrow1 */
    Int ndrow2,
    Int ndrow,          /* leading dimension of Lx */
    Int ndcol,          /* L1 is ndrow1-by-ndcol */
    Int nsrow,
    Int pdx1,           /* L1 starts at Lx + L_ENTRY*pdx1 */
    /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    Int pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    Int ndrow3 ;
    int icol, irow;
    int iHostBuff, iDevBuff ;

#ifndef NTIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY <  CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifndef NTIMER
    Common->syrkStart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;
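    /* Common->ibuffer is a running buffer counter; taking it modulo the
     * number of pinned host buffers and device streams cycles this
     * descendant's staging buffer and CUDA stream round-robin, so transfers
     * and kernels for successive descendants can overlap. */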

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);
    /* very little overlap between kernels for different descendant supernodes
       (since we enforce that the supernodes must be large enough to fill the
       device), so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

    /* copy host data to pinned buffer first for better H2D bandwidth */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32)
    for ( icol=0; icol<ndcol; icol++ ) {
        for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
        gpu_p->h_Lx[iHostBuff],
        ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
        cudaMemcpyHostToDevice,
        Common->gpuStream[iDevBuff] );

    if ( cudaStat[0] ) {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
                          Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map),
                                (Int *)(gpu_p->d_Ls),
                                (Int *)(gpu_p->d_RelativeMap),
                                pdi1, ndrow2,
                                &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    alpha  = 1.0 ;
    beta   = 0.0 ;

#ifdef REAL
    cublasStatus = cublasDsyrk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA:  1 */
        devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#else
    cublasStatus = cublasZherk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol*/
        &alpha,         /* ALPHA:  1 */
        (const cuDoubleComplex *) devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        (cuDoubleComplex *) devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#endif

    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart;
#endif

    /* ---------------------------------------------------------------------- */
    /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1'    */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_CALLS++ ;
    tstart = SuiteSparse_time();
#endif

    if (ndrow3 > 0)
    {
#ifndef REAL
        cuDoubleComplex calpha  = {1.0,0.0} ;
        cuDoubleComplex cbeta   = {0.0,0.0} ;
#endif

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

#ifdef REAL
        alpha  = 1.0 ;
        beta   = 0.0 ;
        cublasStatus = cublasDgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_T,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &alpha,                         /* ALPHA:  1 */
            devPtrLx + L_ENTRY*(ndrow1),    /* A, LDA: L2*/
            ndrow2,                         /* ndrow */
            devPtrLx,                       /* B, LDB: L1 */
            ndrow2,                         /* ndrow */
            &beta,                          /* BETA:   0 */
            devPtrC + L_ENTRY*ndrow1,       /* C, LDC: C2 */
            ndrow2) ;
#else
        cublasStatus = cublasZgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_C,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &calpha,                        /* ALPHA:  1 */
            (const cuDoubleComplex*) devPtrLx + ndrow1,
            ndrow2,                         /* ndrow */
            (const cuDoubleComplex *) devPtrLx,
            ndrow2,                         /* ndrow */
            &cbeta,                         /* BETA:   0 */
            (cuDoubleComplex *)devPtrC + ndrow1,
            ndrow2) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart;
#endif

    /* ------------------------------------------------------------------ */
    /* Assemble the update C on the device using the d_RelativeMap */
    /* ------------------------------------------------------------------ */

#ifdef REAL
    addUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#else
    addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#endif

    /* Record an event indicating that kernels for
       this descendant are complete */
    cudaEventRecord ( Common->updateCKernelsComplete,
                      Common->gpuStream[iDevBuff]);
    cudaEventRecord ( Common->updateCBuffersFree[iHostBuff],
                      Common->gpuStream[iDevBuff]);

    return (1) ;
}

/* ========================================================================== */
/* === gpu_final_assembly =================================================== */
/* ========================================================================== */

/* If the supernode was assembled on both the CPU and the GPU, this will
 * complete the supernode assembly on both the GPU and CPU.
 */

void TEMPLATE2 (CHOLMOD (gpu_final_assembly))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nsrow,
    int supernodeUsedGPU,
    int *iHostBuff,
    int *iDevBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    Int iHostBuff2 ;
    Int iDevBuff2 ;

    if ( supernodeUsedGPU ) {

        /* ------------------------------------------------------------------ */
        /* Apply all of the Schur-complement updates, computed on the GPU, to */
        /* the supernode. */
        /* ------------------------------------------------------------------ */

        *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* If this supernode is going to be factored using the GPU (potrf)
             * then it will need the portion of the update assembled on the
             * CPU.  So copy that to a pinned buffer and do an H2D copy to the
             * device. */

            /* wait until a buffer is free */
            cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] );

            /* copy update assembled on CPU to a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx];
                }
            }

            /* H2D transfer of update assembled on CPU */
            cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff],
                              nscol*nsrow*L_ENTRY*sizeof(double),
                              cudaMemcpyHostToDevice,
                              Common->gpuStream[*iDevBuff] );
        }

        Common->ibuffer++;

        iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        /* wait for all kernels to complete */
        cudaEventSynchronize( Common->updateCKernelsComplete );

        /* copy assembled Schur-complement updates computed on GPU */
        cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0],
                          nscol*nsrow*L_ENTRY*sizeof(double),
                          cudaMemcpyDeviceToHost,
                          Common->gpuStream[iDevBuff2] );

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* with the current implementation, potrf still uses data from the
             * CPU - so put the fully assembled supernode in a pinned buffer for
             * fastest access */

            /* need both H2D and D2H copies to be complete */
            cudaDeviceSynchronize();

            /* sum updates from cpu and device on device */
#ifdef REAL
            sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol );
#else
            sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0],
                                  -1.0, nsrow, nscol );
#endif

            /* place final assembled supernode in pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] -=
                        gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }

        }
        else
        {

            /* assemble with CPU updates */
            cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }
        }
    }
    return;
}


/* ========================================================================== */
/* === gpu_lower_potrf ====================================================== */
/* ========================================================================== */

/* Cholesky factorization (dpotrf) of a matrix S, operating on the lower
 * triangular part only.   S is nscol2-by-nscol2 with leading dimension nsrow.
 *
 * S is the top part of the supernode (the lower triangular matrix).
 * This function also copies the bottom rectangular part of the supernode (B)
 * onto the GPU, in preparation for gpu_triangular_solve.
 */

/*
 * On entry, d_A[1] contains the fully assembled supernode
 */

int TEMPLATE2 (CHOLMOD (gpu_lower_potrf))
(
    Int nscol2,     /* S is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of S */
    Int psx,        /* S is located at Lx + L_ENTRY*psx */
    double *Lx,     /* contains S; overwritten with Cholesky factor */
    Int *info,      /* BLAS info return value */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB, *A ;
    double alpha, beta ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ;
    int ilda, ijb, iinfo ;
#ifndef NTIMER
    double tstart ;
#endif

    if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT)
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_POTRF_CALLS++ ;
#endif

    nsrow2 = nsrow - nscol2 ;

    /* ---------------------------------------------------------------------- */
    /* heuristic to get the block size depending on the problem size */
    /* ---------------------------------------------------------------------- */

    nb = 128 ;
    if (nscol2 > 4096) nb = 256 ;
    if (nscol2 > 8192) nb = 384 ;
    n  = nscol2 ;
    gpu_lda = ((nscol2+31)/32)*32 ;
    lda = nsrow ;
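
    /* gpu_lda rounds the device leading dimension up to a multiple of 32,
     * presumably to keep device memory accesses aligned; lda is the host
     * leading dimension of S within Lx. */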

    A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)%
                    CHOLMOD_HOST_SUPERNODE_BUFFERS];

    /* ---------------------------------------------------------------------- */
    /* determine the GPU leading dimension of B */
    /* ---------------------------------------------------------------------- */

    gpu_ldb = 0 ;
    if (nsrow2 > 0)
    {
        gpu_ldb = ((nsrow2+31)/32)*32 ;
    }

    /* ---------------------------------------------------------------------- */
    /* remember where device memory is, to be used by triangular solve later */
    /* ---------------------------------------------------------------------- */

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* ---------------------------------------------------------------------- */
    /* copy A from device to device */
    /* ---------------------------------------------------------------------- */

    cudaStat = cudaMemcpy2DAsync ( devPtrA,
       gpu_lda * L_ENTRY * sizeof (devPtrA[0]),
       gpu_p->d_A[1],
       nsrow * L_ENTRY * sizeof (Lx[0]),
       nscol2 * L_ENTRY * sizeof (devPtrA[0]),
       nscol2,
       cudaMemcpyDeviceToDevice,
       Common->gpuStream[0] );

    if ( cudaStat ) {
        ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device");
    }

    /* ---------------------------------------------------------------------- */
    /* copy B in advance, for gpu_triangular_solve */
    /* ---------------------------------------------------------------------- */

    if (nsrow2 > 0)
    {
        cudaStat = cudaMemcpy2DAsync (devPtrB,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_p->d_A[1] + L_ENTRY*nscol2,
            nsrow * L_ENTRY * sizeof (Lx [0]),
            nsrow2 * L_ENTRY * sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToDevice,
            Common->gpuStream[0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }
    }

    /* ------------------------------------------------------------------ */
    /* define the dpotrf stream */
    /* ------------------------------------------------------------------ */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream [0]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    /* ---------------------------------------------------------------------- */
    /* block Cholesky factorization of S */
    /* ---------------------------------------------------------------------- */

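    /* Each pass of the loop below factors one jb-by-jb diagonal block: it is
     * updated with the previously factored columns (syrk/herk), copied to the
     * host on gpuStream[1] while the gemm updates the panel below it on
     * gpuStream[0], factored on the CPU with LAPACK dpotrf/zpotrf, copied
     * back to the device, and then the panel below is updated with the
     * triangular solve (trsm) and copied back to the host. */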
    for (j = 0 ; j < n ; j += nb)
    {
        Int jb = nb < (n-j) ? nb : (n-j) ;

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dsyrk */
        /* ------------------------------------------------------------------ */

        alpha = -1.0 ;
        beta  = 1.0 ;
#ifdef REAL
        cublasStatus = cublasDsyrk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, devPtrA + j, gpu_lda,
            &beta,  devPtrA + j + j*gpu_lda, gpu_lda) ;

#else
        cublasStatus = cublasZherk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, (cuDoubleComplex*)devPtrA + j,
            gpu_lda,
            &beta,
            (cuDoubleComplex*)devPtrA + j + j*gpu_lda,
            gpu_lda) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */

        cudaStat = cudaEventRecord (Common->cublasEventPotrf [0],
                                    Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        cudaStat = cudaStreamWaitEvent (Common->gpuStream [1],
                                        Common->cublasEventPotrf [0], 0) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy back the jb columns on two different streams */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda),
            lda * L_ENTRY * sizeof (double),
            devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double)*jb,
            jb,
            cudaMemcpyDeviceToHost,
            Common->gpuStream [1]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha = -1.0 ;
            beta  = 1.0 ;
            cublasStatus = cublasDgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                (n-j-jb), jb, j,
                &alpha,
                devPtrA + (j+jb), gpu_lda,
                devPtrA + (j)  , gpu_lda,
                &beta,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;

#else
            cuDoubleComplex calpha = {-1.0,0.0} ;
            cuDoubleComplex cbeta  = { 1.0,0.0} ;
            cublasStatus = cublasZgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_C,
                (n-j-jb), jb, j,
                &calpha,
                (cuDoubleComplex*)devPtrA + (j+jb),
                gpu_lda,
                (cuDoubleComplex*)devPtrA + (j),
                gpu_lda,
                &cbeta,
                (cuDoubleComplex*)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda ) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }
        }

        cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* compute the Cholesky factorization of the jb-by-jb block on the CPU */
        /* ------------------------------------------------------------------ */

        ilda = (int) lda ;
        ijb  = jb ;
#ifdef REAL
        LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#else
        LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#endif
        *info = iinfo ;

        if (*info != 0)
        {
            *info = *info + j ;
            break ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the GPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            A + L_ENTRY * (j + j*lda),
            lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double) * jb,
            jb,
            cudaMemcpyHostToDevice,
            Common->gpuStream [0]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dtrsm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha  = 1.0 ;
            cublasStatus = cublasDtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &alpha,
                devPtrA + (j + j*gpu_lda), gpu_lda,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha  = {1.0,0.0};
            cublasStatus = cublasZtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &calpha,
                (cuDoubleComplex *)devPtrA +
                (j + j*gpu_lda),
                gpu_lda,
                (cuDoubleComplex *)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }

            /* -------------------------------------------------------------- */
            /* Copy factored column back to host.                             */
            /* -------------------------------------------------------------- */

            cudaStat = cudaEventRecord (Common->cublasEventPotrf[2],
                                        Common->gpuStream[0]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaStreamWaitEvent (Common->gpuStream[1],
                                            Common->cublasEventPotrf[2], 0) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda),
                  lda * L_ENTRY * sizeof (double),
                  devPtrA + L_ENTRY*
                  (j + jb + j * gpu_lda),
                  gpu_lda * L_ENTRY * sizeof (double),
                  L_ENTRY * sizeof (double)*
                  (n - j - jb), jb,
                  cudaMemcpyDeviceToHost,
                  Common->gpuStream[1]) ;

            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
            }
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}


/* ========================================================================== */
/* === gpu_triangular_solve ================================================= */
/* ========================================================================== */

/* The current supernode is columns k1 to k2-1 of L.  Let L1 be the diagonal
 * block (factorized by dpotrf/zpotrf above; rows/cols k1:k2-1), and L2 be rows
 * k2:n-1 and columns k1:k2-1 of L.  The triangular system to solve is L2*L1' =
 * S2, where S2 is overwritten with L2.  More precisely, L2 = S2 / L1' in
 * MATLAB notation.
 */

/* Version with pre-allocation in POTRF */

int TEMPLATE2 (CHOLMOD (gpu_triangular_solve))
(
    Int nsrow2,     /* L2 and S2 are nsrow2-by-nscol2 */
    Int nscol2,     /* L1 is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of L1, L2, and S2 */
    Int psx,        /* L1 is at Lx+L_ENTRY*psx;
                     * L2 at Lx+L_ENTRY*(psx+nscol2)*/
    double *Lx,     /* holds L1, L2, and S2 */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int gpu_lda, gpu_ldb, gpu_rowstep ;

    Int gpu_row_start = 0 ;
    Int gpu_row_max_chunk, gpu_row_chunk;
    int ibuf = 0;
    int iblock = 0;
    int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) %
        CHOLMOD_HOST_SUPERNODE_BUFFERS;
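    /* iHostBuff selects the same pinned host buffer that gpu_lower_potrf used
     * for this supernode (ibuffer + CHOLMOD_HOST_SUPERNODE_BUFFERS - 1, modulo
     * the number of buffers), so the diagonal block already factored into that
     * buffer can be copied into Lx below. */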
    int i, j;
    Int iidx;
    int iwrap;

#ifndef NTIMER
    double tstart ;
#endif

#ifdef REAL
    double alpha  = 1.0 ;
    gpu_row_max_chunk = 768;
#else
    cuDoubleComplex calpha  = {1.0,0.0} ;
    gpu_row_max_chunk = 256;
#endif

    if ( nsrow2 <= 0 )
    {
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_TRSM_CALLS++ ;
#endif

    gpu_lda = ((nscol2+31)/32)*32 ;
    gpu_ldb = ((nsrow2+31)/32)*32 ;

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* make sure the copy of B has completed */
    cudaStreamSynchronize( Common->gpuStream[0] );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA BLAS dtrsm */
    /* ---------------------------------------------------------------------- */

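    /* The solve proceeds in chunks of at most gpu_row_max_chunk rows of L2.
     * Chunks rotate through the CUDA streams and the updateCBuffersFree
     * events so the device-to-host copy of one chunk can overlap with the
     * trsm of the next; once CHOLMOD_HOST_SUPERNODE_BUFFERS chunks are in
     * flight, completed chunks are copied from the pinned buffer into Lx. */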
    while ( gpu_row_start < nsrow2 )
    {

        gpu_row_chunk = nsrow2 - gpu_row_start;
        if ( gpu_row_chunk  > gpu_row_max_chunk ) {
            gpu_row_chunk = gpu_row_max_chunk;
        }

        cublasStatus = cublasSetStream ( Common->cublasHandle,
                                         Common->gpuStream[ibuf] );

        if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
        {
            ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream");
        }

#ifdef REAL
        cublasStatus = cublasDtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_T,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &alpha,
                                    devPtrA,
                                    gpu_lda,
                                    devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#else
        cublasStatus = cublasZtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_C,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &calpha,
                                    (const cuDoubleComplex *) devPtrA,
                                    gpu_lda,
                                    (cuDoubleComplex *)devPtrB + gpu_row_start ,
                                    gpu_ldb) ;
#endif
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy result back to the CPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (
            gpu_p->h_Lx[iHostBuff] +
            L_ENTRY*(nscol2+gpu_row_start),
            nsrow * L_ENTRY * sizeof (Lx [0]),
            devPtrB + L_ENTRY*gpu_row_start,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_row_chunk * L_ENTRY *
            sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToHost,
            Common->gpuStream[ibuf]);

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        cudaEventRecord ( Common->updateCBuffersFree[ibuf],
                          Common->gpuStream[ibuf] );

        gpu_row_start += gpu_row_chunk;
        ibuf++;
        ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS;

        iblock ++;

        if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS )
        {
            Int gpu_row_start2 ;
            Int gpu_row_end ;

            /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been
             *  scheduled, so check for completed events and copy result into
             *  Lx before continuing. */
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
                *gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;

            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* Convenient to copy the L1 block here */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
private ( iidx ) if ( nscol2 > 32 )

    for ( j=0; j<nscol2; j++ ) {
        for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) {
            iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last CHOLMOD_HOST_SUPERNODE_BUFFERS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int i, j;
        Int gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
            *gpu_row_max_chunk;
        if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 &&
            gpu_row_start2 < nsrow )
        {
            Int iidx;
            Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );
            /* copy into Lx */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}

/* ========================================================================== */
/* === gpu_copy_supernode =================================================== */
/* ========================================================================== */
/*
 * In the event gpu_triangular_solve is not needed / called, this routine
 * copies the factored diagonal block from the GPU to the CPU.
 */

void TEMPLATE2 (CHOLMOD (gpu_copy_supernode))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nscol2,
    Int nsrow,
    int supernodeUsedGPU,
    int iHostBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    if ( supernodeUsedGPU && nscol2 * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {
        cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx,i,j) if (nscol>32)

        for ( j=0; j<nscol; j++ ) {
            for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                iidx = j*nsrow*L_ENTRY+i;
                Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
            }
        }
    }
    return;
}

#endif

#undef REAL
#undef COMPLEX
#undef ZOMPLEX
