/* ========================================================================== */
/* === GPU/t_cholmod_gpu ==================================================== */
/* ========================================================================== */

/* -----------------------------------------------------------------------------
 * CHOLMOD/GPU Module. Copyright (C) 2005-2012, Timothy A. Davis
 * http://www.suitesparse.com
 * -------------------------------------------------------------------------- */

/* GPU BLAS template routine for cholmod_super_numeric. */

/* ========================================================================== */
/* === include files and definitions ======================================== */
/* ========================================================================== */

#ifdef GPU_BLAS

#include <string.h>
#include "cholmod_template.h"
#include "cholmod_gpu_kernels.h"
#include <fenv.h>
#include <cuda.h>
#include <cuda_runtime.h>

#undef L_ENTRY
#ifdef REAL
#define L_ENTRY 1
#else
#define L_ENTRY 2
#endif


/* ========================================================================== */
/* === gpu_clear_memory ===================================================== */
/* ========================================================================== */
/*
 * Ensure Lx is zeroed before forming the factor.  This is a significant cost
 * in the GPU case, so this parallel memset code is used for efficiency.
 */

void TEMPLATE2 (CHOLMOD (gpu_clear_memory))
(
    double* buff,
    size_t size,
    int num_threads
)
{
    int chunk_multiplier = 5;
    int num_chunks = chunk_multiplier * num_threads;
    size_t chunksize = size / num_chunks;
    size_t i;

#pragma omp parallel for num_threads(num_threads) private(i) schedule(dynamic)
    for (i = 0; i < num_chunks; i++) {
        size_t chunkoffset = i * chunksize;
        if (i == num_chunks - 1) {
            memset(buff + chunkoffset, 0,
                   (size - chunksize*(num_chunks - 1)) * sizeof(double));
        }
        else {
            memset(buff + chunkoffset, 0, chunksize * sizeof(double));
        }
    }
}
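
/* A minimal usage sketch (illustrative only; "nthreads" is hypothetical, and
 * the real/double case is assumed, where L->x holds L->xsize doubles):
 *
 *      TEMPLATE2 (CHOLMOD (gpu_clear_memory))
 *          ((double *) L->x, L->xsize, nthreads) ;
 *
 * With 8 threads, for example, the buffer is split into 5*8 = 40 chunks; the
 * final chunk also absorbs the remainder when size is not divisible by
 * num_chunks. */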

/* ========================================================================== */
/* === gpu_init ============================================================= */
/* ========================================================================== */
/*
 * Performs required initialization for GPU computing.
 *
 * Returns 0 if there is an error, so the intended use is
 *
 *     useGPU = CHOLMOD(gpu_init) (...) ;
 *
 * which would locally turn off GPU processing if the initialization failed.
 */
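
/* A sketch of the intended calling pattern (variable names other than the
 * function's own parameters are illustrative):
 *
 *      int useGPU = TEMPLATE2 (CHOLMOD (gpu_init))
 *          (Cwork, L, Common, nsuper, n, nls, &gpu_pointers) ;
 *      if (!useGPU)
 *      {
 *          // fall back to the CPU BLAS for this factorization
 *      }
 */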

int TEMPLATE2 (CHOLMOD (gpu_init))
(
    void *Cwork,
    cholmod_factor *L,
    cholmod_common *Common,
    Int nsuper,
    Int n,
    Int nls,
    cholmod_gpu_pointers *gpu_p
)
{
    Int i, k, maxSize ;
    cublasStatus_t cublasError ;
    cudaError_t cudaErr ;
    size_t maxBytesSize, HostPinnedSize ;

    feenableexcept (FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );

    maxSize = L->maxcsize;

    /* #define PAGE_SIZE (4*1024) */
    CHOLMOD_GPU_PRINTF (("gpu_init : %p\n",
        (void *) ((size_t) Cwork & ~(4*1024-1)))) ;

    /* make sure the assumed buffer sizes are large enough */
    if ( (nls+2*n+4)*sizeof(Int) > Common->devBuffSize ) {
        ERROR (CHOLMOD_GPU_PROBLEM,"\n\n"
            "GPU Memory allocation error.  Ls, Map and RelativeMap exceed\n"
            "devBuffSize.  It is not clear if this is due to insufficient\n"
            "device or host memory or both.  You can try:\n"
            "    1) increasing the amount of GPU memory requested\n"
            "    2) reducing CHOLMOD_NUM_HOST_BUFFERS\n"
            "    3) using a GPU & host with more memory\n"
            "This issue is a known limitation and should be fixed in a\n"
            "future release of CHOLMOD.\n") ;
        return (0) ;
    }

    /* divvy up the memory in dev_mempool */
    gpu_p->d_Lx[0] = Common->dev_mempool;
    gpu_p->d_Lx[1] = Common->dev_mempool + Common->devBuffSize;
    gpu_p->d_C    = Common->dev_mempool + 2*Common->devBuffSize;
    gpu_p->d_A[0] = Common->dev_mempool + 3*Common->devBuffSize;
    gpu_p->d_A[1] = Common->dev_mempool + 4*Common->devBuffSize;
    gpu_p->d_Ls   = Common->dev_mempool + 5*Common->devBuffSize;
    gpu_p->d_Map = gpu_p->d_Ls + (nls+1)*sizeof(Int) ;
    gpu_p->d_RelativeMap = gpu_p->d_Map + (n+1)*sizeof(Int) ;
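
    /* Resulting layout of dev_mempool (slots are devBuffSize bytes each):
     *
     *    slot 0: d_Lx[0]    slot 1: d_Lx[1]    slot 2: d_C
     *    slot 3: d_A[0]     slot 4: d_A[1]
     *    slot 5: d_Ls ((nls+1) Int's), then d_Map ((n+1) Int's), then
     *            d_RelativeMap
     *
     * The (nls+2*n+4)*sizeof(Int) check above guarantees that the three
     * arrays sharing slot 5 fit within a single devBuffSize-sized buffer. */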

    /* Copy all of the Ls and Lpi data to the device.  If any supernodes are
     * to be computed on the device then this will be needed, so might as
     * well do it now. */

    cudaErr = cudaMemcpy ( gpu_p->d_Ls, L->s, nls*sizeof(Int),
                           cudaMemcpyHostToDevice );
    CHOLMOD_HANDLE_CUDA_ERROR(cudaErr,"cudaMemcpy(d_Ls)");

    if (!(Common->gpuStream[0])) {

        /* ------------------------------------------------------------------ */
        /* create each CUDA stream */
        /* ------------------------------------------------------------------ */

        for ( i=0; i<CHOLMOD_HOST_SUPERNODE_BUFFERS; i++ ) {
            cudaErr = cudaStreamCreate ( &(Common->gpuStream[i]) );
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA stream") ;
                return (0) ;
            }
        }

        /* ------------------------------------------------------------------ */
        /* create each CUDA event */
        /* ------------------------------------------------------------------ */

        for (i = 0 ; i < 3 ; i++) {
            cudaErr = cudaEventCreateWithFlags
                (&(Common->cublasEventPotrf [i]), cudaEventDisableTiming) ;
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
                return (0) ;
            }
        }

        for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++) {
            cudaErr = cudaEventCreateWithFlags
                (&(Common->updateCBuffersFree[i]), cudaEventDisableTiming) ;
            if (cudaErr != cudaSuccess) {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
                return (0) ;
            }
        }

        cudaErr = cudaEventCreateWithFlags ( &(Common->updateCKernelsComplete),
                                             cudaEventDisableTiming );
        if (cudaErr != cudaSuccess) {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA updateCKernelsComplete event") ;
            return (0) ;
        }

    }

    gpu_p->h_Lx[0] = (double*)(Common->host_pinned_mempool);
    for ( k=1; k<CHOLMOD_HOST_SUPERNODE_BUFFERS; k++ ) {
        gpu_p->h_Lx[k] = (double*)((char *)(Common->host_pinned_mempool) +
                                   k*Common->devBuffSize);
    }

    return (1);  /* initialization successful, useGPU = 1 */

}


/* ========================================================================== */
/* === gpu_reorder_descendants ============================================== */
/* ========================================================================== */
/* Reorder the descendant supernodes as:
 *      1st - descendant supernodes eligible for processing on the GPU
 *            in increasing (by flops) order
 *      2nd - supernodes whose processing is to remain on the CPU
 *            in any order
 *
 * All of the GPU-eligible supernodes will be scheduled first.  All
 * CPU-eligible descendants will overlap with the last (largest)
 * CHOLMOD_HOST_SUPERNODE_BUFFERS GPU-eligible descendants.
 */
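
/* The score used below is the descendant's approximate flop count,
 * ndrow2*ndcol, and any descendant large enough for the GPU (both dimensions
 * at or above the CHOLMOD_ND_ROW_LIMIT / CHOLMOD_ND_COL_LIMIT thresholds)
 * additionally gets Common->devBuffSize added.  Since devBuffSize dwarfs any
 * realistic flop count, the boost cleanly separates GPU-eligible descendants
 * from CPU-only ones in the sorted list.  Illustrative numbers only: with
 * ndrow2 = 1000 and ndcol = 200 the base score is 200,000, so any
 * GPU-eligible descendant with a devBuffSize of, say, 64*1024*1024 added
 * outscores every CPU-only one. */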

void TEMPLATE2 (CHOLMOD (gpu_reorder_descendants))
(
    cholmod_common *Common,
    Int *Super,
    Int *locals,
    Int *Lpi,
    Int *Lpos,
    Int *Head,
    Int *Next,
    Int *Previous,
    Int *ndescendants,
    Int *tail,
    Int *mapCreatedOnGpu,
    cholmod_gpu_pointers *gpu_p
)
{

    Int prevd, nextd, firstcpu, d, k, kd1, kd2, ndcol, pdi, pdend, pdi1;
    Int dnext, ndrow2, p;
    Int n_descendant = 0;
    double score;

    /* use h_Lx[0] to buffer the GPU-eligible descendants */
    struct cholmod_descendant_score_t* scores =
        (struct cholmod_descendant_score_t*) gpu_p->h_Lx[0];

    double cpuref = 0.0;

    int nreverse = 1;
    int previousd;


    d = Head[*locals];
    prevd = -1;
    firstcpu = -1;
    *mapCreatedOnGpu = 0;

    while ( d != EMPTY )
    {

        /* get the parameters for the current descendant supernode */
        kd1 = Super [d] ;       /* d contains cols kd1 to kd2-1 of L */
        kd2 = Super [d+1] ;
        ndcol = kd2 - kd1 ;     /* # of columns in all of d */
        pdi = Lpi [d] ;         /* pointer to first row of d in Ls */
        pdend = Lpi [d+1] ;     /* pointer just past last row of d in Ls */
        p = Lpos [d] ;          /* offset of 1st row of d affecting s */
        pdi1 = pdi + p ;        /* ptr to 1st row of d affecting s in Ls */
        ndrow2 = pdend - pdi1;

        nextd = Next[d];

        /* compute a rough flops 'score' for this descendant supernode */
        score = ndrow2 * ndcol;
        if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT &&
             ndcol*L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) {
            score += Common->devBuffSize;
        }

        /* place in sort buffer */
        scores[n_descendant].score = score;
        scores[n_descendant].d = d;
        n_descendant++;

        d = nextd;

    }

    /* sort the GPU-eligible supernodes */
    qsort ( scores, n_descendant, sizeof(struct cholmod_descendant_score_t),
            (__compar_fn_t) CHOLMOD(score_comp) );

    /* place the sorted data back in the descendant supernode linked list */
    if ( n_descendant > 0 ) {
        Head[*locals] = scores[0].d;
        if ( n_descendant > 1 ) {

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    if (n_descendant > 64)

            for ( k=1; k<n_descendant; k++ ) {
                Next[scores[k-1].d] = scores[k].d;
            }
        }
        Next[scores[n_descendant-1].d] = firstcpu;
    }

    /* reverse the first CHOLMOD_HOST_SUPERNODE_BUFFERS to better hide PCIe
       communications */
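
    /* Illustration (hypothetical list): with CHOLMOD_HOST_SUPERNODE_BUFFERS
     * = 3 and a list d1 -> d2 -> d3 -> ... in which d2 and d3 are
     * GPU-eligible, the loop below promotes each eligible supernode to the
     * head as it is visited, producing d3 -> d2 -> d1 -> ..., so the first
     * few H2D transfers land in distinct host buffers and can overlap with
     * kernel execution. */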

    if ( Head[*locals] != EMPTY && Next[Head[*locals]] != EMPTY ) {
        previousd = Head[*locals];
        d = Next[Head[*locals]];
        while ( d!=EMPTY && nreverse < CHOLMOD_HOST_SUPERNODE_BUFFERS ) {

            kd1 = Super [d] ;       /* d contains cols kd1 to kd2-1 of L */
            kd2 = Super [d+1] ;
            ndcol = kd2 - kd1 ;     /* # of columns in all of d */
            pdi = Lpi [d] ;         /* pointer to first row of d in Ls */
            pdend = Lpi [d+1] ;     /* pointer just past last row of d in Ls */
            p = Lpos [d] ;          /* offset of 1st row of d affecting s */
            pdi1 = pdi + p ;        /* ptr to 1st row of d affecting s in Ls */
            ndrow2 = pdend - pdi1;

            nextd = Next[d];

            nreverse++;

            if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT && ndcol*L_ENTRY >=
                 CHOLMOD_ND_COL_LIMIT ) {
                /* place this supernode at the front of the list */
                Next[previousd] = Next[d];
                Next[d] = Head[*locals];
                Head[*locals] = d;
            }
            else {
                previousd = d;
            }
            d = nextd;
        }
    }

    /* create a 'previous' list so we can traverse backwards */
    *ndescendants = 0;
    if ( Head[*locals] != EMPTY ) {
        Previous[Head[*locals]] = EMPTY;
        for (d = Head [*locals] ; d != EMPTY ; d = dnext) {
            (*ndescendants)++;
            dnext = Next[d];
            if ( dnext != EMPTY ) {
                Previous[dnext] = d;
            }
            else {
                *tail = d;
            }
        }
    }

    return;

}

/* ========================================================================== */
/* === gpu_initialize_supernode ============================================= */
/* ========================================================================== */

/* Initialize the supernode on the device: zero the supernode assembly buffer
 * d_A[0], and construct the Map for the current supernode on the device
 * (gpu_updateC later uses this Map to build the RelativeMap of each
 * descendant).
 */

void TEMPLATE2 (CHOLMOD (gpu_initialize_supernode))
(
    cholmod_common *Common,
    Int nscol,
    Int nsrow,
    Int psi,
    cholmod_gpu_pointers *gpu_p
)
{

    cudaError_t cuErr;

    /* initialize the device supernode assembly memory to zero */
    cuErr = cudaMemset ( gpu_p->d_A[0], 0, nscol*nsrow*L_ENTRY*sizeof(double) );
    CHOLMOD_HANDLE_CUDA_ERROR(cuErr,"cudaMemset(d_A)");

    /* create the Map on the device */
    createMapOnDevice ( (Int *)(gpu_p->d_Map),
                        (Int *)(gpu_p->d_Ls), psi, nsrow );

    return;

}


/* ========================================================================== */
/* === gpu_updateC ========================================================== */
/* ========================================================================== */

/* C = L (k1:n-1, kd1:kd2-1) * L (k1:k2-1, kd1:kd2-1)', except that k1:n-1
 * refers to all of the rows in L, but many of the rows are all zero.
 * Supernode d holds columns kd1 to kd2-1 of L.  Nonzero rows in the range
 * k1:k2-1 are in the list Ls [pdi1 ... pdi2-1], of size ndrow1.  Nonzero rows
 * in the range k2:n-1 are in the list Ls [pdi2 ... pdend], of size ndrow2.
 * Let L1 = L (Ls [pdi1 ... pdi2-1], kd1:kd2-1), and let L2 = L (Ls [pdi2 ...
 * pdend], kd1:kd2-1).  C is ndrow2-by-ndrow1.  Let C1 be the first ndrow1
 * rows of C and let C2 be the last ndrow2-ndrow1 rows of C.  Only the lower
 * triangular part of C1 needs to be computed since C1 is symmetric.
 *
 * UpdateC is completely asynchronous w.r.t. the GPU.  Once the input buffer
 * d_Lx[] has been filled, all of the device operations are issued, and the
 * host can continue with filling the next input buffer, or start processing
 * all of the descendant supernodes which are not eligible for processing on
 * the device (since they are too small and will not fill the device).
 */
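
/* Pictorially (a sketch of the partitioning described above, with
 * ndrow3 = ndrow2 - ndrow1):
 *
 *                 ndcol                     ndrow1
 *              +---------+              +------------+
 *      ndrow1  |   L1    |      ndrow1  | C1 (SYRK)  |        [ C1 ]
 *              +---------+              +------------+    C = [    ]
 *      ndrow3  |   L2    |      ndrow3  | C2 (GEMM)  |        [ C2 ]
 *              +---------+              +------------+
 *
 * C1 = L1*L1' (lower triangle only) and C2 = L2*L1'. */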

int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    Int ndrow1,         /* C is ndrow2-by-ndrow1 */
    Int ndrow2,
    Int ndrow,          /* leading dimension of Lx */
    Int ndcol,          /* L1 is ndrow1-by-ndcol */
    Int nsrow,
    Int pdx1,           /* L1 starts at Lx + L_ENTRY*pdx1 */
                        /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    Int pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    Int ndrow3 ;
    int icol, irow;
    int iHostBuff, iDevBuff ;

#ifndef NTIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY < CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifndef NTIMER
    Common->syrkStart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);

    /* there is very little overlap between kernels for different descendant
       supernodes (since we enforce that supernodes must be large enough to
       fill the device), so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

    /* copy host data to the pinned buffer first for better H2D bandwidth */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32)
    for ( icol=0; icol<ndcol; icol++ ) {
        for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
                                    gpu_p->h_Lx[iHostBuff],
                                    ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
                                    cudaMemcpyHostToDevice,
                                    Common->gpuStream[iDevBuff] );

    if ( cudaStat[0] ) {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
                          Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map),
                                (Int *)(gpu_p->d_Ls),
                                (Int *)(gpu_p->d_RelativeMap),
                                pdi1, ndrow2,
                                &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    alpha = 1.0 ;
    beta  = 0.0 ;

#ifdef REAL
    cublasStatus = cublasDsyrk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA: 1 */
        devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA: 0 */
        devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#else
    cublasStatus = cublasZherk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA: 1 */
        (const cuDoubleComplex *) devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA: 0 */
        (cuDoubleComplex *) devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#endif

    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart;
#endif

    /* ---------------------------------------------------------------------- */
    /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1' */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_CALLS++ ;
    tstart = SuiteSparse_time();
#endif

    if (ndrow3 > 0)
    {
#ifndef REAL
        cuDoubleComplex calpha = {1.0,0.0} ;
        cuDoubleComplex cbeta  = {0.0,0.0} ;
#endif

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

#ifdef REAL
        alpha = 1.0 ;
        beta  = 0.0 ;
        cublasStatus = cublasDgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_T,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &alpha,                         /* ALPHA: 1 */
            devPtrLx + L_ENTRY*(ndrow1),    /* A, LDA: L2 */
            ndrow2,                         /* ndrow */
            devPtrLx,                       /* B, LDB: L1 */
            ndrow2,                         /* ndrow */
            &beta,                          /* BETA: 0 */
            devPtrC + L_ENTRY*ndrow1,       /* C, LDC: C2 */
            ndrow2) ;
#else
        cublasStatus = cublasZgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_C,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &calpha,                        /* ALPHA: 1 */
            (const cuDoubleComplex*) devPtrLx + ndrow1,
            ndrow2,                         /* ndrow */
            (const cuDoubleComplex *) devPtrLx,
            ndrow2,                         /* ndrow */
            &cbeta,                         /* BETA: 0 */
            (cuDoubleComplex *)devPtrC + ndrow1,
            ndrow2) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart;
#endif

    /* ---------------------------------------------------------------------- */
    /* assemble the update C on the device using the d_RelativeMap */
    /* ---------------------------------------------------------------------- */

#ifdef REAL
    addUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#else
    addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#endif

    /* record an event indicating that the kernels for this descendant are
       complete */
    cudaEventRecord ( Common->updateCKernelsComplete,
                      Common->gpuStream[iDevBuff]);
    cudaEventRecord ( Common->updateCBuffersFree[iHostBuff],
                      Common->gpuStream[iDevBuff]);

    return (1) ;
}

/* ========================================================================== */
/* === gpu_final_assembly =================================================== */
/* ========================================================================== */

/* If the supernode was assembled on both the CPU and the GPU, this will
 * complete the supernode assembly on both the GPU and CPU.
 */
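
/* Outline of the logic below (a sketch; see the code for details):
 *
 *   1. Large supernode (nscol*L_ENTRY >= CHOLMOD_POTRF_LIMIT): copy the
 *      CPU-assembled portion of the update into a pinned buffer and start
 *      an H2D transfer into d_A[1].
 *   2. Wait for all updateC kernels, then start a D2H transfer of the
 *      GPU-assembled update (d_A[0]) into a second pinned buffer.
 *   3. Large supernode: sum the two updates on the device (sumAOnDevice /
 *      sumComplexAOnDevice) and combine the two pinned buffers on the host,
 *      so the subsequent potrf reads a fully assembled supernode from pinned
 *      memory.  Small supernode: subtract the GPU-assembled update directly
 *      into Lx.
 */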

void TEMPLATE2 (CHOLMOD (gpu_final_assembly))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nsrow,
    int supernodeUsedGPU,
    int *iHostBuff,
    int *iDevBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    Int iHostBuff2 ;
    Int iDevBuff2 ;

    if ( supernodeUsedGPU ) {

        /* ------------------------------------------------------------------ */
        /* apply all of the Schur-complement updates, computed on the GPU, */
        /* to the supernode */
        /* ------------------------------------------------------------------ */

        *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* If this supernode is going to be factored using the GPU (potrf)
             * then it will need the portion of the update assembled on the
             * CPU.  So copy that to a pinned buffer and do an H2D copy to the
             * device. */

            /* wait until a buffer is free */
            cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] );

            /* copy the update assembled on the CPU to a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx];
                }
            }

            /* H2D transfer of the update assembled on the CPU */
            cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff],
                              nscol*nsrow*L_ENTRY*sizeof(double),
                              cudaMemcpyHostToDevice,
                              Common->gpuStream[*iDevBuff] );
        }

        Common->ibuffer++;

        iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        /* wait for all kernels to complete */
        cudaEventSynchronize( Common->updateCKernelsComplete );

        /* copy the assembled Schur-complement updates computed on the GPU */
        cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0],
                          nscol*nsrow*L_ENTRY*sizeof(double),
                          cudaMemcpyDeviceToHost,
                          Common->gpuStream[iDevBuff2] );

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* With the current implementation, potrf still uses data from the
             * CPU, so put the fully assembled supernode in a pinned buffer for
             * fastest access. */

            /* need both the H2D and D2H copies to be complete */
            cudaDeviceSynchronize();

            /* sum the updates from the CPU and the device, on the device */
#ifdef REAL
            sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol );
#else
            sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0],
                                  -1.0, nsrow, nscol );
#endif

            /* place the final assembled supernode in a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] -=
                        gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }

        }
        else
        {

            /* assemble with the CPU updates */
            cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }
        }
    }
    return;
}


/* ========================================================================== */
/* === gpu_lower_potrf ====================================================== */
/* ========================================================================== */

/* Cholesky factorization (dpotrf) of a matrix S, operating on the lower
 * triangular part only.  S is nscol2-by-nscol2 with leading dimension nsrow.
 *
 * S is the top part of the supernode (the lower triangular matrix).
 * This function also copies the bottom rectangular part of the supernode (B)
 * onto the GPU, in preparation for gpu_triangular_solve.
 */

/*
 * On entry, d_A[1] contains the fully assembled supernode
 */
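
/* The loop below implements a standard blocked, right-looking Cholesky
 * factorization (a sketch; the stream and event plumbing is omitted):
 *
 *      for (j = 0 ; j < n ; j += nb)
 *      {
 *          jb = MIN (nb, n-j) ;
 *          SYRK :  A (j:j+jb, j:j+jb) -= A (j:j+jb, 0:j) * A (j:j+jb, 0:j)'
 *          GEMM :  A (j+jb:n, j:j+jb) -= A (j+jb:n, 0:j) * A (j:j+jb, 0:j)'
 *          POTRF:  factor A (j:j+jb, j:j+jb) on the CPU (LAPACK dpotrf)
 *          TRSM :  A (j+jb:n, j:j+jb) /= A (j:j+jb, j:j+jb)'
 *      }
 *
 * The jb-by-jb diagonal block is copied back and factored on the host while
 * the GPU works on the trailing GEMM, overlapping CPU and GPU computation. */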

int TEMPLATE2 (CHOLMOD (gpu_lower_potrf))
(
    Int nscol2,     /* S is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of S */
    Int psx,        /* S is located at Lx + L_ENTRY*psx */
    double *Lx,     /* contains S; overwritten with Cholesky factor */
    Int *info,      /* BLAS info return value */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB, *A ;
    double alpha, beta ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ;
    int ilda, ijb, iinfo ;
#ifndef NTIMER
    double tstart ;
#endif

    if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT)
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_POTRF_CALLS++ ;
#endif

    nsrow2 = nsrow - nscol2 ;

    /* ---------------------------------------------------------------------- */
    /* heuristic to get the block size depending on the problem size */
    /* ---------------------------------------------------------------------- */

    nb = 128 ;
    if (nscol2 > 4096) nb = 256 ;
    if (nscol2 > 8192) nb = 384 ;
    n = nscol2 ;
    gpu_lda = ((nscol2+31)/32)*32 ;
    lda = nsrow ;
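
    /* e.g., nscol2 = 3000 gives nb = 128; nscol2 = 6000 gives nb = 256;
     * nscol2 = 10000 gives nb = 384.  gpu_lda rounds the leading dimension
     * up to the next multiple of 32, a warp-friendly alignment. */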

    A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)%
                    CHOLMOD_HOST_SUPERNODE_BUFFERS];

    /* ---------------------------------------------------------------------- */
    /* determine the GPU leading dimension of B */
    /* ---------------------------------------------------------------------- */

    gpu_ldb = 0 ;
    if (nsrow2 > 0)
    {
        gpu_ldb = ((nsrow2+31)/32)*32 ;
    }

    /* ---------------------------------------------------------------------- */
    /* remember where device memory is, to be used by triangular solve later */
    /* ---------------------------------------------------------------------- */

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* ---------------------------------------------------------------------- */
    /* copy A from device to device */
    /* ---------------------------------------------------------------------- */

    cudaStat = cudaMemcpy2DAsync ( devPtrA,
                                   gpu_lda * L_ENTRY * sizeof (devPtrA[0]),
                                   gpu_p->d_A[1],
                                   nsrow * L_ENTRY * sizeof (Lx[0]),
                                   nscol2 * L_ENTRY * sizeof (devPtrA[0]),
                                   nscol2,
                                   cudaMemcpyDeviceToDevice,
                                   Common->gpuStream[0] );

    if ( cudaStat ) {
        ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device");
    }

    /* ---------------------------------------------------------------------- */
    /* copy B in advance, for gpu_triangular_solve */
    /* ---------------------------------------------------------------------- */

    if (nsrow2 > 0)
    {
        cudaStat = cudaMemcpy2DAsync (devPtrB,
                                      gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
                                      gpu_p->d_A[1] + L_ENTRY*nscol2,
                                      nsrow * L_ENTRY * sizeof (Lx [0]),
                                      nsrow2 * L_ENTRY * sizeof (devPtrB [0]),
                                      nscol2,
                                      cudaMemcpyDeviceToDevice,
                                      Common->gpuStream[0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }
    }

    /* ---------------------------------------------------------------------- */
    /* define the dpotrf stream */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream [0]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    /* ---------------------------------------------------------------------- */
    /* block Cholesky factorization of S */
    /* ---------------------------------------------------------------------- */

    for (j = 0 ; j < n ; j += nb)
    {
        Int jb = nb < (n-j) ? nb : (n-j) ;

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dsyrk */
        /* ------------------------------------------------------------------ */

        alpha = -1.0 ;
        beta  = 1.0 ;
#ifdef REAL
        cublasStatus = cublasDsyrk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, devPtrA + j, gpu_lda,
            &beta,  devPtrA + j + j*gpu_lda, gpu_lda) ;

#else
        cublasStatus = cublasZherk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, (cuDoubleComplex*)devPtrA + j,
            gpu_lda,
            &beta,
            (cuDoubleComplex*)devPtrA + j + j*gpu_lda,
            gpu_lda) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */

        cudaStat = cudaEventRecord (Common->cublasEventPotrf [0],
                                    Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        cudaStat = cudaStreamWaitEvent (Common->gpuStream [1],
                                        Common->cublasEventPotrf [0], 0) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy back the jb columns on two different streams */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda),
                                      lda * L_ENTRY * sizeof (double),
                                      devPtrA + L_ENTRY*(j + j*gpu_lda),
                                      gpu_lda * L_ENTRY * sizeof (double),
                                      L_ENTRY * sizeof (double)*jb,
                                      jb,
                                      cudaMemcpyDeviceToHost,
                                      Common->gpuStream [1]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha = -1.0 ;
            beta  = 1.0 ;
            cublasStatus = cublasDgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                (n-j-jb), jb, j,
                &alpha,
                devPtrA + (j+jb), gpu_lda,
                devPtrA + (j)   , gpu_lda,
                &beta,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;

#else
            cuDoubleComplex calpha = {-1.0,0.0} ;
            cuDoubleComplex cbeta  = { 1.0,0.0} ;
            cublasStatus = cublasZgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_C,
                (n-j-jb), jb, j,
                &calpha,
                (cuDoubleComplex*)devPtrA + (j+jb),
                gpu_lda,
                (cuDoubleComplex*)devPtrA + (j),
                gpu_lda,
                &cbeta,
                (cuDoubleComplex*)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda ) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }
        }

        cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA stream synchronize failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* compute the Cholesky factorization of the jb-by-jb block on the CPU */
        /* ------------------------------------------------------------------ */

        ilda = (int) lda ;
        ijb  = jb ;
#ifdef REAL
        LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#else
        LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#endif
        *info = iinfo ;

        if (*info != 0)
        {
            *info = *info + j ;
            break ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the GPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda),
                                      gpu_lda * L_ENTRY * sizeof (double),
                                      A + L_ENTRY * (j + j*lda),
                                      lda * L_ENTRY * sizeof (double),
                                      L_ENTRY * sizeof (double) * jb,
                                      jb,
                                      cudaMemcpyHostToDevice,
                                      Common->gpuStream [0]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dtrsm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha = 1.0 ;
            cublasStatus = cublasDtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &alpha,
                devPtrA + (j + j*gpu_lda), gpu_lda,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha = {1.0,0.0};
            cublasStatus = cublasZtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &calpha,
                (cuDoubleComplex *)devPtrA +
                (j + j*gpu_lda),
                gpu_lda,
                (cuDoubleComplex *)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }

            /* -------------------------------------------------------------- */
            /* copy the factored column back to the host */
            /* -------------------------------------------------------------- */

            cudaStat = cudaEventRecord (Common->cublasEventPotrf[2],
                                        Common->gpuStream[0]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaStreamWaitEvent (Common->gpuStream[1],
                                            Common->cublasEventPotrf[2], 0) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda),
                                          lda * L_ENTRY * sizeof (double),
                                          devPtrA + L_ENTRY*
                                          (j + jb + j * gpu_lda),
                                          gpu_lda * L_ENTRY * sizeof (double),
                                          L_ENTRY * sizeof (double)*
                                          (n - j - jb), jb,
                                          cudaMemcpyDeviceToHost,
                                          Common->gpuStream[1]) ;

            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
            }
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}


/* ========================================================================== */
/* === gpu_triangular_solve ================================================= */
/* ========================================================================== */

/* The current supernode is columns k1 to k2-1 of L.  Let L1 be the diagonal
 * block (factorized by dpotrf/zpotrf above; rows/cols k1:k2-1), and L2 be rows
 * k2:n-1 and columns k1:k2-1 of L.  The triangular system to solve is L2*L1' =
 * S2, where S2 is overwritten with L2.  More precisely, L2 = S2 / L1' in
 * MATLAB notation.
 */

/* Version with pre-allocation in POTRF */
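
/* Sketch of the chunked pipeline below: the nsrow2 rows of S2 are solved in
 * chunks of at most gpu_row_max_chunk rows (768 real / 256 complex).  Each
 * chunk's TRSM and its D2H copy are issued on stream ibuf, cycling through
 * CHOLMOD_HOST_SUPERNODE_BUFFERS streams, so the solve of one chunk overlaps
 * the copy-back of the previous chunk.  Once CHOLMOD_HOST_SUPERNODE_BUFFERS
 * chunks are in flight, the host waits on the oldest updateCBuffersFree
 * event and copies that chunk from pinned memory into Lx while the GPU keeps
 * working; the loop after the main loop drains the remaining chunks. */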

int TEMPLATE2 (CHOLMOD (gpu_triangular_solve))
(
    Int nsrow2,     /* L2 and S2 are nsrow2-by-nscol2 */
    Int nscol2,     /* L1 is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of L1, L2, and S2 */
    Int psx,        /* L1 is at Lx+L_ENTRY*psx;
                     * L2 at Lx+L_ENTRY*(psx+nscol2) */
    double *Lx,     /* holds L1, L2, and S2 */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int gpu_lda, gpu_ldb, gpu_rowstep ;

    Int gpu_row_start = 0 ;
    Int gpu_row_max_chunk, gpu_row_chunk;
    int ibuf = 0;
    int iblock = 0;
    int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) %
                    CHOLMOD_HOST_SUPERNODE_BUFFERS;
    int i, j;
    Int iidx;
    int iwrap;

#ifndef NTIMER
    double tstart ;
#endif

#ifdef REAL
    double alpha = 1.0 ;
    gpu_row_max_chunk = 768;
#else
    cuDoubleComplex calpha = {1.0,0.0} ;
    gpu_row_max_chunk = 256;
#endif

    if ( nsrow2 <= 0 )
    {
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_TRSM_CALLS++ ;
#endif

    gpu_lda = ((nscol2+31)/32)*32 ;
    gpu_ldb = ((nsrow2+31)/32)*32 ;

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* make sure the copy of B has completed */
    cudaStreamSynchronize( Common->gpuStream[0] );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA BLAS dtrsm */
    /* ---------------------------------------------------------------------- */

    while ( gpu_row_start < nsrow2 )
    {

        gpu_row_chunk = nsrow2 - gpu_row_start;
        if ( gpu_row_chunk > gpu_row_max_chunk ) {
            gpu_row_chunk = gpu_row_max_chunk;
        }

        cublasStatus = cublasSetStream ( Common->cublasHandle,
                                         Common->gpuStream[ibuf] );

        if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
        {
            ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream");
        }

#ifdef REAL
        cublasStatus = cublasDtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_T,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &alpha,
                                    devPtrA,
                                    gpu_lda,
                                    devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#else
        cublasStatus = cublasZtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_C,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &calpha,
                                    (const cuDoubleComplex *) devPtrA,
                                    gpu_lda,
                                    (cuDoubleComplex *)devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#endif
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the CPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (
            gpu_p->h_Lx[iHostBuff] +
            L_ENTRY*(nscol2+gpu_row_start),
            nsrow * L_ENTRY * sizeof (Lx [0]),
            devPtrB + L_ENTRY*gpu_row_start,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_row_chunk * L_ENTRY *
            sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToHost,
            Common->gpuStream[ibuf]);

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        cudaEventRecord ( Common->updateCBuffersFree[ibuf],
                          Common->gpuStream[ibuf] );

        gpu_row_start += gpu_row_chunk;
        ibuf++;
        ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS;

        iblock++;

        if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS )
        {
            Int gpu_row_start2 ;
            Int gpu_row_end ;

            /* CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been
             * scheduled, so check for completed events and copy the results
             * into Lx before continuing. */
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                             (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
                             *gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;

            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* convenient to copy the L1 block here */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private ( iidx ) if ( nscol2 > 32 )

    for ( j=0; j<nscol2; j++ ) {
        for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) {
            iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last CHOLMOD_HOST_SUPERNODE_BUFFERS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int i, j;
        Int gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
                             *gpu_row_max_chunk;
        if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 &&
            gpu_row_start2 < nsrow )
        {
            Int iidx;
            Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );
            /* copy into Lx */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}

/* ========================================================================== */
/* === gpu_copy_supernode =================================================== */
/* ========================================================================== */
/*
 * In the event gpu_triangular_solve is not needed / called, this routine
 * copies the factored diagonal block from the GPU to the CPU.
 */

void TEMPLATE2 (CHOLMOD (gpu_copy_supernode))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nscol2,
    Int nsrow,
    int supernodeUsedGPU,
    int iHostBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    if ( supernodeUsedGPU && nscol2 * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {
        cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx,i,j) if (nscol>32)

        for ( j=0; j<nscol; j++ ) {
            for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                iidx = j*nsrow*L_ENTRY+i;
                Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
            }
        }
    }
    return;
}

#endif

#undef REAL
#undef COMPLEX
#undef ZOMPLEX