// =============================================================================
// === spqrgpu_kernel ==========================================================
// =============================================================================

// Compute the QR factorization on the GPU. Does not handle rank-deficient
// matrices (R is OK, but will not be 'squeezed'). Does not handle complex
// matrices. Does not return the Householder vectors.

#ifdef GPU_BLAS
#include "spqr.hpp"

#ifdef SUITESPARSE_TIMER_ENABLED
#define INIT_TIME(x) double x = 0.0; double time_ ## x;
#define TIC(x) time_ ## x = SuiteSparse_time();
#define TOC(x) x += SuiteSparse_time() - time_ ## x;
#else
#define INIT_TIME(x)
#define TIC(x)
#define TOC(x)
#endif
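// Timing-macro usage, for reference: INIT_TIME(complete) declares an
// accumulator 'complete' and a matching start-time variable 'time_complete';
// TIC(complete) records the current wall-clock time, and TOC(complete) adds
// the elapsed interval to 'complete'. With SUITESPARSE_TIMER_ENABLED
// undefined, all three macros expand to nothing.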

// -----------------------------------------------------------------------------
// numfronts_in_stage
// -----------------------------------------------------------------------------

void numfronts_in_stage
(
    // input, not modified
    Long stage,                 // count the # of fronts in this stage
    Long *Stagingp,             // fronts are in the list
                                // Post [Stagingp [stage]...Stagingp[stage+1]-1]
    Long *StageMap,             // front f is in stage StageMap [f]
    Long *Post,                 // array of size nf (# of fronts)
    Long *Child,                // list of children for each front is
    Long *Childp,               // in Child [Childp [f] ... Childp [f+1]-1]

    // output, not defined on input
    Long *p_numFronts,          // number of fronts in stage (a scalar)
    Long *p_leftoverChildren    // number of leftover children (a scalar)
)
{
    // the # of fronts at this stage is given by the Stagingp workspace,
    // plus any leftover children from earlier stages that must still be
    // assembled into their parents in this stage
    Long sStart = Stagingp[stage];
    Long sEnd = Stagingp[stage+1];
    Long numFronts = (sEnd - sStart);
    Long leftoverChildren = 0;
    for(Long p=sStart; p<sEnd; p++)         // for each front in the stage
    {
        Long f = Post[p];
        for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
        {
            Long c = Child[cp];
            if(StageMap[c] < stage) leftoverChildren++;
        }
    }
    numFronts += leftoverChildren ;
    *p_numFronts = numFronts ;
    *p_leftoverChildren = leftoverChildren ;
}
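// For illustration, with hypothetical values: if Stagingp = {0,3,5},
// Post = {0,1,4,2,3}, and StageMap = {0,0,1,1,0}, then stage 1 holds fronts
// Post [3..4] = {2,3}, and any child c of those fronts with StageMap [c] < 1
// is a leftover child: it was factorized in an earlier stage and must be
// re-uploaded so its contribution block can be assembled into its parent.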


// -----------------------------------------------------------------------------
// spqrgpu_kernel
// -----------------------------------------------------------------------------

void spqrgpu_kernel
(
    spqr_blob <double> *Blob
)
{

    INIT_TIME(complete);
    TIC(complete);

    // -------------------------------------------------------------------------
    // get the Blob
    // -------------------------------------------------------------------------

    spqr_symbolic *QRsym = Blob->QRsym ;
    spqr_numeric <double> *QRnum = Blob->QRnum ;
    spqr_work <double> *Work = Blob->Work ;
    double *Sx = Blob->Sx ;
    // Long ntol = Blob->ntol ;        // no rank detection on the GPU
    cholmod_common *cc = Blob->cc ;

    // -------------------------------------------------------------------------
    // get the contents of the QR symbolic object
    // -------------------------------------------------------------------------

    Long *Super = QRsym->Super ;    // size nf+1, gives pivot columns in F
    Long *Rp = QRsym->Rp ;          // size nf+1, pointers for pattern of R
    Long *Rj = QRsym->Rj ;          // size QRsym->rjsize, col indices of R
    Long *Sleft = QRsym->Sleft ;    // size n+2, leftmost column sets
    Long *Sp = QRsym->Sp ;          // size m+1, row pointers for S
    Long *Sj = QRsym->Sj ;          // size anz, column indices for S
    Long *Parent = QRsym->Parent ;  // size nf, for parent index
    Long *Child = QRsym->Child ;    // size nf, for lists of children
    Long *Childp = QRsym->Childp ;  // size nf+1, for lists of children
    Long *Fm = QRsym->Fm ;          // number of rows in F
    Long *Cm = QRsym->Cm ;          // # of rows in contribution blocks
    Long nf = QRsym->nf ;           // number of fronts
    Long n = QRsym->n ;             // number of columns
    Long m = QRsym->m ;             // number of rows
    Long *Post = QRsym->Post ;      // size nf

    // -------------------------------------------------------------------------
    // get the contents of the QR numeric object
    // -------------------------------------------------------------------------

    double **Rblock = QRnum->Rblock ;
    char *Rdead = QRnum->Rdead ;

    // -------------------------------------------------------------------------
    // get the stack for this task and the head/top pointers
    // -------------------------------------------------------------------------

    Long stack = 0 ;                    // no mixing of GPU and TBB parallelism
    ASSERT (QRnum->ntasks == 1) ;

    double *Stack_top = Work [stack].Stack_top ;
    double *Stack_head = Work [stack].Stack_head ;

    Long sumfrank = Work [stack].sumfrank ;
    Long maxfrank = Work [stack].maxfrank ;

    // -------------------------------------------------------------------------
    // get the SPQR GPU members from symbolic analysis
    // -------------------------------------------------------------------------

    spqr_gpu *QRgpu = QRsym->QRgpu;

    // assembly metadata
    Long *RjmapOffsets = QRgpu->RjmapOffsets;
    Long *RimapOffsets = QRgpu->RimapOffsets;
    Long RjmapSize = MAX(1, QRgpu->RjmapSize);
    Long RimapSize = MAX(1, QRgpu->RimapSize);

    // staging metadata
    Long numStages = QRgpu->numStages;
    Long *Stagingp = QRgpu->Stagingp;
    Long *StageMap = QRgpu->StageMap;
    size_t *FSize = QRgpu->FSize;
    size_t *RSize = QRgpu->RSize;
    size_t *SSize = QRgpu->SSize;
    Long *FOffsets = QRgpu->FOffsets;
    Long *ROffsets = QRgpu->ROffsets;
    Long *SOffsets = QRgpu->SOffsets;

    // gpu parameters
    size_t gpuMemorySize = cc->gpuMemorySize;

    // -------------------------------------------------------------------------
    // use a CUDA stream for asynchronous memory transfers
    // -------------------------------------------------------------------------

    cudaStream_t memoryStreamH2D;
    cudaError_t result = cudaStreamCreate(&memoryStreamH2D);
    if(result != cudaSuccess)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "cannot create CUDA stream to GPU") ;
        return ;
    }

    // -------------------------------------------------------------------------
    // use one mongo Stair for the entire problem
    // -------------------------------------------------------------------------

    Long stairsize = Rp [nf] ;
    Long *Stair = (Long*) cholmod_l_malloc (stairsize, sizeof(Long), cc);

    // -------------------------------------------------------------------------
    // use a workspace directory to store contribution blocks, if needed
    // semantics: create as needed, destroy after use
    // -------------------------------------------------------------------------

    Workspace **LimboDirectory =
        (Workspace**) cholmod_l_calloc (nf, sizeof(Workspace*), cc);

    // -------------------------------------------------------------------------
    // allocate, construct, and ship S, Rimap, and Rjmap for the entire problem
    // -------------------------------------------------------------------------

    // malloc wsMondoS on the CPU (pagelocked), not on the GPU
    Workspace *wsMondoS = Workspace::allocate (Sp[m],       // CPU pagelocked
        sizeof(SEntry), false, true, false, true) ;

    // malloc wsRimap and wsRjmap on both the CPU and GPU (not pagelocked)
    Workspace *wsRimap = Workspace::allocate (RimapSize,    // CPU and GPU
        sizeof(int), false, true, true, false) ;
    Workspace *wsRjmap = Workspace::allocate (RjmapSize,    // CPU and GPU
        sizeof(int), false, true, true, false) ;

    // use shared Long workspace (Iwork) for Fmap and InvPost [ [
    // Note that Iwork (0:nf-1) is already in use for Blob.Cm (size nf)
    // was: Long *Fmap = (Long*) cholmod_l_malloc (n, sizeof(Long), cc);
    cholmod_l_allocate_work (0, 2*nf + n + 1, 0, cc) ;
    Long *Wi = (Long *) cc->Iwork ;     // Cm is size nf, already in use
    Long *InvPost = Wi + nf ;           // InvPost is size nf+1
    Long *Fmap = Wi + (nf+1) ;          // Fmap is size n

    Long numFronts = 0 ;
    Front *fronts = NULL ;
    Workspace *wsMondoF = NULL ;
    Workspace *wsMondoR = NULL ;
    Workspace *wsS = NULL ;

    Long wsMondoF_size = 0 ;
    Long wsMondoR_size = 0 ;
    Long wsS_size = 0 ;
    Long maxfronts_in_stage = 0 ;

    // -------------------------------------------------------------------------
    // check if out of memory
    // -------------------------------------------------------------------------

#define FREE_ALL_WORKSPACE \
    cudaStreamDestroy(memoryStreamH2D); \
    memoryStreamH2D = NULL ; \
    Stair = (Long*) cholmod_l_free (stairsize, sizeof(Long), Stair, cc) ; \
    if (LimboDirectory != NULL) \
    { \
        for (Long f2 = 0 ; f2 < nf ; f2++) \
        { \
            Workspace *wsLimbo2 = LimboDirectory[f2]; \
            if (wsLimbo2 != NULL) \
            { \
                wsLimbo2->assign(wsLimbo2->cpu(), NULL); \
                LimboDirectory[f2] = Workspace::destroy(wsLimbo2); \
            } \
        } \
    } \
    LimboDirectory = (Workspace**) \
        cholmod_l_free (nf, sizeof(Workspace*), LimboDirectory, cc); \
    wsMondoS = Workspace::destroy(wsMondoS); \
    wsRjmap = Workspace::destroy(wsRjmap); \
    wsRimap = Workspace::destroy(wsRimap); \
    fronts = (Front*) \
        cholmod_l_free (maxfronts_in_stage, sizeof(Front), fronts, cc) ; \
    wsMondoF = Workspace::destroy(wsMondoF); \
    wsMondoR = Workspace::destroy(wsMondoR); \
    if (wsS != NULL) wsS->assign(NULL, wsS->gpu()); \
    wsS = Workspace::destroy(wsS);

    // was: Fmap = (Long*) cholmod_l_free (n, sizeof(Long), Fmap, cc);

    if (cc->status < CHOLMOD_OK     // ensures Fmap and InvPost are allocated
        || !Stair || !LimboDirectory || !wsMondoS || !wsRimap || !wsRjmap)
    {
        // cholmod_l_allocate_work will set cc->status if it fails, but it
        // will "never" fail since the symbolic analysis has already allocated
        // a larger Iwork than is required by the factorization
        ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
        FREE_ALL_WORKSPACE ;
        return;
    }

    // -------------------------------------------------------------------------
    // build the assembly maps
    // -------------------------------------------------------------------------

    // construct/pack S, Rimap, and Rjmap
    spqrgpu_buildAssemblyMaps (
        nf, n,
        Fmap,           // workspace for spqrgpu_buildAssemblyMaps
        Post, Super, Rp, Rj,
        Sleft, Sp, Sj, Sx,
        Fm, Cm,
        Childp, Child,
        Stair,          // StairOffsets not needed because Rp suffices as offsets
        CPU_REFERENCE(wsRjmap, int*), RjmapOffsets,
        CPU_REFERENCE(wsRimap, int*), RimapOffsets,
        CPU_REFERENCE(wsMondoS, SEntry*)
        // , Sp[m] (parameter no longer needed)
    ) ;

    // done using Iwork for Fmap ]
    // Fmap = (Long*) cholmod_l_free (n, sizeof(Long), Fmap, cc);

    // -------------------------------------------------------------------------
    // ship the assembly maps asynchronously along the H2D memory stream
    // -------------------------------------------------------------------------

    wsRjmap->transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);
    wsRimap->transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);

    // Keep track of where we are in MondoS
    Long SOffset = 0;

    // -------------------------------------------------------------------------
    // allocate workspace for all stages
    // -------------------------------------------------------------------------

    for(Long stage=0; stage<numStages; stage++)
    {
        // find the memory requirements of this stage
        Long leftoverChildren ;
        numfronts_in_stage (stage, Stagingp, StageMap, Post, Child, Childp,
            &numFronts, &leftoverChildren) ;
        wsMondoF_size = MAX (wsMondoF_size, FSize [stage]) ;
        wsMondoR_size = MAX (wsMondoR_size, RSize [stage]) ;
        wsS_size = MAX (wsS_size, SSize [stage]) ;
        maxfronts_in_stage = MAX (maxfronts_in_stage, numFronts) ;
    }
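    // Taking the maximum over all stages lets the front list and the mondo
    // workspaces below be allocated once, at a size large enough for any
    // single stage, instead of being reallocated at every stage.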

    // The front listing places the leftover children needing assembly at the
    // beginning, followed by the fronts in the stage.
    fronts = (Front*) cholmod_l_malloc (maxfronts_in_stage, sizeof(Front),cc) ;

    // This allocation is done just once for all stages, but the first
    // FSize[stage] entries in wsMondoF must still be set to zero at the start
    // of each stage. So malloc the space on the GPU once, and see the
    // cudaMemset below. This workspace is not on the CPU.
    wsMondoF = Workspace::allocate (wsMondoF_size,          // GPU only
        sizeof(double), false, false, true, false) ;

    // malloc R for each front on the CPU (not on the GPU)
    wsMondoR = Workspace::allocate (wsMondoR_size,          // CPU only
        sizeof(double), false, true, false, false) ;

    // malloc S on the GPU (not on the CPU)
    wsS = Workspace::allocate (wsS_size,                    // GPU only
        sizeof(SEntry), false, false, true, false) ;

    if(!fronts || !wsMondoF || !wsMondoR || !wsS)
    {
        ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
        FREE_ALL_WORKSPACE ;
        return ;
    }

    // -------------------------------------------------------------------------
    // construct InvPost, the inverse postordering
    // -------------------------------------------------------------------------

    for (Long k = 0 ; k < nf ; k++)
    {
        Long f = Post [k] ;     // front f is the kth front in the postorder
        ASSERT (f >= 0 && f < nf) ;
        InvPost [f] = k ;       // record the same info in InvPost
    }
    InvPost [nf] = nf ;         // final placeholder
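    // For illustration: with nf = 3 and Post = {2,0,1}, this loop produces
    // InvPost = {1,2,0,3}, so InvPost [Post [k]] == k for every front k.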

    // -------------------------------------------------------------------------
    // iterate over the staging schedule and factorize each stage
    // -------------------------------------------------------------------------

    for(Long stage=0; stage<numStages; stage++)
    {
        Long leftoverChildren ;

        PR (("Building stage %ld of %ld\n", stage+1, numStages));

        numfronts_in_stage (stage, Stagingp, StageMap, Post, Child, Childp,
            &numFronts, &leftoverChildren) ;

        Long sStart = Stagingp[stage];
        Long sEnd = Stagingp[stage+1];
        Long pNextChild = 0;

        PR (("# fronts in stage: %ld (%ld regular + %ld leftovers).\n",
            numFronts, numFronts-leftoverChildren, leftoverChildren));

        // set fronts on the GPU to all zero
        cudaMemset (wsMondoF->gpu(), (size_t) 0,
            FSize [stage] * sizeof (double)) ;

        // surgically transfer S input values to the GPU
        Workspace surgical_S (SSize [stage], sizeof (SEntry)) ;
        surgical_S.assign (((SEntry*) wsMondoS->cpu()) + SOffset, wsS->gpu());
        surgical_S.transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);
        surgical_S.assign (NULL, NULL) ;

        SOffset += SSize[stage];
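
        // Note on the "surgical" transfer above: surgical_S is a temporary
        // Workspace wrapper pointed at this stage's slice of the pagelocked
        // wsMondoS buffer on the CPU and at wsS on the GPU; it launches one
        // asynchronous host-to-device copy and is then detached with
        // assign (NULL, NULL), mirroring the detach-before-destroy pattern
        // used in FREE_ALL_WORKSPACE, so the wrapper does not retain
        // pointers to memory it does not own.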

        // get the fronts ready for GPUQREngine
        for(Long p=sStart; p<sEnd; p++)
        {
            // get the front, parent, and relative p
            Long f = Post[p];

            // compute basic front fields
            Long fm = Fm[f];                    // F is fm-by-fn
            Long fn = Rp [f+1] - Rp [f];
            Long col1 = Super [f] ;             // first global pivot col in F
            Long fp = Super[f+1] - col1 ;       // with fp pivot columns
            Long frank = MIN(fm, fp);

            // create the front
            Long frelp = leftoverChildren + (p - sStart);
            Long pid = Parent[f];
            Long gpid = Parent[pid];
            Long prelp = EMPTY;     // assume that we have no parent in stage

            // if we have a non-dummy parent in the same stage
            if(gpid != EMPTY && StageMap[pid] == stage)
            {
                Long pp = InvPost [pid] ;
                prelp = leftoverChildren + (pp - sStart);
            }

            PR (("building front %ld (%ld) with parent %ld (%ld)\n",
                f, frelp, pid, prelp));

            // using placement new
            Front *front = new (&fronts[frelp]) Front(frelp, prelp, fm, fn);
            front->fidg = f;
            front->pidg = pid;

            // override the default (dense) rank
            front->rank = frank;

            // attach the Stair
            front->Stair = Stair + Rp[f];

            // set pointer offsets into mondo workspaces
            front->gpuF = GPU_REFERENCE(wsMondoF, double*) + FOffsets[f];
            front->cpuR = CPU_REFERENCE(wsMondoR, double*) + ROffsets[f];

            front->sparseMeta = SparseMeta();
            SparseMeta *meta = &(front->sparseMeta);
            meta->isSparse = true;
            meta->fp = fp;
            meta->nc = Childp[f+1] - Childp[f];

            // S assembly
            Long pSStart, pSEnd;
            pSStart = Sp[Sleft[Super[f]]];
            pSEnd = Sp[Sleft[Super[f+1]]];
            meta->Scount = MAX(0, pSEnd - pSStart);
            meta->cpuS = CPU_REFERENCE(wsS, SEntry*) + SOffsets[f];
            meta->gpuS = GPU_REFERENCE(wsS, SEntry*) + SOffsets[f];

            // pack assembly
            meta->cn = fn - fp;
            meta->cm = Cm[f];
            meta->csize = meta->cm * meta->cn;
            if(prelp != EMPTY)                  // parent in stage
            {
                meta->pn = Rp[pid+1] - Rp[pid];
                meta->pc = Rp[f] + meta->fp;
                meta->gpuRjmap = GPU_REFERENCE(wsRjmap, int*) + RjmapOffsets[f];
                meta->gpuRimap = GPU_REFERENCE(wsRimap, int*) + RimapOffsets[f];
                meta->gpuC = front->gpuF + front->rank * front->fn + meta->fp;
            }
            else if(Parent[pid] != EMPTY)       // parent not in stage
            {
                meta->isStaged = true;
            }
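            // To summarize the cases above: if the parent front is in this
            // stage (prelp != EMPTY), the contribution block is assembled
            // into the parent directly on the GPU via gpuRjmap, gpuRimap,
            // and gpuC; if the parent lives in a later stage, the front is
            // marked isStaged and its contribution block is copied back to a
            // CPU "limbo" workspace after factorization (see below); if the
            // parent is the placeholder root (front f is a root of the
            // assembly tree), there is nothing to assemble.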

            // build any lingering children
            // the memory for the child fronts is at the end of its parent;
            // the Front metadata resides at the beginning of the frontList
            Long ChildOffset = FOffsets[f] + fm*fn;
            for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
            {
                Long c = Child[cp];
                if(StageMap[c] == stage) continue;  // only consider leftovers

                // Long cfm = Fm[c];
                Long cfn = Rp [c+1] - Rp [c];
                Long cfp = Super[c+1] - Super[c];
                // Long crank = MIN(cfm, cfp);
                Long ccn = cfn - cfp;
                Long ccm = Cm[c];

                // only copy the exact CBlock back to the GPU
                PR (("building child %ld (%ld) with parent %ld (%ld)\n",
                    c, pNextChild, f, frelp));
                Front *child = new (&fronts[pNextChild])
                    Front(pNextChild, frelp, ccm, ccn);
                child->fidg = c;
                child->pidg = f;

                child->sparseMeta = SparseMeta();
                SparseMeta *cmeta = &(child->sparseMeta);
                cmeta->isSparse = true;
                cmeta->pushOnly = true;
                cmeta->nc = 0;
                cmeta->pn = fn;
                cmeta->cm = (int) ccm;
                cmeta->cn = (int) ccn;
                cmeta->csize = (int) ccm * (int) ccn;
                cmeta->gpuRimap = GPU_REFERENCE(wsRimap, int*)+RimapOffsets[c];
                cmeta->gpuRjmap = GPU_REFERENCE(wsRjmap, int*)+RjmapOffsets[c];

                // a leftover child can be placed right next to its parent in
                // memory, since the staging algorithm already charges each
                // stage for the space of a front plus its children
                child->gpuF = cmeta->gpuC =
                    GPU_REFERENCE(wsMondoF, double*) + ChildOffset;

                // surgically copy the data to the GPU asynchronously
                Workspace *wsLimbo = LimboDirectory[c];
                wsLimbo->assign(wsLimbo->cpu(), child->gpuF);
                wsLimbo->transfer(cudaMemcpyHostToDevice, false,
                    memoryStreamH2D);

                // advance the child offset in case we have more children
                ChildOffset += ccm * ccn;

                // advance the next child index
                pNextChild++;
            }
        }

        // wait for data to go down to the GPU
        cudaStreamSynchronize(memoryStreamH2D);

        // now we can free limbo children
        for(Long p=sStart; p<sEnd; p++)
        {
            Long f = Post[p];
            for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
            {
                Long c = Child[cp];
                if(StageMap[c] == stage) continue;
                Workspace *wsLimbo = LimboDirectory[c];
                wsLimbo->assign(wsLimbo->cpu(), NULL);
                LimboDirectory[c] = Workspace::destroy(wsLimbo);
            }
        }

        // ---------------------------------------------------------------------
        // Run the QR Engine
        // ---------------------------------------------------------------------

        INIT_TIME(engine);
        TIC(engine);

        QREngineStats stats;
        GPUQREngine(gpuMemorySize, fronts, numFronts, Parent, Childp, Child,
            &stats);
        cc->gpuKernelTime += stats.kernelTime;
        cc->gpuFlops += stats.flopsActual;
        cc->gpuNumKernelLaunches += stats.numLaunches;
        TOC(engine);
        PR (("%f engine time\n", engine));

#ifndef NDEBUG
        for(Long p=sStart; p<sEnd; p++)
        {
            Long f = Post[p];
            Long relp = leftoverChildren + (p - sStart);

            Long fm = Fm[f];                    // F has fm rows
            Long fn = Rp [f+1] - Rp [f] ;       // F has fn cols
            Long fp = Super [f+1] - Super [f] ; // F has fp pivot columns
            Long frank = MIN(fm, fp);
            double *cpuF = (&fronts[relp])->cpuR;

            PR (("\n --- Front factorized, front %ld fm %ld fn %ld fp %ld frank %ld",
                f, fm, fn, fp, frank)) ;
            PR ((" : rows %ld to %ld of R (1-based)\n",
                1+ Super [f], 1+ Super [f+1]-1)) ;
            PR ((" Printing just R part, stored by row:\n")) ;
            for (Long j = 0 ; j < fn ; j++)
            {
                PR ((" --- column %ld of %ld\n", j, fn)) ;
                for (Long i = 0 ; i < frank ; i++)
                {
                    if (i == j) PR ((" [ diag: ")) ;
                    else        PR ((" row %4ld ", i)) ;
                    PR ((" %10.4g", cpuF [fn*i+j])) ;
                    if (i == j) PR ((" ]\n")) ;
                    else        PR (("\n")) ;
                }
                PR (("\n")) ;
            }
        }
#endif

        // ---------------------------------------------------------------------
        // Pack R from each front onto the Davis Stack.
        // ---------------------------------------------------------------------

        for(Long p=sStart; p<sEnd; p++)
        {
            Long f = Post[p];
            Long relp = leftoverChildren + (p - sStart);

            Long fm = Fm[f];                    // F has fm rows
            Long fn = Rp [f+1] - Rp [f] ;       // F has fn cols
            Long fp = Super [f+1] - Super [f] ; // F has fp pivot columns
            Long frank = MIN(fm, fp);
            double *cpuF = (&fronts[relp])->cpuR;

            // cpuF for frontal matrix f has been factorized on the GPU and
            // copied to the CPU. It is (frank+cm)-by-fn, stored by row. The
            // entries in R for this front f are in the first frank rows, in
            // upper triangular form and stored in unpacked form. On the Davis
            // stack, R is stored by columns in packed form.

            // Record where R is in the Davis stack
            Rblock [f] = Stack_head ;

            // copy the leading upper triangular part from cpuF to R
            for (Long j = 0 ; j < frank ; j++)
            {
                // copy column j of the front from cpuF to R
                for (Long i = 0 ; i <= j ; i++)
                {
                    (*Stack_head++) = cpuF [fn*i+j] ;
                }
            }
            // copy the rectangular part from cpuF to R
            for (Long j = frank ; j < fn ; j++)
            {
                // copy column j of the front from cpuF to R
                for (Long i = 0 ; i < frank ; i++)
                {
                    (*Stack_head++) = cpuF [fn*i+j] ;
                }
            }

            // When packed, R has frank*fn - (frank*(frank-1))/2 entries.
            ASSERT ((Stack_head - Rblock [f]) ==
                (frank*fn - (frank*(frank-1))/2)) ;
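            // The packed count follows from simple arithmetic: the upper
            // triangular part contributes 1 + 2 + ... + frank =
            // (frank*(frank+1))/2 entries, the rectangular part contributes
            // frank*(fn-frank), and the two sum to the expression in the
            // ASSERT above, frank*fn - (frank*(frank-1))/2.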

            // If the front needs to be staged, copy the CBlock into limbo.
            SparseMeta *meta = &((&fronts[relp])->sparseMeta);
            if(meta->isStaged)
            {
                // offset into R to get the start of C
                double *C = cpuF + frank * fn + fp;

                PR ((" ===> move to Limbo\n")) ;

                // allocate a Limbo
                int cn = fn - fp;
                int cm = MIN (fm-frank, cn);

                PR (("frontList[%ld] is front %ld, staged;", relp, f)) ;
                PR (("CBlock is %d by %d\n", cm, cn)) ;

                // calloc pagelocked memory on the CPU
                Workspace *wsLimbo =
                    LimboDirectory[f] = Workspace::allocate (cm * cn,   // CPU
                        sizeof(double), true, true, false, true);

                if (wsLimbo == NULL)
                {
                    ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
                    FREE_ALL_WORKSPACE ;
                    return ;
                }

                double *L = CPU_REFERENCE(wsLimbo, double*);

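                // Only the upper-trapezoidal part of the contribution block
                // (entries with j >= i) carries data, so only those entries
                // are copied below; the strictly lower part of the limbo
                // buffer stays zero from the calloc above.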
                // copy from C into Limbo
                for(Long i=0; i<cm; i++)
                {
                    for(Long j=i; j<cn; j++)
                    {
                        double value = C[i*fn+j];
                        L[i*cn+j] = value;
                        PR (("%f\n", value));
                    }
                    PR (("\n"));
                }
            }
        }

        // cleanup after this stage

#if 0
        // This is not needed since Front->F is NULL (nothing on the CPU)
        for(int f=0; f<numFronts; f++)
        {
            // invoke destructor without freeing memory
            Front *front = (&fronts[f]);
            front->~Front();
        }
#endif

    }

    // -------------------------------------------------------------------------
    // cleanup the cuda memory transfer stream and free workspaces
    // -------------------------------------------------------------------------

    FREE_ALL_WORKSPACE ;
    // done using Iwork for InvPost ]

    // -------------------------------------------------------------------------
    // cleanup rank detection results (no rank detection done by the GPU)
    // -------------------------------------------------------------------------

    Work [stack].Stack_head = Stack_head ;
    Work [stack].Stack_top = Stack_top ;

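    // A front with more pivot columns than rows (fp > fm) can contribute at
    // most fm pivots (frank = MIN(fm,fp) = fm), so its remaining fp-fm pivot
    // columns have no diagonal entry in R and are marked dead below. This is
    // the only place this routine marks columns in Rdead, since no numerical
    // rank detection is performed on the GPU.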
    // Compute sumfrank and maxfrank in a last-minute manner, and find Rdead
    for(Long f=0; f<nf; f++)
    {
        // compute basic front fields
        Long fm = Fm[f];                    // F is fm-by-fn
        // Long fn = Rp [f+1] - Rp [f];
        Long col1 = Super [f] ;             // first global pivot col in F
        Long fp = Super[f+1] - col1 ;       // with fp pivot columns
        Long frank = MIN(fm, fp);
        PR (("cleanup front %ld, %ld by %ld with %ld pivots, frank %ld\n",
            f, fm, Rp [f+1] - Rp [f], fp, frank)) ;

        if (fp > fm)
        {
            // hit the wall. The front is fm-by-fn but has fp pivot columns
            PR (("hit the wall, npiv %ld k %ld rank %ld\n", fp, fm, fm)) ;
            for (Long k = fm ; k < fp ; k++)
            {
                Rdead [col1 + k] = 1 ;
            }
        }

        sumfrank += frank ;
        maxfrank = MAX (maxfrank, frank) ;
    }

    Work [stack].sumfrank = sumfrank ;  // sum rank of fronts for this stack
    Work [stack].maxfrank = maxfrank ;  // max rank of fronts for this stack

    TOC(complete) ;
    PR (("%f complete time\n", complete));
}

// -------------------------------------------------------------------------

void spqrgpu_kernel
(
    spqr_blob <Complex> *Blob
)
{
    // complex case not yet supported on the GPU
    cholmod_common *cc = Blob->cc ;
    ERROR (CHOLMOD_INVALID, "complex case not yet supported on the GPU") ;
    return ;
}
#endif
