1 // =============================================================================
2 // === spqrgpu_kernel ==========================================================
3 // =============================================================================
4 
5 // Compute the QR factorization on the GPU.  Does not handle rank-deficient
6 // matrices (R is OK, but will not be 'squeezed').  Does not handle complex
7 // matrices.  Does not return the Householder vectors.
8 
9 #ifdef GPU_BLAS
10 #include "spqr.hpp"
11 
12 #ifdef SUITESPARSE_TIMER_ENABLED
13 #define INIT_TIME(x)    double x = 0.0; double time_ ## x;
14 #define TIC(x)          time_ ## x = SuiteSparse_time();
15 #define TOC(x)          x += SuiteSparse_time() - time_ ## x;
16 #else
17 #define INIT_TIME(x)
18 #define TIC(x)
19 #define TOC(x)
20 #endif
21 
22 // -----------------------------------------------------------------------------
23 // numfronts_in_stage
24 // -----------------------------------------------------------------------------
25 
numfronts_in_stage(Long stage,Long * Stagingp,Long * StageMap,Long * Post,Long * Child,Long * Childp,Long * p_numFronts,Long * p_leftoverChildren)26 void numfronts_in_stage
27 (
28     // input, not modified
29     Long stage,         // count the # of fronts in this stage
30     Long *Stagingp,     // fronts are in the list
31                         //      Post [Stagingp [stage]...Stagingp[stage+1]-1]
32     Long *StageMap,     // front f is in stage StageMap [f]
33     Long *Post,         // array of size nf (# of fronts)
34     Long *Child,        // list of children for each front is
35     Long *Childp,       //      in Child [Childp [f] ... Childp [f+1]-1]
36 
37     // output, not defined on input
38     Long *p_numFronts,          // number of fronts in stage (a scalar)
39     Long *p_leftoverChildren    // number of leftover children (a scalar)
40 )
41 {
42     // the # of fronts at this stage is given by the Stagingp workspace
43     // plus any children within the stage that must still be assembled
44     Long sStart = Stagingp[stage];
45     Long sEnd = Stagingp[stage+1];
46     Long numFronts = (sEnd - sStart);
47     Long leftoverChildren = 0;
48     for(Long p=sStart; p<sEnd; p++) // for each front in the stage
49     {
50         Long f = Post[p];
51         for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
52         {
53             Long c = Child[cp];
54             if(StageMap[c] < stage) leftoverChildren++;
55         }
56     }
57     numFronts += leftoverChildren ;
58     *p_numFronts = numFronts ;
59     *p_leftoverChildren = leftoverChildren ;
60 }
61 
62 
63 // -----------------------------------------------------------------------------
64 // spqrgpu_kernel
65 // -----------------------------------------------------------------------------
66 
spqrgpu_kernel(spqr_blob<double> * Blob)67 void spqrgpu_kernel
68 (
69     spqr_blob <double> *Blob
70 )
71 {
72 
73     INIT_TIME(complete);
74     TIC(complete);
75 
76     // -------------------------------------------------------------------------
77     // get the Blob
78     // -------------------------------------------------------------------------
79 
80     spqr_symbolic *QRsym = Blob->QRsym ;
81     spqr_numeric <double> *QRnum = Blob->QRnum ;
82     spqr_work <double> *Work = Blob->Work ;
83     double *Sx = Blob->Sx ;
84 //  Long ntol = Blob->ntol ;        // no rank detection on the GPU
85     cholmod_common *cc = Blob->cc ;
86 
87     // -------------------------------------------------------------------------
88     // get the contents of the QR symbolic object
89     // -------------------------------------------------------------------------
90 
91     Long *   Super = QRsym->Super ;      // size nf+1, gives pivot columns in F
92     Long *   Rp = QRsym->Rp ;            // size nf+1, pointers for pattern of R
93     Long *   Rj = QRsym->Rj ;            // size QRsym->rjsize, col indices of R
94     Long *   Sleft = QRsym->Sleft ;      // size n+2, leftmost column sets
95     Long *   Sp = QRsym->Sp ;            // size m+1, row pointers for S
96     Long *   Sj = QRsym->Sj ;            // size anz, column indices for S
97     Long *   Parent = QRsym->Parent ;    // size nf, for parent index
98     Long *   Child = QRsym->Child ;      // size nf, for lists of children
99     Long *   Childp = QRsym->Childp ;    // size nf+1, for lists of children
100     Long *   Fm = QRsym->Fm ;            // number of rows in F
101     Long *   Cm = QRsym->Cm ;            // # of rows in contribution blocks
102     Long     nf = QRsym->nf ;            // number of fronts
103     Long     n = QRsym->n ;              // number of columns
104     Long     m = QRsym->m ;              // number of rows
105     Long *   Post = QRsym->Post ;        // size nf
106 
107     // -------------------------------------------------------------------------
108     // get the contents of the QR numeric object
109     // -------------------------------------------------------------------------
110 
111     double ** Rblock = QRnum->Rblock ;
112     char *    Rdead = QRnum->Rdead ;
113 
114     // -------------------------------------------------------------------------
115     // get the stack for this task and the head/top pointers
116     // -------------------------------------------------------------------------
117 
118     Long stack = 0 ;                    // no mixing of GPU and TBB parallelism
119     ASSERT (QRnum->ntasks == 1) ;
120 
121     double * Stack_top = Work [stack].Stack_top ;
122     double * Stack_head = Work [stack].Stack_head ;
123 
124     Long sumfrank = Work [stack].sumfrank ;
125     Long maxfrank = Work [stack].maxfrank ;
126 
127     // -------------------------------------------------------------------------
128     // get the SPQR GPU members from symbolic analysis
129     // -------------------------------------------------------------------------
130 
131     spqr_gpu *QRgpu = QRsym->QRgpu;
132 
133     // assembly metadata
134     Long *RjmapOffsets = QRgpu->RjmapOffsets;
135     Long *RimapOffsets = QRgpu->RimapOffsets;
136     Long RjmapSize = MAX(1, QRgpu->RjmapSize);
137     Long RimapSize = MAX(1, QRgpu->RimapSize);
138 
139     // staging metadata
140     Long numStages = QRgpu->numStages;
141     Long *Stagingp = QRgpu->Stagingp;
142     Long *StageMap = QRgpu->StageMap;
143     size_t *FSize = QRgpu->FSize;
144     size_t *RSize = QRgpu->RSize;
145     size_t *SSize = QRgpu->SSize;
146     Long *FOffsets = QRgpu->FOffsets;
147     Long *ROffsets = QRgpu->ROffsets;
148     Long *SOffsets = QRgpu->SOffsets;
149 
150     // gpu parameters
151     size_t gpuMemorySize = cc->gpuMemorySize;
152 
153     // -------------------------------------------------------------------------
154     // use a CUDA stream for asynchronous memory transfers
155     // -------------------------------------------------------------------------
156 
157     cudaStream_t memoryStreamH2D;
158     cudaError_t result = cudaStreamCreate(&memoryStreamH2D);
159     if(result != cudaSuccess)
160     {
161         ERROR (CHOLMOD_GPU_PROBLEM, "cannot create CUDA stream to GPU") ;
162         return ;
163     }
164 
165     // -------------------------------------------------------------------------
166     // use one mongo Stair for the entire problem
167     // -------------------------------------------------------------------------
168 
169     Long stairsize = Rp [nf] ;
170     Long *Stair = (Long*) cholmod_l_malloc (stairsize, sizeof(Long), cc);
171 
172     // -------------------------------------------------------------------------
173     // use a workspace directory to store contribution blocks, if needed
174     // semantics: create as needed, destroy after used
175     // -------------------------------------------------------------------------
176 
177     Workspace **LimboDirectory =
178         (Workspace**) cholmod_l_calloc (nf, sizeof(Workspace*), cc);
179 
180     // -------------------------------------------------------------------------
181     // allocate, construct, and ship S, Rimap, and Rjmap for the entire problem
182     // -------------------------------------------------------------------------
183 
184     // malloc wsMondoS on the CPU (pagelocked), not on the GPU
185     Workspace *wsMondoS = Workspace::allocate (Sp[m],       // CPU pagelocked
186         sizeof(SEntry), false, true, false, true) ;
187 
188     // malloc wsRimap and wsRjmap on both the CPU and GPU (not pagelocked)
189     Workspace *wsRimap  = Workspace::allocate (RimapSize,   // CPU and GPU
190         sizeof(int), false, true, true, false) ;
191     Workspace *wsRjmap  = Workspace::allocate (RjmapSize,   // CPU and GPU
192         sizeof(int), false, true, true, false) ;
193 
194     // use shared Long workspace (Iwork) for Fmap and InvPost [ [
195     // Note that Iwork (0:nf-1) is already in use for Blob.Cm (size nf)
196     // was: Long *Fmap = (Long*) cholmod_l_malloc (n, sizeof(Long), cc);
197     cholmod_l_allocate_work (0, 2*nf + n + 1, 0, cc) ;
198     Long *Wi = (Long *) cc->Iwork ;     // Cm is size nf, already in use
199     Long *InvPost = Wi + nf ;           // InvPost is size nf+1
200     Long *Fmap    = Wi + (nf+1) ;       // Fmap is size n
201 
202     Long numFronts = 0 ;
203     Front *fronts = NULL ;
204     Workspace *wsMondoF = NULL ;
205     Workspace *wsMondoR = NULL ;
206     Workspace *wsS = NULL ;
207 
208     Long wsMondoF_size = 0 ;
209     Long wsMondoR_size = 0 ;
210     Long wsS_size = 0 ;
211     Long maxfronts_in_stage = 0 ;
212 
213     // -------------------------------------------------------------------------
214     // check if out of memory
215     // -------------------------------------------------------------------------
216 
217 #define FREE_ALL_WORKSPACE \
218         cudaStreamDestroy(memoryStreamH2D); \
219         memoryStreamH2D = NULL ; \
220         Stair = (Long*) cholmod_l_free (stairsize, sizeof(Long), Stair, cc) ; \
221         if (LimboDirectory != NULL) \
222         { \
223             for (Long f2 = 0 ; f2 < nf ; f2++) \
224             { \
225                 Workspace *wsLimbo2 = LimboDirectory[f2]; \
226                 if (wsLimbo2 != NULL) \
227                 { \
228                     wsLimbo2->assign(wsLimbo2->cpu(), NULL); \
229                     LimboDirectory[f2] = Workspace::destroy(wsLimbo2); \
230                 } \
231             } \
232         } \
233         LimboDirectory = (Workspace**) \
234             cholmod_l_free (nf, sizeof(Workspace*), LimboDirectory, cc); \
235         wsMondoS = Workspace::destroy(wsMondoS); \
236         wsRjmap  = Workspace::destroy(wsRjmap); \
237         wsRimap  = Workspace::destroy(wsRimap); \
238         fronts = (Front*) \
239             cholmod_l_free (maxfronts_in_stage, sizeof(Front), fronts, cc) ; \
240         wsMondoF = Workspace::destroy(wsMondoF); \
241         wsMondoR = Workspace::destroy(wsMondoR); \
242         if (wsS != NULL) wsS->assign(NULL, wsS->gpu()); \
243         wsS = Workspace::destroy(wsS);
244 
245         // was: Fmap = (Long*) cholmod_l_free (n, sizeof(Long), Fmap, cc);
246 
247     if (cc->status < CHOLMOD_OK     // ensures Fmap and InvPost are allocated
248         || !Stair || !LimboDirectory || !wsMondoS || !wsRimap || !wsRjmap)
249     {
250         // cholmod_l_allocate_work will set cc->status if it fails, but it
251         // will "never" fail since the symbolic analysis has already allocated
252         // a larger Iwork than is required by the factorization
253         ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
254         FREE_ALL_WORKSPACE ;
255         return;
256     }
257 
258     // -------------------------------------------------------------------------
259     // build the assembly maps
260     // -------------------------------------------------------------------------
261 
262     // construct/pack S, Rimap, and Rjmap
263     spqrgpu_buildAssemblyMaps (
264         nf, n,
265         Fmap,       // workspace for spqrgpu_buildAssemblyMaps
266         Post, Super, Rp, Rj,
267         Sleft, Sp, Sj, Sx,
268         Fm, Cm,
269         Childp, Child,
270         Stair,      // StairOffsets not needed because Rp suffices as offsets
271         CPU_REFERENCE(wsRjmap, int*), RjmapOffsets,
272         CPU_REFERENCE(wsRimap, int*), RimapOffsets,
273         CPU_REFERENCE(wsMondoS, SEntry*)
274                     // , Sp[m] (parameterno longer needed)
275     ) ;
276 
277     // done using Iwork for Fmap ]
278     // Fmap = (Long*) cholmod_l_free (n, sizeof(Long), Fmap, cc);
279 
280     // -------------------------------------------------------------------------
281     // ship the assembly maps asynchronously along the H2D memory stream.
282     // -------------------------------------------------------------------------
283 
284     wsRjmap->transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);
285     wsRimap->transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);
286 
287     // Keep track of where we are in MondoS
288     Long SOffset = 0;
289 
290     // -------------------------------------------------------------------------
291     // allocate workspace for all stages
292     // -------------------------------------------------------------------------
293 
294     for(Long stage=0; stage<numStages; stage++)
295     {
296         // find the memory requirements of this stage
297         Long leftoverChildren ;
298         numfronts_in_stage (stage, Stagingp, StageMap, Post, Child, Childp,
299             &numFronts, &leftoverChildren) ;
300         wsMondoF_size      = MAX (wsMondoF_size,      FSize [stage]) ;
301         wsMondoR_size      = MAX (wsMondoR_size,      RSize [stage]) ;
302         wsS_size           = MAX (wsS_size,           SSize [stage]) ;
303         maxfronts_in_stage = MAX (maxfronts_in_stage, numFronts) ;
304     }
305 
306     // The front listing has fronts in the stage at
307     // the beginning with children needing assembly appearing at the end
308     fronts = (Front*) cholmod_l_malloc (maxfronts_in_stage, sizeof(Front),cc) ;
309 
310     // This allocate is done once for each stage, but we still need
311     // to set the first FSize[stage] entries in wsMondoF to zero, for each
312     // stage.  So malloc the space on the GPU once, but see cudaMemset below.
313     // This workspace is not on the CPU.
314     wsMondoF = Workspace::allocate (wsMondoF_size,      // GPU only
315         sizeof(double), false, false, true, false) ;
316 
317     // malloc R for each front on the CPU (not on the GPU)
318     wsMondoR = Workspace::allocate (wsMondoR_size,      // CPU only
319         sizeof(double), false, true, false, false) ;
320 
321     // malloc S on the GPU (not on the CPU)
322     wsS = Workspace::allocate (wsS_size,                // GPU only
323         sizeof(SEntry), false, false, true, false) ;
324 
325     if(!fronts || !wsMondoF || !wsMondoR || !wsS)
326     {
327         ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
328         FREE_ALL_WORKSPACE ;
329         return ;
330     }
331 
332     // -------------------------------------------------------------------------
333     // construct InvPost, the inverse postordering
334     // -------------------------------------------------------------------------
335 
336     for (Long k = 0 ; k < nf ; k++)
337     {
338         Long f = Post [k] ;     // front f is the kth front in the postoder
339         ASSERT (f >= 0 && f < nf) ;
340         InvPost [f] = k ;       // record the same info in InvPost
341     }
342     InvPost [nf] = nf ;         // final placeholder
343 
344     // -------------------------------------------------------------------------
345     // iterate over the staging schedule and factorize each stage
346     // -------------------------------------------------------------------------
347 
348     for(Long stage=0; stage<numStages; stage++)
349     {
350         Long leftoverChildren ;
351 
352         PR (("Building stage %ld of %ld\n", stage+1, numStages));
353 
354         numfronts_in_stage (stage, Stagingp, StageMap, Post, Child, Childp,
355             &numFronts, &leftoverChildren) ;
356 
357         Long sStart = Stagingp[stage];
358         Long sEnd = Stagingp[stage+1];
359         Long pNextChild = 0;
360 
361         PR (("# fronts in stage has %ld (%ld regular + %ld leftovers).\n",
362             numFronts, numFronts-leftoverChildren, leftoverChildren));
363 
364         // set fronts on the GPU to all zero
365         cudaMemset (wsMondoF->gpu(), (size_t) 0,
366             FSize [stage] * sizeof (double)) ;
367 
368         // surgically transfer S input values to the GPU
369         Workspace surgical_S (SSize [stage], sizeof (SEntry)) ;
370         surgical_S.assign (((SEntry*) wsMondoS->cpu()) + SOffset, wsS->gpu());
371         surgical_S.transfer(cudaMemcpyHostToDevice, false, memoryStreamH2D);
372         surgical_S.assign (NULL, NULL) ;
373 
374         SOffset += SSize[stage];
375 
376         // get the fronts ready for GPUQREngine
377         for(Long p=sStart; p<sEnd; p++)
378         {
379             // get the front, parent, and relative p
380             Long f = Post[p];
381 
382             // compute basic front fields
383             Long fm = Fm[f];                    // F is fm-by-fn
384             Long fn = Rp [f+1] - Rp [f];
385             Long col1 = Super [f] ;             // first global pivot col in F
386             Long fp = Super[f+1] - col1 ;       // with fp pivot columns
387             Long frank = MIN(fm, fp);
388 
389             // create the front
390             Long frelp = leftoverChildren + (p - sStart);
391             Long pid = Parent[f];
392             Long gpid = Parent[pid];
393             Long prelp = EMPTY;      // assume that we have no parent in stage
394 
395             // if we have a non-dummy parent in same stage.
396             if(gpid != EMPTY && StageMap[pid] == stage)
397             {
398                 Long pp = InvPost [pid] ;
399                 prelp = leftoverChildren + (pp - sStart);
400             }
401 
402             PR (("building front %ld (%ld) with parent %ld (%ld)\n",
403                 f, frelp, pid, prelp));
404 
405             // using placement new
406             Front *front = new (&fronts[frelp]) Front(frelp, prelp, fm, fn);
407             front->fidg = f;
408             front->pidg = pid;
409 
410             // override the default (dense) rank
411             front->rank = frank;
412 
413             // attach the Stair
414             front->Stair = Stair + Rp[f];
415 
416             // set pointer offsets into mondo workspaces
417             front->gpuF = GPU_REFERENCE(wsMondoF, double*) + FOffsets[f];
418             front->cpuR = CPU_REFERENCE(wsMondoR, double*) + ROffsets[f];
419 
420             front->sparseMeta = SparseMeta();
421             SparseMeta *meta = &(front->sparseMeta);
422             meta->isSparse = true;
423             meta->fp = fp;
424             meta->nc = Childp[f+1] - Childp[f];
425 
426             // S assembly
427             Long pSStart, pSEnd;
428             pSStart = Sp[Sleft[Super[f]]];
429             pSEnd = Sp[Sleft[Super[f+1]]];
430             meta->Scount = MAX(0, pSEnd - pSStart);
431             meta->cpuS = CPU_REFERENCE(wsS, SEntry*) + SOffsets[f];
432             meta->gpuS = GPU_REFERENCE(wsS, SEntry*) + SOffsets[f];
433 
434             // pack assembly
435             meta->cn = fn - fp;
436             meta->cm = Cm[f];
437             meta->csize = meta->cm * meta->cn;
438             if(prelp != EMPTY) // parent in stage
439             {
440                 meta->pn = Rp[pid+1] - Rp[pid];
441                 meta->pc = Rp[f] + meta->fp;
442                 meta->gpuRjmap = GPU_REFERENCE(wsRjmap, int*) + RjmapOffsets[f];
443                 meta->gpuRimap = GPU_REFERENCE(wsRimap, int*) + RimapOffsets[f];
444                 meta->gpuC = front->gpuF + front->rank * front->fn + meta->fp;
445             }
446             else if(Parent[pid] != EMPTY) // parent not in stage
447             {
448                 meta->isStaged = true;
449             }
450 
451             // build any lingering children
452             // the memory for the child fronts is at the end of its parent
453             // the Front metadata resides at the beginning of the frontList
454             Long ChildOffset = FOffsets[f] + fm*fn;
455             for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
456             {
457                 Long c = Child[cp];
458                 if(StageMap[c] == stage) continue; // only consider leftovers
459 
460                 // Long cfm = Fm[c];
461                 Long cfn = Rp [c+1] - Rp [c];
462                 Long cfp = Super[c+1] - Super[c];
463                 // Long crank = MIN(cfm, cfp);
464                 Long ccn = cfn - cfp;
465                 Long ccm = Cm[c];
466 
467                 // only copy the exact CBlock back to the GPU
468                 PR (("building child %ld (%ld) with parent %ld (%ld)\n",
469                     c, pNextChild, f, frelp));
470                 Front *child = new (&fronts[pNextChild])
471                                    Front(pNextChild, frelp, ccm, ccn);
472                 child->fidg = c;
473                 child->pidg = f;
474 
475                 child->sparseMeta = SparseMeta();
476                 SparseMeta *cmeta = &(child->sparseMeta);
477                 cmeta->isSparse = true;
478                 cmeta->pushOnly = true;
479                 cmeta->nc = 0;
480                 cmeta->pn = fn;
481                 cmeta->cm = (int) ccm;
482                 cmeta->cn = (int) ccn;
483                 cmeta->csize = (int) ccm * (int) ccn;
484                 cmeta->gpuRimap = GPU_REFERENCE(wsRimap, int*)+RimapOffsets[c];
485                 cmeta->gpuRjmap = GPU_REFERENCE(wsRjmap, int*)+RjmapOffsets[c];
486 
487                 // the memory layout for children can be right next to
488                 // its parent in memory since we're already charging our
489                 // staging algorithm for the space of a front and its children
490                 child->gpuF = cmeta->gpuC =
491                     GPU_REFERENCE(wsMondoF, double*) + ChildOffset;
492 
493                 // surgically copy the data to the GPU asynchronously
494                 Workspace *wsLimbo = LimboDirectory[c];
495                 wsLimbo->assign(wsLimbo->cpu(), child->gpuF);
496                 wsLimbo->transfer(cudaMemcpyHostToDevice, false,
497                     memoryStreamH2D);
498 
499                 // advance the child offset in case we have more children
500                 ChildOffset += ccm * ccn;
501 
502                 // advance the next child index
503                 pNextChild++;
504             }
505         }
506 
507         // wait for data to go down to the GPU.
508         cudaStreamSynchronize(memoryStreamH2D);
509 
510         // now we can free limbo children
511         for(Long p=sStart; p<sEnd; p++)
512         {
513             Long f = Post[p];
514             for(Long cp=Childp[f]; cp<Childp[f+1]; cp++)
515             {
516                 Long c = Child[cp];
517                 if(StageMap[c] == stage) continue;
518                 Workspace *wsLimbo = LimboDirectory[c];
519                 wsLimbo->assign(wsLimbo->cpu(), NULL);
520                 LimboDirectory[c] = Workspace::destroy(wsLimbo);
521             }
522         }
523 
524         // ---------------------------------------------------------------------
525         // Run the QR Engine
526         // ---------------------------------------------------------------------
527 
528         INIT_TIME(engine);
529         TIC(engine);
530 
531         QREngineStats stats;
532         GPUQREngine(gpuMemorySize, fronts, numFronts, Parent, Childp, Child,
533             &stats);
534         cc->gpuKernelTime += stats.kernelTime;
535         cc->gpuFlops += stats.flopsActual;
536         cc->gpuNumKernelLaunches += stats.numLaunches;
537         TOC(engine);
538         PR (("%f engine time\n", engine));
539 
540 #ifndef NDEBUG
541         for(Long p=sStart; p<sEnd; p++)
542         {
543             Long f = Post[p];
544             Long relp = leftoverChildren + (p - sStart);
545 
546             Long fm = Fm[f];                     // F has fm rows
547             Long fn = Rp [f+1] - Rp [f] ;        // F has fn cols
548             Long fp = Super [f+1] - Super [f] ;  // F has fp pivot columns
549             Long frank = MIN(fm, fp);
550             double *cpuF = (&fronts[relp])->cpuR;
551 
552             PR (("\n --- Front factorized, front %ld fm %ld fn %ld fp %ld frank %ld",
553                 f, fm, fn, fp, frank)) ;
554             PR ((" : rows %ld to %ld of R (1-based)\n",
555                 1+ Super [f], 1+ Super [f+1]-1)) ;
556             PR (("     Printing just R part, stored by row:\n")) ;
557             for (Long j = 0 ; j < fn ; j++)
558             {
559                 PR (("   --- column %ld of %ld\n", j, fn)) ;
560                 for (Long i = 0 ; i < frank ; i++)
561                 {
562                     if (i == j) PR (("      [ diag:     ")) ;
563                     else        PR (("      row %4ld    ", i)) ;
564                     PR ((" %10.4g", cpuF [fn*i+j])) ;
565                     if (i == j) PR ((" ]\n")) ;
566                     else        PR (("\n")) ;
567                 }
568                 PR (("\n")) ;
569             }
570         }
571 #endif
572 
573         // ---------------------------------------------------------------------
574         // Pack R from each front onto the Davis Stack.
575         // ---------------------------------------------------------------------
576 
577         for(Long p=sStart; p<sEnd; p++)
578         {
579             Long f = Post[p];
580             Long relp = leftoverChildren + (p - sStart);
581 
582             Long fm = Fm[f];                     // F has fm rows
583             Long fn = Rp [f+1] - Rp [f] ;        // F has fn cols
584             Long fp = Super [f+1] - Super [f] ;  // F has fp pivot columns
585             Long frank = MIN(fm, fp);
586             double *cpuF = (&fronts[relp])->cpuR;
587 
588             // cpuF for frontal matrix f has been factorized on the GPU and
589             // copied to the CPU.  It is (frank+cm)-by-fn, stored by row.  The
590             // entries in R for this front f are in the first frank rows, in
591             // upper triangular form and stored in unpacked form.  On the Davis
592             // stack, R is stored by columns in packed form.
593 
594             // Record where R is in the Davis stack
595             Rblock [f] = Stack_head ;
596 
597             // copy the leading upper triangular part from cpuF to R
598             for (Long j = 0 ; j < frank ; j++)
599             {
600                 // copy column j of the front from cpuF to R
601                 for (Long i = 0 ; i <= j ; i++)
602                 {
603                     (*Stack_head++) = cpuF [fn*i+j] ;
604                 }
605             }
606             // copy the rectangular part from cpuF to R
607             for (Long j = frank ; j < fn ; j++)
608             {
609                 // copy column j of the front from cpuF to R
610                 for (Long i = 0 ; i < frank ; i++)
611                 {
612                     (*Stack_head++) = cpuF [fn*i+j] ;
613                 }
614             }
615 
616             // When packed, R has frank*fn - (frank*(frank-1))/2 entries.
617             ASSERT ((Stack_head - Rblock [f]) ==
618                     (frank*fn - (frank*(frank-1))/2)) ;
619 
620             // If the front needs to be staged, copy the CBlock into limbo.
621             SparseMeta *meta = &((&fronts[relp])->sparseMeta);
622             if(meta->isStaged)
623             {
624                 // offset into R to get the start of C
625                 double *C = cpuF + frank * fn + fp;
626 
627                 PR (("   ===> move to Limbo\n")) ;
628 
629                 // allocate a Limbo
630                 int cn = fn - fp;
631                 int cm = MIN (fm-frank, cn);
632 
633                 PR (("frontList[%ld] is front %ld, staged;", relp, f)) ;
634                 PR (("CBlock is %d by %d\n", cm, cn)) ;
635 
636                 // calloc pagelocked memory on the CPU
637                 Workspace *wsLimbo =
638                     LimboDirectory[f] = Workspace::allocate (cm * cn,   // CPU
639                     sizeof(double), true, true, false, true);
640 
641                 if (wsLimbo == NULL)
642                 {
643                     ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
644                     FREE_ALL_WORKSPACE ;
645                     return ;
646                 }
647 
648                 double *L = CPU_REFERENCE(wsLimbo, double*);
649 
650                 // copy from C into Limbo
651                 for(Long i=0; i<cm; i++)
652                 {
653                     for(Long j=i; j<cn; j++)
654                     {
655                         double value = C[i*fn+j];
656                         L[i*cn+j] = value;
657                         PR (("%f\n", value));
658                     }
659                     PR (("\n"));
660                 }
661             }
662         }
663 
664         // cleanup after this stage
665 
666         #if 0
667         // This is not needed since Front->F is NULL (nothing on the CPU)
668         for(int f=0; f<numFronts; f++)
669         {
670             // invoke destructor without freeing memory
671             Front *front = (&fronts[f]);
672             front->~Front();
673         }
674         #endif
675 
676     }
677 
678     // -------------------------------------------------------------------------
679     // cleanup the cuda memory transfer stream and free workspaces
680     // -------------------------------------------------------------------------
681 
682     FREE_ALL_WORKSPACE ;
683     // done using Iwork for InvPost ]
684 
685     // -------------------------------------------------------------------------
686     // cleanup rank detection results (no rank detection done by the GPU)
687     // -------------------------------------------------------------------------
688 
689     Work [stack].Stack_head = Stack_head  ;
690     Work [stack].Stack_top = Stack_top ;
691 
692     // Compute sumfrank and maxfrank in a last-minute manner, and find Rdead
693     for(Long f=0; f<nf; f++)
694     {
695         // compute basic front fields
696         Long fm = Fm[f];                    // F is fm-by-fn
697         // Long fn = Rp [f+1] - Rp [f];
698         Long col1 = Super [f] ;             // first global pivot col in F
699         Long fp = Super[f+1] - col1 ;       // with fp pivot columns
700         Long frank = MIN(fm, fp);
701         PR (("cleanup front %ld, %ld by %ld with %ld pivots, frank %ld\n",
702             f, fm, Rp [f+1] - Rp [f], fp, frank)) ;
703 
704         if (fp > fm)
705         {
706             // hit the wall.  The front is fm-by-fn but has fp pivot columns
707             PR (("hit the wall, npiv %ld k %ld rank %ld\n", fp, fm, fm)) ;
708             for (Long k = fm ; k < fp ; k++)
709             {
710                 Rdead [col1 + k] = 1 ;
711             }
712         }
713 
714         sumfrank += frank ;
715         maxfrank = MAX (maxfrank, frank) ;
716     }
717 
718     Work [stack].sumfrank = sumfrank ;  // sum rank of fronts for this stack
719     Work [stack].maxfrank = maxfrank ;  // max rank of fronts for this stack
720 
721     TOC(complete) ;
722     PR (("%f complete time\n", complete));
723 }
724 
725 // -------------------------------------------------------------------------
726 
spqrgpu_kernel(spqr_blob<Complex> * Blob)727 void spqrgpu_kernel
728 (
729     spqr_blob <Complex> *Blob
730 )
731 {
732     // complex case not yet supported on the GPU
733     cholmod_common *cc = Blob->cc ;
734     ERROR (CHOLMOD_INVALID, "complex case not yet supported on the GPU") ;
735     return ;
736 }
737 #endif
738