1 // =============================================================================
2 // === spqr_factorize ==========================================================
3 // =============================================================================
4 
5 // Given an m-by-n sparse matrix A and its symbolic analsys, compute its
6 // numeric QR factorization.
7 //
8 // Note that the first part of the code allocates workspace, and the resulting
9 // QR factors.  It then does all its work in that amount of space, with no
10 // reallocations.  The size of the memory blocks that are allocated are known
11 // a priori, from the symbolic analysis, and do not depend upon the specific
12 // numerical values.  When done, it frees its workspace.
13 //
14 // Furthermore, if rank detection is disabled (with tol < 0), the actual flops
15 // become perfectly repeatable; the code does the same thing regardless of
16 // the actual numerical values.
17 //
18 // These characteristics make this QR factorization a candidate for a
19 // hardware-in-the-loop solution, for control applications (for example).
20 // However, the Stacks must not be shrunk when the factorization is done.
21 //
22 // FUTURE: split this function into three parts:
23 //     (a) allocate workspace and resulting QR factors
24 //     (b) factorize (no malloc/free)
25 //     (c) free workspace and optionally shrink the Stacks
26 // Then part (b) could be called many times with different values but same
27 // nonzero pattern.
28 
29 // =============================================================================
30 // === macros ==================================================================
31 // =============================================================================
32 
33 #include "spqr.hpp"
34 
35 #define FCHUNK 32        // FUTURE: make a parameter; Householder block size
36 
37 // Free Sx, A, and all of Work except the Work [0:ns-1] array itself
38 #define FREE_WORK_PART1 \
39 { \
40     free_Work <Entry> (Work, ns, n, maxfn, wtsize, cc) ; \
41     if (freeA) cholmod_l_free_sparse (Ahandle, cc) ; \
42     cholmod_l_free (anz, sizeof (Entry), Sx, cc) ; \
43     Sx = NULL ; \
44 }
45 
46 // Free Cblock and the Work array itself
47 #define FREE_WORK_PART2 \
48 { \
49     cholmod_l_free (ns, sizeof (spqr_work <Entry>), Work, cc) ; \
50     Work = NULL ; \
51     cholmod_l_free (nf+1, sizeof (Entry *), Cblock, cc) ; \
52     Cblock = NULL ; \
53 }
54 
55 // Free all workspace
56 #define FREE_WORK \
57 { \
58     FREE_WORK_PART1 ; \
59     FREE_WORK_PART2 ; \
60 }
61 
62 
63 // =============================================================================
64 // === get_Work ================================================================
65 // =============================================================================
66 
67 // Allocate the Work object, which is an array of structs.  The entry Work [s]
68 // contains workspace for the Stack s, for each Stack 0 to ns-1.
69 
get_Work(Long ns,Long n,Long maxfn,Long keepH,Long fchunk,Long * p_wtsize,cholmod_common * cc)70 template <typename Entry> spqr_work <Entry> *get_Work
71 (
72     Long ns,            // number of stacks
73     Long n,             // number of columns of A
74     Long maxfn,         // largest number of columns in any front
75     Long keepH,         // if true, H is kept
76     Long fchunk,
77     Long *p_wtsize,     // size of WTwork for each
78     cholmod_common *cc
79 )
80 {
81     int ok = TRUE ;
82     spqr_work <Entry> *Work ;
83     Long wtsize ;
84     *p_wtsize = 0 ;
85 
86     // wtsize = (fchunk + (keepH ? 0:1)) * maxfn ;
87     wtsize = spqr_mult (fchunk + (keepH ? 0:1), maxfn, &ok) ;
88 
89     Work = (spqr_work <Entry> *)
90         cholmod_l_malloc (ns, sizeof (spqr_work <Entry>), cc) ;
91 
92     if (!ok || cc->status < CHOLMOD_OK)
93     {
94         // out of memory or Long overflow
95         cholmod_l_free (ns, sizeof (spqr_work <Entry>), Work, cc) ;
96         ERROR (CHOLMOD_OUT_OF_MEMORY, "out of memory") ;
97         return (NULL) ;
98     }
99 
100     for (Long stack = 0 ; stack < ns ; stack++)
101     {
102         Work [stack].Fmap = (Long *) cholmod_l_malloc (n, sizeof (Long), cc) ;
103         Work [stack].Cmap = (Long *) cholmod_l_malloc (maxfn, sizeof(Long), cc);
104         if (keepH)
105         {
106             // Staircase is a permanent part of H
107             Work [stack].Stair1 = NULL ;
108         }
109         else
110         {
111             // Staircase workspace reused for each front
112             Work [stack].Stair1 =
113                 (Long *) cholmod_l_malloc (maxfn, sizeof (Long), cc) ;
114         }
115         Work [stack].WTwork =
116             (Entry *) cholmod_l_malloc (wtsize, sizeof (Entry), cc) ;
117         Work [stack].sumfrank = 0 ;
118         Work [stack].maxfrank = 0 ;
119 
120         Work [stack].wscale = 0 ;
121         Work [stack].wssq   = 0 ;
122     }
123 
124     *p_wtsize = wtsize ;
125     return (Work) ;
126 }
127 
128 
129 // =============================================================================
130 // === free_Work ===============================================================
131 // =============================================================================
132 
133 // Free the contents of Work, but not the Work array itself
134 
free_Work(spqr_work<Entry> * Work,Long ns,Long n,Long maxfn,Long wtsize,cholmod_common * cc)135 template <typename Entry> void free_Work
136 (
137     spqr_work <Entry> *Work,
138     Long ns,            // number of stacks
139     Long n,             // number of columns of A
140     Long maxfn,         // largest number of columns in any front
141     Long wtsize,        // size of WTwork array for each Stack
142     cholmod_common *cc
143 )
144 {
145     if (Work != NULL)
146     {
147         for (Long stack = 0 ; stack < ns ; stack++)
148         {
149             cholmod_l_free (n,      sizeof (Long),   Work [stack].Fmap,   cc) ;
150             cholmod_l_free (maxfn,  sizeof (Long),   Work [stack].Cmap,   cc) ;
151             cholmod_l_free (maxfn,  sizeof (Long),   Work [stack].Stair1, cc) ;
152             cholmod_l_free (wtsize, sizeof (Entry), Work [stack].WTwork, cc) ;
153             Work [stack].Fmap = NULL ;
154             Work [stack].Cmap = NULL ;
155             Work [stack].Stair1 = NULL ;
156             Work [stack].WTwork = NULL ;
157         }
158     }
159 }
160 
161 
162 // =============================================================================
163 // === spqr_factorize ==========================================================
164 // =============================================================================
165 
spqr_factorize(cholmod_sparse ** Ahandle,Long freeA,double tol,Long ntol,spqr_symbolic * QRsym,cholmod_common * cc)166 template <typename Entry> spqr_numeric <Entry> *spqr_factorize
167 (
168     // input, optionally freed on output
169     cholmod_sparse **Ahandle,
170 
171     // inputs, not modified
172     Long freeA,                     // if TRUE, free A on output
173     double tol,                     // for rank detection
174     Long ntol,                      // apply tol only to first ntol columns
175     spqr_symbolic *QRsym,
176 
177     // workspace and parameters
178     cholmod_common *cc
179 )
180 {
181     Long *Wi, *Qfill, *PLinv, *Cm, *Sp, *Stack_size,
182         *TaskFront, *TaskFrontp, *TaskStack, *Stack_maxstack ;
183     Entry *Sx, **Rblock, **Cblock, **Stacks ;
184     spqr_numeric <Entry> *QRnum ;
185     Long nf, m, n, anz, fchunk, maxfn, rank, maxfrank, rjsize, rank1,
186         maxstack,j, wtsize, stack, ns, ntasks, keepH, hisize ;
187     char *Rdead ;
188     cholmod_sparse *A ;
189     spqr_work <Entry> *Work ;
190 
191     // -------------------------------------------------------------------------
192     // get inputs and contents of symbolic object
193     // -------------------------------------------------------------------------
194 
195     if (QRsym == NULL)
196     {
197         // out of memory in caller
198         if (freeA)
199         {
200             // if freeA is true, A must always be freed, even on error
201             cholmod_l_free_sparse (Ahandle, cc) ;
202         }
203         return (NULL) ;
204     }
205 
206     A = *Ahandle ;
207 
208     nf = QRsym->nf ;                // number of frontal matrices
209     m = QRsym->m ;                  // A is m-by-n
210     n = QRsym->n ;
211     anz = QRsym->anz ;              // nnz (A)
212 
213     keepH = QRsym->keepH ;
214 
215     rjsize = QRsym->rjsize ;
216 
217     Sp = QRsym->Sp ;                // size m+1, row pointers for S
218     Qfill = QRsym->Qfill ;          // fill-reducing ordering
219     PLinv = QRsym->PLinv ;          // size m, leftmost column sort
220 
221     ns = QRsym->ns ;                // number of stacks
222     ntasks = QRsym->ntasks ;        // number of tasks
223 
224     // FUTURE: compute a unique maxfn for each stack.  Current maxfn is OK, but
225     // it's a global max of the fn of all fronts, and need only be max fn of
226     // the fronts in any given stack.
227 
228     maxfn  = QRsym->maxfn ;         // max # of columns in any front
229     ASSERT (maxfn <= n) ;
230     hisize = QRsym->hisize ;        // # of integers in Hii, Householder vectors
231 
232     TaskFrontp = QRsym->TaskFrontp ;
233     TaskFront  = QRsym->TaskFront ;
234     TaskStack  = QRsym->TaskStack ;
235 
236     maxstack = QRsym->maxstack ;
237     Stack_maxstack = QRsym->Stack_maxstack ;
238 
239     if (!(QRsym->do_rank_detection))
240     {
241         // disable rank detection if not accounted for in analysis
242         tol = -1 ;
243     }
244 
245     // If there is one task, there is only one stack, and visa versa
246     ASSERT ((ns == 1) == (ntasks == 1)) ;
247 
248     PR (("factorize with ns %ld ntasks %ld\n", ns, ntasks)) ;
249 
250     // -------------------------------------------------------------------------
251     // allocate workspace
252     // -------------------------------------------------------------------------
253 
254     cholmod_l_allocate_work (0, MAX (m,nf), 0, cc) ;
255 
256     // shared Long workspace
257     Wi = (Long *) cc->Iwork ;   // size m, aliased with the rest of Iwork
258     Cm = Wi ;                   // size nf
259 
260     // Cblock is workspace shared by all threads
261     Cblock = (Entry **) cholmod_l_malloc (nf+1, sizeof (Entry *), cc) ;
262 
263     Work = NULL ;               // Work and its contents not yet allocated
264     fchunk = MIN (m, FCHUNK) ;
265     wtsize = 0 ;
266 
267     // -------------------------------------------------------------------------
268     // create S
269     // -------------------------------------------------------------------------
270 
271     // create numeric values of S = A(p,q) in row-form in Sx
272     Sx = (Entry *) cholmod_l_malloc (anz, sizeof (Entry), cc) ;
273 
274     if (cc->status == CHOLMOD_OK)
275     {
276         // use Wi as workspace (Iwork (0:m-1)) [
277         spqr_stranspose2 (A, Qfill, Sp, PLinv, Sx, Wi) ;
278         // Wi no longer needed ]
279     }
280 
281     PR (("in spqr_factorize, status after creating Sx: %d\n", cc->status)) ;
282 
283     // -------------------------------------------------------------------------
284     // input matrix A no longer needed; free it if the user doesn't need it
285     // -------------------------------------------------------------------------
286 
287     if (freeA)
288     {
289         // this is done even if out of memory, above
290         cholmod_l_free_sparse (Ahandle, cc) ;
291         ASSERT (*Ahandle == NULL) ;
292     }
293     PR (("in spqr_factorize, freed A, status %d\n", cc->status)) ;
294 
295     if (cc->status < CHOLMOD_OK)
296     {
297         PR (("in spqr_factorize, failure %d\n", cc->status)) ;
298         // out of memory
299         FREE_WORK ;
300         return (NULL) ;
301     }
302 
303     // -------------------------------------------------------------------------
304     // allocate numeric object
305     // -------------------------------------------------------------------------
306 
307     QRnum = (spqr_numeric<Entry> *)
308         cholmod_l_malloc (1, sizeof (spqr_numeric<Entry>), cc) ;
309     PR (("after allocating numeric object header, status %d\n", cc->status)) ;
310 
311     if (cc->status < CHOLMOD_OK)
312     {
313         // out of memory
314         FREE_WORK ;
315         return (NULL) ;
316     }
317 
318     Rblock     = (Entry **) cholmod_l_malloc (nf, sizeof (Entry *), cc) ;
319     Rdead      = (char *)   cholmod_l_calloc (n,  sizeof (char),    cc) ;
320 
321     // these may be revised (with ns=1) if we run out of memory
322     Stacks     = (Entry **) cholmod_l_calloc (ns, sizeof (Entry *), cc) ;
323     Stack_size = (Long *)   cholmod_l_calloc (ns, sizeof (Long),    cc) ;
324 
325     QRnum->Rblock     = Rblock ;
326     QRnum->Rdead      = Rdead ;
327     QRnum->Stacks     = Stacks ;
328     QRnum->Stack_size = Stack_size ;
329 
330     if (keepH)
331     {
332         // allocate permanent space for Stair, Tau, Hii for each front
333         QRnum->HStair= (Long *)  cholmod_l_malloc (rjsize, sizeof (Long),  cc) ;
334         QRnum->HTau  = (Entry *) cholmod_l_malloc (rjsize, sizeof (Entry), cc) ;
335         QRnum->Hii   = (Long *)  cholmod_l_malloc (hisize, sizeof (Long),  cc) ;
336         QRnum->Hm    = (Long *)  cholmod_l_malloc (nf,     sizeof (Long),  cc) ;
337         QRnum->Hr    = (Long *)  cholmod_l_malloc (nf,     sizeof (Long),  cc) ;
338         QRnum->HPinv = (Long *)  cholmod_l_malloc (m,      sizeof (Long),  cc) ;
339     }
340     else
341     {
342         // H is not kept; this part of the numeric object is not used
343         QRnum->HStair = NULL ;
344         QRnum->HTau = NULL ;
345         QRnum->Hii = NULL ;
346         QRnum->Hm = NULL ;
347         QRnum->Hr = NULL ;
348         QRnum->HPinv = NULL ;
349     }
350 
351     QRnum->n = n ;
352     QRnum->m = m ;
353     QRnum->nf = nf ;
354     QRnum->rjsize = rjsize ;
355     QRnum->hisize = hisize ;
356     QRnum->keepH = keepH ;
357     QRnum->maxstack = maxstack ;
358     QRnum->ns = ns ;
359     QRnum->ntasks = ntasks ;
360     QRnum->maxfm = EMPTY ;      // max (Hm [0:nf-1]), computed only if H is kept
361 
362     if (cc->status < CHOLMOD_OK)
363     {
364         // out of memory
365         spqr_freenum (&QRnum, cc) ;
366         FREE_WORK ;
367         return (NULL) ;
368     }
369 
370     PR (("after allocating rest of numeric object, status %d\n", cc->status)) ;
371 
372     // -------------------------------------------------------------------------
373     // allocate workspace
374     // -------------------------------------------------------------------------
375 
376     Work = get_Work <Entry> (ns, n, maxfn, keepH, fchunk, &wtsize, cc) ;
377     PR (("after allocating work, status %d\n", cc->status)) ;
378 
379     // -------------------------------------------------------------------------
380     // allocate and initialize each Stack
381     // -------------------------------------------------------------------------
382 
383     if (cc->status == CHOLMOD_OK)
384     {
385         for (stack = 0 ; stack < ns ; stack++)
386         {
387             Entry *Stack ;
388             size_t stacksize = (ntasks == 1) ?
389                 maxstack : Stack_maxstack [stack] ;
390             Stack_size [stack] = stacksize ;
391             Stack = (Entry *) cholmod_l_malloc (stacksize, sizeof (Entry), cc) ;
392             Stacks [stack] = Stack ;
393             Work [stack].Stack_head = Stack ;
394             Work [stack].Stack_top  = Stack + stacksize ;
395         }
396     }
397 
398     PR (("after allocating the stacks, status %d\n", cc->status)) ;
399 
400     // -------------------------------------------------------------------------
401     // punt to sequential case and fchunk = 1 if out of memory
402     // -------------------------------------------------------------------------
403 
404     if (cc->status < CHOLMOD_OK)
405     {
406         // PUNT: ran out of memory; try again with smaller workspace
407         // out of memory; free any stacks that were successfully allocated
408         if (Stacks != NULL)
409         {
410             for (stack = 0 ; stack < ns ; stack++)
411             {
412                 size_t stacksize = (ntasks == 1) ?
413                     maxstack : Stack_maxstack [stack] ;
414                 cholmod_l_free (stacksize, sizeof (Entry), Stacks [stack], cc) ;
415             }
416         }
417         cholmod_l_free (ns, sizeof (Entry *), Stacks,     cc) ;
418         cholmod_l_free (ns, sizeof (Long),    Stack_size, cc) ;
419 
420         // free the contents of Work, and the Work array itself
421         free_Work <Entry> (Work, ns, n, maxfn, wtsize, cc) ;
422         cholmod_l_free (ns, sizeof (spqr_work <Entry>), Work, cc) ;
423 
424         // punt to a single stack, a single task, and fchunk of 1
425         ns = 1 ;
426         ntasks = 1 ;
427         fchunk = 1 ;
428         cc->status = CHOLMOD_OK ;
429         Work = get_Work <Entry> (ns, n, maxfn, keepH, fchunk, &wtsize, cc) ;
430         Stacks     = (Entry **) cholmod_l_calloc (ns, sizeof (Entry *), cc) ;
431         Stack_size = (Long *)   cholmod_l_calloc (ns, sizeof (Long),    cc) ;
432         QRnum->Stacks     = Stacks ;
433         QRnum->Stack_size = Stack_size ;
434         if (cc->status == CHOLMOD_OK)
435         {
436             Entry *Stack ;
437             Stack_size [0] = maxstack ;
438             Stack = (Entry *) cholmod_l_malloc (maxstack, sizeof (Entry), cc) ;
439             Stacks [0] = Stack ;
440             Work [0].Stack_head = Stack ;
441             Work [0].Stack_top  = Stack + maxstack ;
442         }
443     }
444 
445     // actual # of stacks and tasks used
446     QRnum->ns = ns ;
447     QRnum->ntasks = ntasks ;
448 
449     // -------------------------------------------------------------------------
450     // check if everything was allocated OK
451     // -------------------------------------------------------------------------
452 
453     if (cc->status < CHOLMOD_OK)
454     {
455         spqr_freenum (&QRnum, cc) ;
456         FREE_WORK ;
457         return (NULL) ;
458     }
459 
460     // At this point, the factorization is guaranteed to succeed, unless
461     // sizeof (BLAS_INT) < sizeof (Long), in which case, you really should get
462     // a 64-bit BLAS.
463 
464     // -------------------------------------------------------------------------
465     // create the Blob : everything the numeric factorization kernel needs
466     // -------------------------------------------------------------------------
467 
468     spqr_blob <Entry> Blob ;
469     Blob.QRsym = QRsym ;
470     Blob.QRnum = QRnum ;
471     Blob.tol = tol ;
472     Blob.Work = Work ;
473     Blob.Cm = Cm ;
474     Blob.Cblock = Cblock ;
475     Blob.Sx = Sx ;
476     Blob.ntol = ntol ;
477     Blob.fchunk = fchunk ;
478     Blob.cc = cc ;
479 
480     // -------------------------------------------------------------------------
481     // initialize the "pure" flop count (for performance testing only)
482     // -------------------------------------------------------------------------
483 
484     cc->SPQR_flopcount = 0 ;
485 
486     // -------------------------------------------------------------------------
487     // numeric QR factorization
488     // -------------------------------------------------------------------------
489 
490     PR (("[ calling the kernel\n")) ;
491 
492     if (ntasks == 1)
493     {
494         // Just one task, with or without TBB installed: don't use TBB
495         spqr_kernel (0, &Blob) ;        // sequential case
496     }
497     else
498     {
499 #ifdef HAVE_TBB
500         // parallel case: TBB is installed, and there is more than one task
501         int nthreads = MAX (0, cc->SPQR_nthreads) ;
502         spqr_parallel (ntasks, nthreads, &Blob) ;
503 #else
504         // TBB not installed, but the work is still split into multiple tasks.
505         // do tasks 0 to ntasks-2 (skip the placeholder root task id = ntasks-1)
506         for (Long id = 0 ; id < ntasks-1 ; id++)
507         {
508             spqr_kernel (id, &Blob) ;
509         }
510 #endif
511     }
512 
513     PR (("] did the kernel\n")) ;
514 
515     // -------------------------------------------------------------------------
516     // check for BLAS Long overflow on the CPU, or GPU failure
517     // -------------------------------------------------------------------------
518 
519     if (cc->status < CHOLMOD_OK)
520     {
521         // On the CPU, this case can only occur if the problem is too large for
522         // the BLAS.  This can only occur if, for example you're on a 64-bit
523         // platform (with sizeof (Long) = 8) and using a 32-bit BLAS (with
524         // sizeof (BLAS_INT) = 4).  On the GPU, this case can more easily
525         // occur, if we run out of memory in spqrgpu_kernel, or if we fail to
526         // communicate with the GPU.
527         spqr_freenum (&QRnum, cc) ;
528         FREE_WORK ;
529         return (NULL) ;
530     }
531 
532     // -------------------------------------------------------------------------
533     // finalize the rank
534     // -------------------------------------------------------------------------
535 
536     PR (("finalize the rank\n")) ;
537     rank = 0 ;
538     maxfrank = 1 ;
539     for (stack = 0 ; stack < ns ; stack++)
540     {
541         PR (("stack: %ld Work [stack].sumfrank: %ld\n", stack, Work [stack].sumfrank)) ;
542         rank += Work [stack].sumfrank ;
543         maxfrank = MAX (maxfrank, Work [stack].maxfrank) ;
544     }
545     QRnum->rank = rank ;                    // required by spqr_hpinv
546     QRnum->maxfrank = maxfrank ;
547     PR (("m %ld n %ld my QR rank %ld\n", m, n, rank)) ;
548 
549     // -------------------------------------------------------------------------
550     // finalize norm(w) for the dead column 2-norms
551     // -------------------------------------------------------------------------
552 
553     double wscale = 0 ;
554     double wssq = 1 ;
555     for (stack = 0 ; stack < ns ; stack++)
556     {
557         // norm_E_fro = norm (s.*sqrt(q)) ; see also LAPACK's dnrm2
558         double ws = Work [stack].wscale ;
559         double wq = Work [stack].wssq ;
560         if (wq != 0)
561         {
562             double wk = ws * sqrt (wq) ;
563             if (wscale < wk)
564             {
565                 double rr = wscale / wk ;
566                 wssq = 1 + wssq * rr * rr ;
567                 wscale = wk ;
568             }
569             else
570             {
571                 double rr = wk / wscale ;
572                 wssq += rr * rr ;
573             }
574         }
575     }
576     QRnum->norm_E_fro = wscale * sqrt (wssq) ;
577     cc->SPQR_norm_E_fro = QRnum->norm_E_fro ;
578 
579     // -------------------------------------------------------------------------
580     // free all workspace, except Cblock and Work
581     // -------------------------------------------------------------------------
582 
583     FREE_WORK_PART1 ;
584 
585     // -------------------------------------------------------------------------
586     // shrink the Stacks to hold just R (and H, if H kept)
587     // -------------------------------------------------------------------------
588 
589     // If shrink is <= 0, then the Stacks are not modified.
590     // If shrink is 1, each Stack is realloc'ed to the right size (default)
591     // If shrink is > 1, then each Stack is forcibly moved and shrunk.
592     // This option is mainly meant for testing, not production use.
593     // It should be left at 1 for production use.
594 
595     Long any_moved = FALSE ;
596 
597     int shrink = cc->SPQR_shrink ;
598 
599     if (shrink > 0)
600     {
601         for (stack = 0 ; stack < ns ; stack++)
602         {
603             // stacksize is the current size of the this Stack
604             size_t stacksize = Stack_size [stack] ;
605             Entry *Stack = Stacks [stack] ;
606             // Work [stack].Stack_head points to the first empty slot in stack,
607             // so newstacksize is the size of the space in use by R and H.
608             size_t newstacksize = Work [stack].Stack_head - Stack ;
609             ASSERT (newstacksize <= stacksize) ;
610             // Reduce the size of this stack.  Cblock [0:nf-1] is no longer
611             // needed for holding pointers to the C blocks of each frontal
612             // matrix.  Reuse it to hold the reallocated stacks.
613             if (shrink > 1)
614             {
615                 // force the block to move by malloc'ing a new one;
616                 // this option is mainly for testing only.
617                 Cblock [stack] = (Entry *) cholmod_l_malloc (newstacksize,
618                     sizeof (Entry), cc) ;
619                 if (Cblock [stack] == NULL)
620                 {
621                     // oops, the malloc failed; just use the old block
622                     cc->status = CHOLMOD_OK ;
623                     Cblock [stack] = Stack ;
624                     // update the memory usage statistics, however
625 		    cc->memory_inuse +=
626                         ((newstacksize-stacksize) * sizeof (Entry)) ;
627                 }
628                 else
629                 {
630                     // malloc is OK; copy the block over and free the old one
631                     memcpy (Cblock [stack], Stack, newstacksize*sizeof(Entry)) ;
632                     cholmod_l_free (stacksize, sizeof (Entry), Stack, cc) ;
633                 }
634                 // the Stack has been shrunk to the new size
635                 stacksize = newstacksize ;
636             }
637             else
638             {
639                 // normal method; just realloc the block
640                 PR (("Normal shrink of the stack: %ld to %ld\n",
641                     stacksize, newstacksize)) ;
642                 Cblock [stack] =    // pointer to the new Stack
643                     (Entry *) cholmod_l_realloc (
644                     newstacksize,   // requested size of Stack, in # of Entries
645                     sizeof (Entry), // size of each Entry in the Stack
646                     Stack,          // pointer to the old Stack
647                     &stacksize,     // input: old stack size; output: new size
648                     cc) ;
649             }
650             Stack_size [stack] = stacksize ;
651             any_moved = any_moved || (Cblock [stack] != Stack) ;
652             // reducing the size of a block of memory always succeeds
653             ASSERT (cc->status == CHOLMOD_OK) ;
654         }
655     }
656 
657     // -------------------------------------------------------------------------
658     // adjust the Rblock pointers if the Stacks have been moved
659     // -------------------------------------------------------------------------
660 
661     if (any_moved)
662     {
663         // at least one Stack has moved; check all fronts and adjust them
664         for (Long task = 0 ; task < ntasks ; task++)
665         {
666             Long kfirst, klast ;
667             if (ntasks == 1)
668             {
669                 // sequential case
670                 kfirst = 0 ;
671                 klast = nf ;
672                 stack = 0 ;
673             }
674             else
675             {
676                 kfirst = TaskFrontp [task] ;
677                 klast  = TaskFrontp [task+1] ;
678                 stack  = TaskStack [task] ;
679             }
680             ASSERT (stack >= 0 && stack < ns) ;
681             Entry *Old_Stack = Stacks [stack] ;
682             Entry *New_Stack = Cblock [stack] ;
683             if (New_Stack != Old_Stack)
684             {
685                 for (Long kf = kfirst ; kf < klast ; kf++)
686                 {
687                     Long f = (ntasks == 1) ? kf : TaskFront [kf] ;
688                     Rblock [f] = New_Stack + (Rblock [f] - Old_Stack) ;
689                 }
690             }
691         }
692         // finalize the Stacks
693         for (stack = 0 ; stack < ns ; stack++)
694         {
695             Stacks [stack] = Cblock [stack] ;
696         }
697     }
698 
699     // -------------------------------------------------------------------------
700     // free the rest of the workspace
701     // -------------------------------------------------------------------------
702 
703     FREE_WORK_PART2 ;
704 
705     // -------------------------------------------------------------------------
706     // extract the implicit row permutation for H
707     // -------------------------------------------------------------------------
708 
709     // this must be done sequentially, when all threads are finished
710     if (keepH)
711     {
712         // use Wi as workspace (Iwork (0:m-1)) [
713         spqr_hpinv (QRsym, QRnum, Wi) ;
714         // Wi no longer needed ]
715     }
716 
717     // -------------------------------------------------------------------------
718     // find the rank and return the result
719     // -------------------------------------------------------------------------
720 
721     // find the rank of the first ntol columns of A
722     PR (("find rank of first ntol cols of A: ntol %ld n %ld\n", ntol, n)) ;
723     if (ntol >= n)
724     {
725         rank1 = rank ;
726         PR (("rank1 is rank: %ld\n", rank1)) ;
727     }
728     else
729     {
730         rank1 = 0 ;
731         for (j = 0 ; j < ntol ; j++)
732         {
733             PR (("column %ld Rdead: %d\n", j, (int) Rdead [j])) ;
734             if (!Rdead [j])
735             {
736                 rank1++ ;
737             }
738         }
739         PR (("rank1 is sum of non-Rdead: %ld\n", rank1)) ;
740     }
741     QRnum->rank1 = rank1 ;
742     return (QRnum) ;
743 }
744 
745 
746 // =============================================================================
747 
748 template spqr_numeric <double> *spqr_factorize <double>
749 (
750     // input, optionally freed on output
751     cholmod_sparse **Ahandle,
752 
753     // inputs, not modified
754     Long freeA,                     // if TRUE, free A on output
755     double tol,                     // for rank detection
756     Long ntol,                      // apply tol only to first ntol columns
757     spqr_symbolic *QRsym,
758 
759     // workspace and parameters
760     cholmod_common *cc
761 ) ;
762 
763 // =============================================================================
764 
765 template spqr_numeric <Complex> *spqr_factorize <Complex>
766 (
767     // input, optionally freed on output
768     cholmod_sparse **Ahandle,
769 
770     // inputs, not modified
771     Long freeA,                     // if TRUE, free A on output
772     double tol,                     // for rank detection
773     Long ntol,                      // apply tol only to first ntol columns
774     spqr_symbolic *QRsym,
775 
776     // workspace and parameters
777     cholmod_common *cc
778 ) ;
779