1 /* ========================================================================== */
2 /* === Supernodal/cholmod_super_symbolic ==================================== */
3 /* ========================================================================== */
4 
5 /* -----------------------------------------------------------------------------
6  * CHOLMOD/Supernodal Module. Copyright (C) 2005-2006, Timothy A. Davis
7  * http://www.suitesparse.com
8  * -------------------------------------------------------------------------- */
9 
10 /* Supernodal symbolic analysis of the LL' factorization of A, A*A',
11  * A(:,f)*A(:,f)'.
12  *
13  * This routine must be preceded by a simplicial symbolic analysis
14  * (cholmod_rowcolcounts).  See cholmod_analyze.c for an example of how to use
15  * this routine.
16  *
17  * The user need not call this directly; cholmod_analyze is a "simple" wrapper
18  * for this routine.
19  *
20  * Symmetric case:
21  *
22  *	A is stored in column form, with entries stored in the upper triangular
23  *	part.  Entries in the lower triangular part are ignored.
24  *
25  * Unsymmetric case:
26  *
27  *	A is stored in column form.  If F is equal to the transpose of A, then
28  *	A*A' is analyzed.  F can include a subset of the columns of A
29  *	(F=A(:,f)'), in which case F*F' is analyzed.
30  *
31  * Requires Parent and L->ColCount to be defined on input; these are the
32  * simplicial Parent and ColCount arrays as computed by cholmod_rowcolcounts.
33  * Does not use L->Perm; the input matrices A and F must already be properly
34  * permuted.  Allocates and computes the supernodal pattern of L (L->super,
35  * L->pi, L->px, and L->s).  Does not allocate the real part (L->x).
36  *
37  * Supports any xtype (pattern, real, complex, or zomplex).
38  */
39 
40 #ifndef NGPL
41 #ifndef NSUPERNODAL
42 
43 #include "cholmod_internal.h"
44 #include "cholmod_supernodal.h"
45 
46 #ifdef GPU_BLAS
47 #include "cholmod_gpu.h"
48 #endif
49 
50 /* ========================================================================== */
51 /* === subtree ============================================================== */
52 /* ========================================================================== */
53 
54 /* In the symmetric case, traverse the kth row subtree from the nonzeros in
55  * A (0:k1-1,k) and add the new entries found to the pattern of the kth row
56  * of L.  The current supernode s contains the diagonal block k1:k2-1, so it
57  * can be skipped.
58  *
59  * In the unsymmetric case, the nonzero pattern of A*F is computed one column
60  * at a time (thus, the total time spent in this function is bounded below by
61  * the time taken to multiply A*F, which can be high if A is tall and thin).
62  * The kth column is A*F(:,k), or the set union of all columns A(:,j) for which
63  * F(j,k) is nonzero.  This routine is called once for each entry j.  Only the
64  * upper triangular part is needed, so only A (0:k1-1,j) is accessed, where
65  * k1:k2-1 are the columns of the current supernode s (k is in the range k1 to
66  * k2-1).
67  *
68  * If A is sorted, then the total time taken by this function is proportional
69  * to the number of nonzeros in the strictly block upper triangular part of A,
70  * plus the number of entries in the strictly block lower triangular part of
71  * the supernodal part of L.  This excludes entries in the diagonal blocks
72  * corresponding to the columns in each supernode.  That is, if k1:k2-1 are
73  * in a single supernode, then only A (0:k1-1,k1:k2-1) are accessed.
74  *
75  * For the unsymmetric case, only the strictly block upper triangular part
76  * of A*F is constructed.
77  *
78  * Only adds column indices corresponding to the leading columns of each
79  * relaxed supernode.
80  */
81 
subtree(Int j,Int k,Int Ap[],Int Ai[],Int Anz[],Int SuperMap[],Int Sparent[],Int mark,Int sorted,Int k1,Int Flag[],Int Ls[],Int Lpi2[])82 static void subtree
83 (
84     /* inputs, not modified: */
85     Int j,		/* j = k for symmetric case */
86     Int k,
87     Int Ap [ ],
88     Int Ai [ ],
89     Int Anz [ ],
90     Int SuperMap [ ],
91     Int Sparent [ ],
92     Int mark,
93     Int sorted,         /* true if the columns of A are sorted */
94     Int k1,             /* only consider A (0:k1-1,k) */
95 
96     /* input/output: */
97     Int Flag [ ],
98     Int Ls [ ],
99     Int Lpi2 [ ]
100 )
101 {
102     Int p, pend, i, si ;
103     p = Ap [j] ;
104     pend = (Anz == NULL) ? (Ap [j+1]) : (p + Anz [j]) ;
105 
106     for ( ; p < pend ; p++)
107     {
108 	i = Ai [p] ;
109 	if (i < k1)
110 	{
111 	    /* (i,k) is an entry in the upper triangular part of A or A*F'.
112 	     * symmetric case:   A(i,k) is nonzero (j=k).
113 	     * unsymmetric case: A(i,j) and F(j,k) are both nonzero.
114 	     *
115 	     * Column i is in supernode si = SuperMap [i].  Follow path from si
116 	     * to root of supernodal etree, stopping at the first flagged
117 	     * supernode.  The root of the row subtree is supernode SuperMap[k],
118 	     * which is flagged already. This traversal will stop there, or it
119 	     * might stop earlier if supernodes have been flagged by previous
120 	     * calls to this routine for the same k. */
121 	    for (si = SuperMap [i] ; Flag [si] < mark ; si = Sparent [si])
122 	    {
123 		ASSERT (si <= SuperMap [k]) ;
124 		Ls [Lpi2 [si]++] = k ;
125 		Flag [si] = mark ;
126 	    }
127 	}
128         else if (sorted)
129         {
130             break ;
131         }
132     }
133 }
134 
135 
136 /* clear workspace used by cholmod_super_symbolic */
137 #define FREE_WORKSPACE \
138 { \
139     /* CHOLMOD(clear_flag) (Common) ; */ \
140     CHOLMOD_CLEAR_FLAG (Common) ; \
141     for (k = 0 ; k <= nfsuper ; k++) \
142     { \
143 	Head [k] = EMPTY ; \
144     } \
145     ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ; \
146 } \
147 
148 
149 /* ========================================================================== */
150 /* === cholmod_super_symbolic2 ============================================== */
151 /* ========================================================================== */
152 
153 /* Analyze for supernodal Cholesky or multifrontal QR. */
154 
CHOLMOD(super_symbolic2)155 int CHOLMOD(super_symbolic2)
156 (
157     /* ---- input ---- */
158     int for_whom,       /* FOR_SPQR     (0): for SPQR but not GPU-accelerated
159                            FOR_CHOLESKY (1): for Cholesky (GPU or not)
160                            FOR_SPQRGPU  (2): for SPQR with GPU acceleration */
161     cholmod_sparse *A,	/* matrix to analyze */
162     cholmod_sparse *F,	/* F = A' or A(:,f)' */
163     Int *Parent,	/* elimination tree */
164     /* ---- in/out --- */
165     cholmod_factor *L,	/* simplicial symbolic on input,
166 			 * supernodal symbolic on output */
167     /* --------------- */
168     cholmod_common *Common
169 )
170 {
171     double zrelax0, zrelax1, zrelax2, xxsize ;
172     Int *Wi, *Wj, *Super, *Snz, *Ap, *Ai, *Flag, *Head, *Ls, *Lpi, *Lpx, *Fnz,
173 	*Sparent, *Anz, *SuperMap, *Merged, *Nscol, *Zeros, *Fp, *Fj,
174 	*ColCount, *Lpi2, *Lsuper, *Iwork ;
175     Int nsuper, d, n, j, k, s, mark, parent, p, pend, k1, k2, packed, nscol,
176 	nsrow, ndrow1, ndrow2, stype, ssize, xsize, sparent, plast, slast,
177 	csize, maxcsize, ss, nscol0, nscol1, ns, nfsuper, newzeros, totzeros,
178 	merge, snext, esize, maxesize, nrelax0, nrelax1, nrelax2, Asorted ;
179     size_t w ;
180     int ok = TRUE, find_xsize ;
181     const char* env_use_gpu;
182     const char* env_max_bytes;
183     size_t max_bytes;
184     const char* env_max_fraction;
185     double max_fraction;
186 
187     /* ---------------------------------------------------------------------- */
188     /* check inputs */
189     /* ---------------------------------------------------------------------- */
190 
191     RETURN_IF_NULL_COMMON (FALSE) ;
192     RETURN_IF_NULL (A, FALSE) ;
193     RETURN_IF_NULL (L, FALSE) ;
194     RETURN_IF_NULL (Parent, FALSE) ;
195     RETURN_IF_XTYPE_INVALID (A, CHOLMOD_PATTERN, CHOLMOD_ZOMPLEX, FALSE) ;
196     RETURN_IF_XTYPE_INVALID (L, CHOLMOD_PATTERN, CHOLMOD_PATTERN, FALSE) ;
197     stype = A->stype ;
198     if (stype < 0)
199     {
200 	/* invalid symmetry; symmetric lower form not supported */
201 	ERROR (CHOLMOD_INVALID, "symmetric lower not supported") ;
202 	return (FALSE) ;
203     }
204     if (stype == 0)
205     {
206 	/* F must be present in the unsymmetric case */
207 	RETURN_IF_NULL (F, FALSE) ;
208     }
209     if (L->is_super)
210     {
211 	/* L must be a simplicial symbolic factor */
212 	ERROR (CHOLMOD_INVALID, "L must be symbolic on input") ;
213 	return (FALSE) ;
214     }
215     Common->status = CHOLMOD_OK ;
216 
217     /* ---------------------------------------------------------------------- */
218     /* allocate workspace */
219     /* ---------------------------------------------------------------------- */
220 
221     n = A->nrow ;
222 
223     /* w = 5*n */
224     w = CHOLMOD(mult_size_t) (n, 5, &ok) ;
225     if (!ok)
226     {
227 	ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
228 	return (FALSE) ;
229     }
230 
231     CHOLMOD(allocate_work) (n, w, 0, Common) ;
232     if (Common->status < CHOLMOD_OK)
233     {
234 	/* out of memory */
235 	return (FALSE) ;
236     }
237     ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ;
238 
239     /* ---------------------------------------------------------------------- */
240     /* allocate GPU workspace */
241     /* ---------------------------------------------------------------------- */
242 
243     L->useGPU = 0 ;     /* only used for Cholesky factorization, not QR */
244 
245 #ifdef GPU_BLAS
246 
247     /* GPU module is installed */
248     if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY )
249     {
250         /* only allocate GPU workspace for supernodal Cholesky, and only when
251            the GPU is requested and available. */
252 
253         max_bytes = 0;
254         max_fraction = 0;
255 
256 #ifdef DLONG
257         if ( Common->useGPU == EMPTY )
258         {
259             /* useGPU not explicity requested by the user, but not explicitly
260              * prohibited either.  Query OS environment variables for request.*/
261             env_use_gpu  = getenv("CHOLMOD_USE_GPU");
262 
263             if ( env_use_gpu )
264             {
265                 /* CHOLMOD_USE_GPU environment variable is set to something */
266                 if ( atoi ( env_use_gpu ) == 0 )
267                 {
268                     Common->useGPU = 0; /* don't use the gpu */
269                 }
270                 else
271                 {
272                     Common->useGPU = 1; /* use the gpu */
273                     env_max_bytes = getenv("CHOLMOD_GPU_MEM_BYTES");
274                     env_max_fraction = getenv("CHOLMOD_GPU_MEM_FRACTION");
275                     if ( env_max_bytes )
276                     {
277                         max_bytes = atol(env_max_bytes);
278                         Common->maxGpuMemBytes = max_bytes;
279                     }
280                     if ( env_max_fraction )
281                     {
282                         max_fraction = atof (env_max_fraction);
283                         if ( max_fraction < 0 ) max_fraction = 0;
284                         if ( max_fraction > 1 ) max_fraction = 1;
285                         Common->maxGpuMemFraction = max_fraction;
286                     }
287                 }
288             }
289             else
290             {
291                 /* CHOLMOD_USE_GPU environment variable not set, so no GPU
292                  * acceleration will be used */
293                 Common->useGPU = 0;
294             }
295             /* fprintf (stderr, "useGPU queried: %d\n", Common->useGPU) ; */
296         }
297 
298         /* Ensure that a GPU is present */
299         if ( Common->useGPU == 1 )
300         {
301             /* fprintf (stderr, "\nprobe GPU:\n") ; */
302             Common->useGPU = CHOLMOD(gpu_probe) (Common);   // Cholesky only, not SPQR
303             /* fprintf (stderr, "\nprobe GPU: result %d\n", Common->useGPU) ; */
304         }
305 
306         if ( Common->useGPU == 1 )
307         {
308             /* Cholesky + GPU, so allocate space */
309             /* fprintf (stderr, "allocate GPU:\n") ; */
310             CHOLMOD(gpu_allocate) ( Common );               // Cholesky only, not SPQR
311             /* fprintf (stderr, "allocate GPU done\n") ; */
312         }
313 #else
314         /* GPU acceleration is only supported for long int version */
315         Common->useGPU = 0;
316 #endif
317 
318         /* Cache the fact that the symbolic factorization supports
319          * GPU acceleration */
320         L->useGPU = Common->useGPU;
321 
322     }
323 
324 #else
325     /* GPU module is not installed */
326     Common->useGPU = 0 ;
327 #endif
328 
329     /* ---------------------------------------------------------------------- */
330     /* get inputs */
331     /* ---------------------------------------------------------------------- */
332 
333     /* A is now either A or triu(A(p,p)) for the symmetric case.  It is either
334      * A or A(p,f) for the unsymmetric case (both in column form).  It can be
335      * either packed or unpacked, and either sorted or unsorted.  Entries in
336      * the lower triangular part may be present if A is symmetric, but these
337      * are ignored. */
338 
339     Ap = A->p ;
340     Ai = A->i ;
341     Anz = A->nz ;
342 
343     if (stype != 0)
344     {
345 	/* F not accessed */
346 	Fp = NULL ;
347 	Fj = NULL ;
348 	Fnz = NULL ;
349 	packed = TRUE ;
350     }
351     else
352     {
353 	/* F = A(:,f) or A(p,f) in packed row form, either sorted or unsorted */
354 	Fp = F->p ;
355 	Fj = F->i ;
356 	Fnz = F->nz ;
357 	packed = F->packed ;
358     }
359 
360     ColCount = L->ColCount ;
361 
362     nrelax0 = Common->nrelax [0] ;
363     nrelax1 = Common->nrelax [1] ;
364     nrelax2 = Common->nrelax [2] ;
365 
366     zrelax0 = Common->zrelax [0] ;
367     zrelax1 = Common->zrelax [1] ;
368     zrelax2 = Common->zrelax [2] ;
369 
370     zrelax0 = IS_NAN (zrelax0) ? 0 : zrelax0 ;
371     zrelax1 = IS_NAN (zrelax1) ? 0 : zrelax1 ;
372     zrelax2 = IS_NAN (zrelax2) ? 0 : zrelax2 ;
373 
374     ASSERT (CHOLMOD(dump_parent) (Parent, n, "Parent", Common)) ;
375 
376     /* ---------------------------------------------------------------------- */
377     /* get workspace */
378     /* ---------------------------------------------------------------------- */
379 
380     /* Sparent, Snz, and Merged could be allocated later, of size nfsuper */
381 
382     Iwork = Common->Iwork ;
383     Wi      = Iwork ;	    /* size n (i/l/l).  Lpi2 is i/l/l */
384     Wj      = Iwork + n ;   /* size n (i/l/l).  Zeros is i/l/l */
385     Sparent = Iwork + 2*((size_t) n) ; /* size nfsuper <= n [ */
386     Snz     = Iwork + 3*((size_t) n) ; /* size nfsuper <= n [ */
387     Merged  = Iwork + 4*((size_t) n) ; /* size nfsuper <= n [ */
388 
389     Flag = Common->Flag ;   /* size n */
390     Head = Common->Head ;   /* size n+1 */
391 
392     /* ---------------------------------------------------------------------- */
393     /* find the fundamental supernodes */
394     /* ---------------------------------------------------------------------- */
395 
396     /* count the number of children of each node, using Wi [ */
397     for (j = 0 ; j < n ; j++)
398     {
399 	Wi [j] = 0 ;
400     }
401     for (j = 0 ; j < n ; j++)
402     {
403 	parent = Parent [j] ;
404 	if (parent != EMPTY)
405 	{
406 	    Wi [parent]++ ;
407 	}
408     }
409 
410     Super = Head ;  /* use Head [0..nfsuper] as workspace for Super list ( */
411 
412     /* column 0 always starts a new supernode */
413     nfsuper = (n == 0) ? 0 : 1 ;	/* number of fundamental supernodes */
414     Super [0] = 0 ;
415 
416     for (j = 1 ; j < n ; j++)
417     {
418 	/* check if j starts new supernode, or in the same supernode as j-1 */
419 	if (Parent [j-1] != j	    /* parent of j-1 is not j */
420 	    || (ColCount [j-1] != ColCount [j] + 1) /* j-1 not subset of j*/
421 	    || Wi [j] > 1	    /* j has more than one child */
422 #ifdef GPU_BLAS
423 	    /* Ensure that the supernode will fit in the GPU buffers */
424 	    /* Data size of 16 bytes must be assumed for case of PATTERN */
425 	    || (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU &&
426 		 (j-Super[nfsuper-1]+1) *
427 		 ColCount[Super[nfsuper-1]] * sizeof(double) * 2 >=
428 		 Common->devBuffSize)
429 #endif
430 	    )
431 	{
432 	    /* j is the leading node of a supernode */
433 	    Super [nfsuper++] = j ;
434 	}
435     }
436     Super [nfsuper] = n ;
437 
438     /* contents of Wi no longer needed for child count ] */
439 
440     Nscol = Wi ; /* use Wi as size-nfsuper workspace for Nscol [ */
441 
442     /* ---------------------------------------------------------------------- */
443     /* find the mapping of fundamental nodes to supernodes */
444     /* ---------------------------------------------------------------------- */
445 
446     SuperMap = Wj ;	/* use Wj as workspace for SuperMap [ */
447 
448     /* SuperMap [k] = s if column k is contained in supernode s */
449     for (s = 0 ; s < nfsuper ; s++)
450     {
451 	for (k = Super [s] ; k < Super [s+1] ; k++)
452 	{
453 	    SuperMap [k] = s ;
454 	}
455     }
456 
457     /* ---------------------------------------------------------------------- */
458     /* construct the fundamental supernodal etree */
459     /* ---------------------------------------------------------------------- */
460 
461     for (s = 0 ; s < nfsuper ; s++)
462     {
463 	j = Super [s+1] - 1 ;	/* last node in supernode s */
464 	parent = Parent [j] ;	/* parent of last node */
465 	Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
466 	PRINT1 (("Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
467     }
468 
469     /* contents of Wj no longer needed as workspace for SuperMap ]
470      * SuperMap will be recomputed below, for the relaxed supernodes. */
471 
472     Zeros = Wj ;   /* use Wj for Zeros, workspace of size nfsuper [ */
473 
474     /* ---------------------------------------------------------------------- */
475     /* relaxed amalgamation */
476     /* ---------------------------------------------------------------------- */
477 
478     for (s = 0 ; s < nfsuper ; s++)
479     {
480 	Merged [s] = EMPTY ;			/* s not merged into another */
481 	Nscol [s] = Super [s+1] - Super [s] ;	/* # of columns in s */
482 	Zeros [s] = 0 ;				/* # of zero entries in s */
483 	ASSERT (s <= Super [s]) ;
484 	Snz [s] = ColCount [Super [s]] ;  /* # of entries in leading col of s */
485 	PRINT2 (("lnz ["ID"] "ID"\n", s, Snz [s])) ;
486     }
487 
488     for (s = nfsuper-2 ; s >= 0 ; s--)
489     {
490         double lnz1 ;
491 
492 	/* should supernodes s and s+1 merge into a new node s? */
493 	PRINT1 (("\n========= Check relax of s "ID" and s+1 "ID"\n", s, s+1)) ;
494 
495 	ss = Sparent [s] ;
496 	if (ss == EMPTY)
497 	{
498 	    PRINT1 (("s "ID" is a root, no merge with s+1 = "ID"\n", s, s+1)) ;
499 	    continue ;
500 	}
501 
502 	/* find the current parent of s (perform path compression as needed) */
503 	for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = Merged [ss]) ;
504 	sparent = ss ;
505 	PRINT2 (("Current sparent of s "ID" is "ID"\n", s, sparent)) ;
506 
507 	/* ss is the current parent of s */
508 	for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = snext)
509 	{
510 	    snext = Merged [ss] ;
511 	    PRINT2 (("ss "ID" is dead, merged into snext "ID"\n", ss, snext)) ;
512 	    Merged [ss] = sparent ;
513 	}
514 
515 	/* if s+1 is not the current parent of s, do not merge */
516 	if (sparent != s+1)
517 	{
518 	    continue ;
519 	}
520 
521 	nscol0 = Nscol [s] ;	/* # of columns in s */
522 	nscol1 = Nscol [s+1] ;	/* # of columns in s+1 */
523 	ns = nscol0 + nscol1 ;
524 	PRINT2 (("ns "ID" nscol0 "ID" nscol1 "ID"\n", ns, nscol0, nscol1)) ;
525 
526 	totzeros = Zeros [s+1] ;	/* current # of zeros in s+1 */
527 	lnz1 = (double) (Snz [s+1]) ;	/* # entries in leading column of s+1 */
528 
529 	/* determine if supernodes s and s+1 should merge */
530 	if (ns <= nrelax0)
531 	{
532 	    PRINT2 (("ns is tiny ("ID"), so go ahead and merge\n", ns)) ;
533 	    merge = TRUE ;
534 	}
535 	else
536 	{
537 	    /* use double to avoid integer overflow */
538 	    double lnz0 = Snz [s] ;	/* # entries in leading column of s */
539 	    double xnewzeros = nscol0 * (lnz1 + nscol0 - lnz0) ;
540 
541 	    /* use Int for the final update of Zeros [s] below */
542 	    newzeros = nscol0 * (Snz [s+1] + nscol0 - Snz [s]) ;
543 	    ASSERT (newzeros == xnewzeros) ;
544 
545 	    PRINT2 (("lnz0 %g lnz1 %g xnewzeros %g\n", lnz0, lnz1, xnewzeros)) ;
546 	    if (xnewzeros == 0)
547 	    {
548 		/* no new zeros, so go ahead and merge */
549 		PRINT2 (("no new fillin, so go ahead and merge\n")) ;
550 		merge = TRUE ;
551 	    }
552 	    else
553 	    {
554 		/* # of zeros if merged */
555 		double xtotzeros = ((double) totzeros) + xnewzeros ;
556 
557 		/* xtotsize: total size of merged supernode, if merged: */
558 		double xns = (double) ns ;
559 		double xtotsize  = (xns * (xns+1) / 2) + xns * (lnz1 - nscol1) ;
560 		double z = xtotzeros / xtotsize ;
561 
562 		Int totsize ;
563 		totsize  = (ns * (ns+1) / 2) + ns * (Snz [s+1] - nscol1) ;
564 
565 		PRINT2 (("oldzeros "ID" newzeros "ID" xtotsize %g z %g\n",
566 			    Zeros [s+1], newzeros, xtotsize, z)) ;
567 
568 		/* use Int for the final update of Zeros [s] below */
569 		totzeros += newzeros ;
570 
571 		/* do not merge if supernode would become too big
572 		 * (Int overflow).  Continue computing; not (yet) an error. */
573 		/* fl.pt. compare, but no NaN's can occur here */
574 		merge = ((ns <= nrelax1 && z < zrelax0) ||
575 			 (ns <= nrelax2 && z < zrelax1) ||
576 					  (z < zrelax2)) &&
577 			(xtotsize < Int_max / sizeof (double)) ;
578 
579 	    }
580 	}
581 
582 #ifdef GPU_BLAS
583 	if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU ) {
584 	  /* Ensure that the aggregated supernode fits in the device
585 	     supernode buffers */
586 	  double xns = (double) ns;
587 	  if ( ((xns * xns) + xns * (lnz1 - nscol1))*sizeof(double)*2  >=
588 	       Common->devBuffSize ) {
589 	    merge = FALSE;
590 	  }
591 	}
592 #endif
593 
594 	if (merge)
595 	{
596 	    PRINT1 (("Merge node s ("ID") and s+1 ("ID")\n", s, s+1)) ;
597 	    Zeros [s] = totzeros ;
598 	    Merged [s+1] = s ;
599 	    Snz [s] = nscol0 + Snz [s+1] ;
600 	    Nscol [s] += Nscol [s+1] ;
601 	}
602     }
603 
604     /* contents of Wj no longer needed for Zeros ] */
605     /* contents of Wi no longer needed for Nscol ] */
606     /* contents of Sparent no longer needed (recomputed below) */
607 
608     /* ---------------------------------------------------------------------- */
609     /* construct the relaxed supernode list */
610     /* ---------------------------------------------------------------------- */
611 
612     nsuper = 0 ;
613     for (s = 0 ; s < nfsuper ; s++)
614     {
615 	if (Merged [s] == EMPTY)
616 	{
617 	    PRINT1 (("live supernode: "ID" snz "ID"\n", s, Snz [s])) ;
618 	    Super [nsuper] = Super [s] ;
619 	    Snz [nsuper] = Snz [s] ;
620 	    nsuper++ ;
621 	}
622     }
623     Super [nsuper] = n ;
624     PRINT1 (("Fundamental supernodes: "ID"  relaxed "ID"\n", nfsuper, nsuper)) ;
625 
626     /* Merged no longer needed ] */
627 
628     /* ---------------------------------------------------------------------- */
629     /* find the mapping of relaxed nodes to supernodes */
630     /* ---------------------------------------------------------------------- */
631 
632     /* use Wj as workspace for SuperMap { */
633 
634     /* SuperMap [k] = s if column k is contained in supernode s */
635     for (s = 0 ; s < nsuper ; s++)
636     {
637 	for (k = Super [s] ; k < Super [s+1] ; k++)
638 	{
639 	    SuperMap [k] = s ;
640 	}
641     }
642 
643     /* ---------------------------------------------------------------------- */
644     /* construct the relaxed supernodal etree */
645     /* ---------------------------------------------------------------------- */
646 
647     for (s = 0 ; s < nsuper ; s++)
648     {
649 	j = Super [s+1] - 1 ;	/* last node in supernode s */
650 	parent = Parent [j] ;	/* parent of last node */
651 	Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
652 	PRINT1 (("new Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
653     }
654 
655     /* ---------------------------------------------------------------------- */
656     /* determine the size of L->s and L->x */
657     /* ---------------------------------------------------------------------- */
658 
659     ssize = 0 ;
660     xsize = 0 ;
661     xxsize = 0 ;
662     find_xsize = for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
663                  for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU ;
664     for (s = 0 ; s < nsuper ; s++)
665     {
666 	nscol = Super [s+1] - Super [s] ;
667 	nsrow = Snz [s] ;
668 	ASSERT (nscol > 0) ;
669 	ssize += nsrow ;
670         if (find_xsize)
671         {
672             xsize += nscol * nsrow ;
673             /* also compute xsize in double to guard against Int overflow */
674             xxsize += ((double) nscol) * ((double) nsrow) ;
675         }
676 	if (ssize < 0 ||(find_xsize && xxsize > Int_max))
677 	{
678 	    /* Int overflow, clear workspace and return.
679                QR factorization will not use xxsize, so that error is ignored.
680                For Cholesky factorization, however, memory of space xxsize
681                will be allocated, so this is a failure.  Both QR and Cholesky
682                fail if ssize overflows. */
683 	    ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
684 	    FREE_WORKSPACE ;
685 	    return (FALSE) ;
686 	}
687 	ASSERT (ssize > 0) ;
688         ASSERT (IMPLIES (find_xsize, xsize > 0)) ;
689     }
690     xsize = MAX (1, xsize) ;
691     ssize = MAX (1, ssize) ;
692     PRINT1 (("ix sizes: "ID" "ID" nsuper "ID"\n", ssize, xsize, nsuper)) ;
693 
694     /* ---------------------------------------------------------------------- */
695     /* allocate L (all except real part L->x) */
696     /* ---------------------------------------------------------------------- */
697 
698     L->ssize = ssize ;
699     L->xsize = xsize ;
700     L->nsuper = nsuper ;
701 
702     CHOLMOD(change_factor) (CHOLMOD_PATTERN, TRUE, TRUE, TRUE, TRUE, L, Common);
703 
704     if (Common->status < CHOLMOD_OK)
705     {
706 	/* out of memory; L is still a valid simplicial symbolic factor */
707 	FREE_WORKSPACE ;
708 	return (FALSE) ;
709     }
710 
711     DEBUG (CHOLMOD(dump_factor) (L, "L to symbolic super", Common)) ;
712     ASSERT (L->is_ll && L->xtype == CHOLMOD_PATTERN && L->is_super) ;
713 
714     Lpi = L->pi ;
715     Lpx = L->px ;
716     Ls = L->s ;
717     Ls [0] = 0 ;    /* flag for cholmod_check_factor; supernodes are defined */
718     Lsuper = L->super ;
719 
720     /* copy the list of relaxed supernodes into the final list in L */
721     for (s = 0 ; s <= nsuper ; s++)
722     {
723 	Lsuper [s] = Super [s] ;
724     }
725 
726     /* Head no longer needed as workspace for fundamental Super list ) */
727 
728     Super = Lsuper ;	    /* Super is now the list of relaxed supernodes */
729 
730     /* ---------------------------------------------------------------------- */
731     /* construct column pointers of relaxed supernodal pattern (L->pi) */
732     /* ---------------------------------------------------------------------- */
733 
734     p = 0 ;
735     for (s = 0 ; s < nsuper ; s++)
736     {
737 	Lpi [s] = p ;
738 	p += Snz [s] ;
739 	PRINT1 (("Snz ["ID"] = "ID", Super ["ID"] = "ID"\n",
740 		    s, Snz [s], s, Super[s])) ;
741     }
742     Lpi [nsuper] = p ;
743     ASSERT ((Int) (L->ssize) == MAX (1,p)) ;
744 
745     /* ---------------------------------------------------------------------- */
746     /* construct pointers for supernodal values (L->px) */
747     /* ---------------------------------------------------------------------- */
748 
749     if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
750         for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
751     {
752         Lpx [0] = 0 ;
753         p = 0 ;
754         for (s = 0 ; s < nsuper ; s++)
755         {
756             nscol = Super [s+1] - Super [s] ;   /* number of columns in s */
757             nsrow = Snz [s] ;           /* # of rows, incl triangular part*/
758             Lpx [s] = p ;               /* pointer to numerical part of s */
759             p += nscol * nsrow ;
760         }
761         Lpx [s] = p ;
762         ASSERT ((Int) (L->xsize) == MAX (1,p)) ;
763     }
764     else
765     {
766         /* L->px is not needed for non-GPU accelerated QR factorization (it may
767          * lead to Int overflow, anyway, if xsize caused Int overflow above).
768          * Use a magic number to tell cholmod_check_factor to ignore Lpx. */
769         Lpx [0] = 123456 ;
770     }
771 
772     /* Snz no longer needed ] */
773 
774     /* ---------------------------------------------------------------------- */
775     /* symbolic analysis to construct the relaxed supernodal pattern (L->s) */
776     /* ---------------------------------------------------------------------- */
777 
778     Lpi2 = Wi ;	    /* copy Lpi into Lpi2, using Wi as workspace for Lpi2 [ */
779     for (s = 0 ; s < nsuper ; s++)
780     {
781 	Lpi2 [s] = Lpi [s] ;
782     }
783 
784     Asorted = A->sorted ;
785 
786     for (s = 0 ; s < nsuper ; s++)
787     {
788 	/* sth supernode is in columns k1 to k2-1.
789 	 * compute nonzero pattern of L (k1:k2-1,:). */
790 
791 	/* place rows k1 to k2-1 in leading column of supernode s */
792 	k1 = Super [s] ;
793 	k2 = Super [s+1] ;
794 	PRINT1 (("=========>>> Supernode "ID" k1 "ID" k2-1 "ID"\n",
795 		    s, k1, k2-1)) ;
796 	for (k = k1 ; k < k2 ; k++)
797 	{
798 	    Ls [Lpi2 [s]++] = k ;
799 	}
800 
801 	/* compute nonzero pattern each row k1 to k2-1 */
802 	for (k = k1 ; k < k2 ; k++)
803 	{
804 	    /* compute row k of L.  In the symmetric case, the pattern of L(k,:)
805 	     * is the set of nodes reachable in the supernodal etree from any
806 	     * row i in the nonzero pattern of A(0:k,k).  In the unsymmetric
807 	     * case, the pattern of the kth column of A*A' is the set union
808 	     * of all columns A(0:k,j) for each nonzero F(j,k). */
809 
810 	    /* clear the Flag array and mark the current supernode */
811 	    /* mark = CHOLMOD(clear_flag) (Common) ; */
812 	    CHOLMOD_CLEAR_FLAG (Common) ;
813 	    mark = Common->mark ;
814 	    Flag [s] = mark ;
815 	    ASSERT (s == SuperMap [k]) ;
816 
817 	    /* traverse the row subtree for each nonzero in A or AA' */
818 	    if (stype != 0)
819 	    {
820 		subtree (k, k, Ap, Ai, Anz, SuperMap, Sparent, mark,
821                         Asorted, k1, Flag, Ls, Lpi2) ;
822 	    }
823 	    else
824 	    {
825 		/* for each j nonzero in F (:,k) do */
826 		p = Fp [k] ;
827 		pend = (packed) ? (Fp [k+1]) : (p + Fnz [k]) ;
828 		for ( ; p < pend ; p++)
829 		{
830 		    subtree (Fj [p], k, Ap, Ai, Anz, SuperMap, Sparent, mark,
831 			    Asorted, k1, Flag, Ls, Lpi2) ;
832 		}
833 	    }
834 	}
835     }
836 #ifndef NDEBUG
837     for (s = 0 ; s < nsuper ; s++)
838     {
839 	PRINT1 (("Lpi2[s] "ID" Lpi[s+1] "ID"\n", Lpi2 [s], Lpi [s+1])) ;
840 	ASSERT (Lpi2 [s] == Lpi [s+1]) ;
841 	CHOLMOD(dump_super) (s, Super, Lpi, Ls, NULL, NULL, 0, Common) ;
842     }
843 #endif
844 
845     /* contents of Wi no longer needed for Lpi2 ] */
846     /* Sparent no longer needed ] */
847 
848     /* ---------------------------------------------------------------------- */
849     /* determine the largest update matrix (L->maxcsize) */
850     /* ---------------------------------------------------------------------- */
851 
852     /* maxcsize could be determined before L->s is allocated and defined, which
853      * would mean that all memory requirements for both the symbolic and numeric
854      * factorizations could be computed using O(nnz(A)+O(n)) space.  However, it
855      * would require a lot of extra work.  The analysis phase, above, would need
856      * to be duplicated, but with Ls not kept; instead, the algorithm would keep
857      * track of the current s and slast for each supernode d, and update them
858      * when a new row index appears in supernode d.  An alternative would be to
859      * do this computation only if the allocation of L->s failed, in which case
860      * the following code would be skipped.
861      *
862      * The csize for a supernode is the size of its largest contribution to
863      * a subsequent ancestor supernode.  For example, suppose the rows of #'s
864      * in the figure below correspond to the columns of a subsequent supernode,
865      * and the dots are the entries in that ancestore.
866      *
867      *	    c
868      *	    c c
869      *	    c c c
870      *	    x x x
871      *	    x x x
872      *	    # # #   .
873      *	    # # #   . .
874      *	    * * *   . .
875      *	    * * *   . .
876      *	    * * *   . .
877      *	            . .
878      *
879      * Then for this update, the csize is 3-by-2, or 6, because there are 3
880      * rows of *'s which is the number of rows in the update, and there are
881      * 2 rows of #'s, which is the number columns in the update.  The csize
882      * of a supernode is the largest such contribution for any ancestor
883      * supernode.  maxcsize, for the whole matrix, has a rough upper bound of
884      * the maximum size of any supernode.  This bound is loose, because the
885      * the contribution must be less than the size of the ancestor supernodal
886      * that it's updating.  maxcsize of a completely dense matrix, with one
887      * supernode, is zero.
888      *
889      * maxesize is the column dimension for the workspace E needed for the
890      * solve.  E is of size nrhs-by-maxesize, where the nrhs is the number of
891      * columns in the right-hand-side.  The maxesize is the largest esize of
892      * any supernode.  The esize of a supernode is the number of row indices
893      * it contains, excluding the column indices of the supernode itself.
894      * For the following example, esize is 4:
895      *
896      *	    c
897      *	    c c
898      *	    c c c
899      *	    x x x
900      *	    x x x
901      *	    x x x
902      *	    x x x
903      *
904      * maxesize can be no bigger than n.
905      */
906 
907     maxcsize = 1 ;
908     maxesize = 1 ;
909 
910     /* Do not need to guard csize against Int overflow since xsize is OK. */
911 
912     if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
913         for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
914     {
915         /* this is not needed for non-GPU accelerated QR factorization */
916         for (d = 0 ; d < nsuper ; d++)
917         {
918             nscol = Super [d+1] - Super [d] ;
919             p = Lpi [d] + nscol ;
920             plast = p ;
921             pend = Lpi [d+1] ;
922             esize = pend - p ;
923             maxesize = MAX (maxesize, esize) ;
924             slast = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
925             for ( ; p <= pend ; p++)
926             {
927                 s = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
928                 if (s != slast)
929                 {
930                     /* row i is the start of a new supernode */
931                     ndrow1 = p - plast ;
932                     ndrow2 = pend - plast ;
933                     csize = ndrow2 * ndrow1 ;
934                     PRINT1 (("Supernode "ID" ancestor "ID" C: "ID"-by-"ID
935                         "  csize "ID"\n", d, slast, ndrow1, ndrow2, csize)) ;
936                     maxcsize = MAX (maxcsize, csize) ;
937                     plast = p ;
938                     slast = s ;
939                 }
940             }
941         }
942         PRINT1 (("max csize "ID"\n", maxcsize)) ;
943     }
944 
945     /* Wj no longer needed for SuperMap } */
946 
947     L->maxcsize = maxcsize ;
948     L->maxesize = maxesize ;
949     L->is_super = TRUE ;
950     ASSERT (L->xtype == CHOLMOD_PATTERN && L->is_ll) ;
951 
952     /* ---------------------------------------------------------------------- */
953     /* supernodal symbolic factorization is complete */
954     /* ---------------------------------------------------------------------- */
955 
956     FREE_WORKSPACE ;
957     return (TRUE) ;
958 }
959 
960 /* ========================================================================== */
961 /* === cholmod_super_symbolic =============================================== */
962 /* ========================================================================== */
963 
964 /* Analyzes A, AA', or A(:,f)*A(:,f)' in preparation for a supernodal numeric
965  * factorization.  The user need not call this directly; cholmod_analyze is
966  * a "simple" wrapper for this routine.
967  *
968  * This function does all the analysis for a supernodal Cholesky factorization.
969  *
970  * workspace: Flag (nrow), Head (nrow), Iwork (2*nrow),
971  * and temporary space of size 3*nfsuper*sizeof(Int), where nfsuper <= n
972  * is the number of fundamental supernodes.
973  */
974 
CHOLMOD(super_symbolic)975 int CHOLMOD(super_symbolic)
976 (
977     /* ---- input ---- */
978     cholmod_sparse *A,	/* matrix to analyze */
979     cholmod_sparse *F,	/* F = A' or A(:,f)' */
980     Int *Parent,	/* elimination tree */
981     /* ---- in/out --- */
982     cholmod_factor *L,	/* simplicial symbolic on input,
983 			 * supernodal symbolic on output */
984     /* --------------- */
985     cholmod_common *Common
986 )
987 {
988     return (CHOLMOD(super_symbolic2) (CHOLMOD_ANALYZE_FOR_CHOLESKY,
989         A, F, Parent, L, Common)) ;
990 }
991 #endif
992 #endif
993