1 /* ========================================================================== */
2 /* === Supernodal/cholmod_super_symbolic ==================================== */
3 /* ========================================================================== */
4
5 /* -----------------------------------------------------------------------------
6 * CHOLMOD/Supernodal Module. Copyright (C) 2005-2006, Timothy A. Davis
7 * http://www.suitesparse.com
8 * -------------------------------------------------------------------------- */
9
10 /* Supernodal symbolic analysis of the LL' factorization of A, A*A',
11 * A(:,f)*A(:,f)'.
12 *
13 * This routine must be preceded by a simplicial symbolic analysis
14 * (cholmod_rowcolcounts). See cholmod_analyze.c for an example of how to use
15 * this routine.
16 *
17 * The user need not call this directly; cholmod_analyze is a "simple" wrapper
18 * for this routine.
19 *
20 * Symmetric case:
21 *
22 * A is stored in column form, with entries stored in the upper triangular
23 * part. Entries in the lower triangular part are ignored.
24 *
25 * Unsymmetric case:
26 *
27 * A is stored in column form. If F is equal to the transpose of A, then
28 * A*A' is analyzed. F can include a subset of the columns of A
29 * (F=A(:,f)'), in which case F*F' is analyzed.
30 *
31 * Requires Parent and L->ColCount to be defined on input; these are the
32 * simplicial Parent and ColCount arrays as computed by cholmod_rowcolcounts.
33 * Does not use L->Perm; the input matrices A and F must already be properly
34 * permuted. Allocates and computes the supernodal pattern of L (L->super,
35 * L->pi, L->px, and L->s). Does not allocate the real part (L->x).
36 *
37 * Supports any xtype (pattern, real, complex, or zomplex).
38 */
39
40 #ifndef NGPL
41 #ifndef NSUPERNODAL
42
43 #include "cholmod_internal.h"
44 #include "cholmod_supernodal.h"
45
46 #ifdef GPU_BLAS
47 #include "cholmod_gpu.h"
48 #endif
49
50 /* ========================================================================== */
51 /* === subtree ============================================================== */
52 /* ========================================================================== */
53
54 /* In the symmetric case, traverse the kth row subtree from the nonzeros in
55 * A (0:k1-1,k) and add the new entries found to the pattern of the kth row
56 * of L. The current supernode s contains the diagonal block k1:k2-1, so it
57 * can be skipped.
58 *
59 * In the unsymmetric case, the nonzero pattern of A*F is computed one column
60 * at a time (thus, the total time spent in this function is bounded below by
61 * the time taken to multiply A*F, which can be high if A is tall and thin).
62 * The kth column is A*F(:,k), or the set union of all columns A(:,j) for which
63 * F(j,k) is nonzero. This routine is called once for each entry j. Only the
64 * upper triangular part is needed, so only A (0:k1-1,j) is accessed, where
65 * k1:k2-1 are the columns of the current supernode s (k is in the range k1 to
66 * k2-1).
67 *
68 * If A is sorted, then the total time taken by this function is proportional
69 * to the number of nonzeros in the strictly block upper triangular part of A,
70 * plus the number of entries in the strictly block lower triangular part of
71 * the supernodal part of L. This excludes entries in the diagonal blocks
72 * corresponding to the columns in each supernode. That is, if k1:k2-1 are
73 * in a single supernode, then only A (0:k1-1,k1:k2-1) are accessed.
74 *
75 * For the unsymmetric case, only the strictly block upper triangular part
76 * of A*F is constructed.
77 *
78 * Only adds column indices corresponding to the leading columns of each
79 * relaxed supernode.
80 */
81
subtree(Int j,Int k,Int Ap[],Int Ai[],Int Anz[],Int SuperMap[],Int Sparent[],Int mark,Int sorted,Int k1,Int Flag[],Int Ls[],Int Lpi2[])82 static void subtree
83 (
84 /* inputs, not modified: */
85 Int j, /* j = k for symmetric case */
86 Int k,
87 Int Ap [ ],
88 Int Ai [ ],
89 Int Anz [ ],
90 Int SuperMap [ ],
91 Int Sparent [ ],
92 Int mark,
93 Int sorted, /* true if the columns of A are sorted */
94 Int k1, /* only consider A (0:k1-1,k) */
95
96 /* input/output: */
97 Int Flag [ ],
98 Int Ls [ ],
99 Int Lpi2 [ ]
100 )
101 {
102 Int p, pend, i, si ;
103 p = Ap [j] ;
104 pend = (Anz == NULL) ? (Ap [j+1]) : (p + Anz [j]) ;
105
106 for ( ; p < pend ; p++)
107 {
108 i = Ai [p] ;
109 if (i < k1)
110 {
111 /* (i,k) is an entry in the upper triangular part of A or A*F'.
112 * symmetric case: A(i,k) is nonzero (j=k).
113 * unsymmetric case: A(i,j) and F(j,k) are both nonzero.
114 *
115 * Column i is in supernode si = SuperMap [i]. Follow path from si
116 * to root of supernodal etree, stopping at the first flagged
117 * supernode. The root of the row subtree is supernode SuperMap[k],
118 * which is flagged already. This traversal will stop there, or it
119 * might stop earlier if supernodes have been flagged by previous
120 * calls to this routine for the same k. */
121 for (si = SuperMap [i] ; Flag [si] < mark ; si = Sparent [si])
122 {
123 ASSERT (si <= SuperMap [k]) ;
124 Ls [Lpi2 [si]++] = k ;
125 Flag [si] = mark ;
126 }
127 }
128 else if (sorted)
129 {
130 break ;
131 }
132 }
133 }
134
135
136 /* clear workspace used by cholmod_super_symbolic */
137 #define FREE_WORKSPACE \
138 { \
139 /* CHOLMOD(clear_flag) (Common) ; */ \
140 CHOLMOD_CLEAR_FLAG (Common) ; \
141 for (k = 0 ; k <= nfsuper ; k++) \
142 { \
143 Head [k] = EMPTY ; \
144 } \
145 ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ; \
146 } \
147
148
149 /* ========================================================================== */
150 /* === cholmod_super_symbolic2 ============================================== */
151 /* ========================================================================== */
152
153 /* Analyze for supernodal Cholesky or multifrontal QR. */
154
CHOLMOD(super_symbolic2)155 int CHOLMOD(super_symbolic2)
156 (
157 /* ---- input ---- */
158 int for_whom, /* FOR_SPQR (0): for SPQR but not GPU-accelerated
159 FOR_CHOLESKY (1): for Cholesky (GPU or not)
160 FOR_SPQRGPU (2): for SPQR with GPU acceleration */
161 cholmod_sparse *A, /* matrix to analyze */
162 cholmod_sparse *F, /* F = A' or A(:,f)' */
163 Int *Parent, /* elimination tree */
164 /* ---- in/out --- */
165 cholmod_factor *L, /* simplicial symbolic on input,
166 * supernodal symbolic on output */
167 /* --------------- */
168 cholmod_common *Common
169 )
170 {
171 double zrelax0, zrelax1, zrelax2, xxsize ;
172 Int *Wi, *Wj, *Super, *Snz, *Ap, *Ai, *Flag, *Head, *Ls, *Lpi, *Lpx, *Fnz,
173 *Sparent, *Anz, *SuperMap, *Merged, *Nscol, *Zeros, *Fp, *Fj,
174 *ColCount, *Lpi2, *Lsuper, *Iwork ;
175 Int nsuper, d, n, j, k, s, mark, parent, p, pend, k1, k2, packed, nscol,
176 nsrow, ndrow1, ndrow2, stype, ssize, xsize, sparent, plast, slast,
177 csize, maxcsize, ss, nscol0, nscol1, ns, nfsuper, newzeros, totzeros,
178 merge, snext, esize, maxesize, nrelax0, nrelax1, nrelax2, Asorted ;
179 size_t w ;
180 int ok = TRUE, find_xsize ;
181 const char* env_use_gpu;
182 const char* env_max_bytes;
183 size_t max_bytes;
184 const char* env_max_fraction;
185 double max_fraction;
186
187 /* ---------------------------------------------------------------------- */
188 /* check inputs */
189 /* ---------------------------------------------------------------------- */
190
191 RETURN_IF_NULL_COMMON (FALSE) ;
192 RETURN_IF_NULL (A, FALSE) ;
193 RETURN_IF_NULL (L, FALSE) ;
194 RETURN_IF_NULL (Parent, FALSE) ;
195 RETURN_IF_XTYPE_INVALID (A, CHOLMOD_PATTERN, CHOLMOD_ZOMPLEX, FALSE) ;
196 RETURN_IF_XTYPE_INVALID (L, CHOLMOD_PATTERN, CHOLMOD_PATTERN, FALSE) ;
197 stype = A->stype ;
198 if (stype < 0)
199 {
200 /* invalid symmetry; symmetric lower form not supported */
201 ERROR (CHOLMOD_INVALID, "symmetric lower not supported") ;
202 return (FALSE) ;
203 }
204 if (stype == 0)
205 {
206 /* F must be present in the unsymmetric case */
207 RETURN_IF_NULL (F, FALSE) ;
208 }
209 if (L->is_super)
210 {
211 /* L must be a simplicial symbolic factor */
212 ERROR (CHOLMOD_INVALID, "L must be symbolic on input") ;
213 return (FALSE) ;
214 }
215 Common->status = CHOLMOD_OK ;
216
217 /* ---------------------------------------------------------------------- */
218 /* allocate workspace */
219 /* ---------------------------------------------------------------------- */
220
221 n = A->nrow ;
222
223 /* w = 5*n */
224 w = CHOLMOD(mult_size_t) (n, 5, &ok) ;
225 if (!ok)
226 {
227 ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
228 return (FALSE) ;
229 }
230
231 CHOLMOD(allocate_work) (n, w, 0, Common) ;
232 if (Common->status < CHOLMOD_OK)
233 {
234 /* out of memory */
235 return (FALSE) ;
236 }
237 ASSERT (CHOLMOD(dump_work) (TRUE, TRUE, 0, Common)) ;
238
239 /* ---------------------------------------------------------------------- */
240 /* allocate GPU workspace */
241 /* ---------------------------------------------------------------------- */
242
243 L->useGPU = 0 ; /* only used for Cholesky factorization, not QR */
244
245 #ifdef GPU_BLAS
246
247 /* GPU module is installed */
248 if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY )
249 {
250 /* only allocate GPU workspace for supernodal Cholesky, and only when
251 the GPU is requested and available. */
252
253 max_bytes = 0;
254 max_fraction = 0;
255
256 #ifdef DLONG
257 if ( Common->useGPU == EMPTY )
258 {
259 /* useGPU not explicity requested by the user, but not explicitly
260 * prohibited either. Query OS environment variables for request.*/
261 env_use_gpu = getenv("CHOLMOD_USE_GPU");
262
263 if ( env_use_gpu )
264 {
265 /* CHOLMOD_USE_GPU environment variable is set to something */
266 if ( atoi ( env_use_gpu ) == 0 )
267 {
268 Common->useGPU = 0; /* don't use the gpu */
269 }
270 else
271 {
272 Common->useGPU = 1; /* use the gpu */
273 env_max_bytes = getenv("CHOLMOD_GPU_MEM_BYTES");
274 env_max_fraction = getenv("CHOLMOD_GPU_MEM_FRACTION");
275 if ( env_max_bytes )
276 {
277 max_bytes = atol(env_max_bytes);
278 Common->maxGpuMemBytes = max_bytes;
279 }
280 if ( env_max_fraction )
281 {
282 max_fraction = atof (env_max_fraction);
283 if ( max_fraction < 0 ) max_fraction = 0;
284 if ( max_fraction > 1 ) max_fraction = 1;
285 Common->maxGpuMemFraction = max_fraction;
286 }
287 }
288 }
289 else
290 {
291 /* CHOLMOD_USE_GPU environment variable not set, so no GPU
292 * acceleration will be used */
293 Common->useGPU = 0;
294 }
295 /* fprintf (stderr, "useGPU queried: %d\n", Common->useGPU) ; */
296 }
297
298 /* Ensure that a GPU is present */
299 if ( Common->useGPU == 1 )
300 {
301 /* fprintf (stderr, "\nprobe GPU:\n") ; */
302 Common->useGPU = CHOLMOD(gpu_probe) (Common); // Cholesky only, not SPQR
303 /* fprintf (stderr, "\nprobe GPU: result %d\n", Common->useGPU) ; */
304 }
305
306 if ( Common->useGPU == 1 )
307 {
308 /* Cholesky + GPU, so allocate space */
309 /* fprintf (stderr, "allocate GPU:\n") ; */
310 CHOLMOD(gpu_allocate) ( Common ); // Cholesky only, not SPQR
311 /* fprintf (stderr, "allocate GPU done\n") ; */
312 }
313 #else
314 /* GPU acceleration is only supported for long int version */
315 Common->useGPU = 0;
316 #endif
317
318 /* Cache the fact that the symbolic factorization supports
319 * GPU acceleration */
320 L->useGPU = Common->useGPU;
321
322 }
323
324 #else
325 /* GPU module is not installed */
326 Common->useGPU = 0 ;
327 #endif
328
329 /* ---------------------------------------------------------------------- */
330 /* get inputs */
331 /* ---------------------------------------------------------------------- */
332
333 /* A is now either A or triu(A(p,p)) for the symmetric case. It is either
334 * A or A(p,f) for the unsymmetric case (both in column form). It can be
335 * either packed or unpacked, and either sorted or unsorted. Entries in
336 * the lower triangular part may be present if A is symmetric, but these
337 * are ignored. */
338
339 Ap = A->p ;
340 Ai = A->i ;
341 Anz = A->nz ;
342
343 if (stype != 0)
344 {
345 /* F not accessed */
346 Fp = NULL ;
347 Fj = NULL ;
348 Fnz = NULL ;
349 packed = TRUE ;
350 }
351 else
352 {
353 /* F = A(:,f) or A(p,f) in packed row form, either sorted or unsorted */
354 Fp = F->p ;
355 Fj = F->i ;
356 Fnz = F->nz ;
357 packed = F->packed ;
358 }
359
360 ColCount = L->ColCount ;
361
362 nrelax0 = Common->nrelax [0] ;
363 nrelax1 = Common->nrelax [1] ;
364 nrelax2 = Common->nrelax [2] ;
365
366 zrelax0 = Common->zrelax [0] ;
367 zrelax1 = Common->zrelax [1] ;
368 zrelax2 = Common->zrelax [2] ;
369
370 zrelax0 = IS_NAN (zrelax0) ? 0 : zrelax0 ;
371 zrelax1 = IS_NAN (zrelax1) ? 0 : zrelax1 ;
372 zrelax2 = IS_NAN (zrelax2) ? 0 : zrelax2 ;
373
374 ASSERT (CHOLMOD(dump_parent) (Parent, n, "Parent", Common)) ;
375
376 /* ---------------------------------------------------------------------- */
377 /* get workspace */
378 /* ---------------------------------------------------------------------- */
379
380 /* Sparent, Snz, and Merged could be allocated later, of size nfsuper */
381
382 Iwork = Common->Iwork ;
383 Wi = Iwork ; /* size n (i/l/l). Lpi2 is i/l/l */
384 Wj = Iwork + n ; /* size n (i/l/l). Zeros is i/l/l */
385 Sparent = Iwork + 2*((size_t) n) ; /* size nfsuper <= n [ */
386 Snz = Iwork + 3*((size_t) n) ; /* size nfsuper <= n [ */
387 Merged = Iwork + 4*((size_t) n) ; /* size nfsuper <= n [ */
388
389 Flag = Common->Flag ; /* size n */
390 Head = Common->Head ; /* size n+1 */
391
392 /* ---------------------------------------------------------------------- */
393 /* find the fundamental supernodes */
394 /* ---------------------------------------------------------------------- */
395
396 /* count the number of children of each node, using Wi [ */
397 for (j = 0 ; j < n ; j++)
398 {
399 Wi [j] = 0 ;
400 }
401 for (j = 0 ; j < n ; j++)
402 {
403 parent = Parent [j] ;
404 if (parent != EMPTY)
405 {
406 Wi [parent]++ ;
407 }
408 }
409
410 Super = Head ; /* use Head [0..nfsuper] as workspace for Super list ( */
411
412 /* column 0 always starts a new supernode */
413 nfsuper = (n == 0) ? 0 : 1 ; /* number of fundamental supernodes */
414 Super [0] = 0 ;
415
416 for (j = 1 ; j < n ; j++)
417 {
418 /* check if j starts new supernode, or in the same supernode as j-1 */
419 if (Parent [j-1] != j /* parent of j-1 is not j */
420 || (ColCount [j-1] != ColCount [j] + 1) /* j-1 not subset of j*/
421 || Wi [j] > 1 /* j has more than one child */
422 #ifdef GPU_BLAS
423 /* Ensure that the supernode will fit in the GPU buffers */
424 /* Data size of 16 bytes must be assumed for case of PATTERN */
425 || (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU &&
426 (j-Super[nfsuper-1]+1) *
427 ColCount[Super[nfsuper-1]] * sizeof(double) * 2 >=
428 Common->devBuffSize)
429 #endif
430 )
431 {
432 /* j is the leading node of a supernode */
433 Super [nfsuper++] = j ;
434 }
435 }
436 Super [nfsuper] = n ;
437
438 /* contents of Wi no longer needed for child count ] */
439
440 Nscol = Wi ; /* use Wi as size-nfsuper workspace for Nscol [ */
441
442 /* ---------------------------------------------------------------------- */
443 /* find the mapping of fundamental nodes to supernodes */
444 /* ---------------------------------------------------------------------- */
445
446 SuperMap = Wj ; /* use Wj as workspace for SuperMap [ */
447
448 /* SuperMap [k] = s if column k is contained in supernode s */
449 for (s = 0 ; s < nfsuper ; s++)
450 {
451 for (k = Super [s] ; k < Super [s+1] ; k++)
452 {
453 SuperMap [k] = s ;
454 }
455 }
456
457 /* ---------------------------------------------------------------------- */
458 /* construct the fundamental supernodal etree */
459 /* ---------------------------------------------------------------------- */
460
461 for (s = 0 ; s < nfsuper ; s++)
462 {
463 j = Super [s+1] - 1 ; /* last node in supernode s */
464 parent = Parent [j] ; /* parent of last node */
465 Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
466 PRINT1 (("Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
467 }
468
469 /* contents of Wj no longer needed as workspace for SuperMap ]
470 * SuperMap will be recomputed below, for the relaxed supernodes. */
471
472 Zeros = Wj ; /* use Wj for Zeros, workspace of size nfsuper [ */
473
474 /* ---------------------------------------------------------------------- */
475 /* relaxed amalgamation */
476 /* ---------------------------------------------------------------------- */
477
478 for (s = 0 ; s < nfsuper ; s++)
479 {
480 Merged [s] = EMPTY ; /* s not merged into another */
481 Nscol [s] = Super [s+1] - Super [s] ; /* # of columns in s */
482 Zeros [s] = 0 ; /* # of zero entries in s */
483 ASSERT (s <= Super [s]) ;
484 Snz [s] = ColCount [Super [s]] ; /* # of entries in leading col of s */
485 PRINT2 (("lnz ["ID"] "ID"\n", s, Snz [s])) ;
486 }
487
488 for (s = nfsuper-2 ; s >= 0 ; s--)
489 {
490 double lnz1 ;
491
492 /* should supernodes s and s+1 merge into a new node s? */
493 PRINT1 (("\n========= Check relax of s "ID" and s+1 "ID"\n", s, s+1)) ;
494
495 ss = Sparent [s] ;
496 if (ss == EMPTY)
497 {
498 PRINT1 (("s "ID" is a root, no merge with s+1 = "ID"\n", s, s+1)) ;
499 continue ;
500 }
501
502 /* find the current parent of s (perform path compression as needed) */
503 for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = Merged [ss]) ;
504 sparent = ss ;
505 PRINT2 (("Current sparent of s "ID" is "ID"\n", s, sparent)) ;
506
507 /* ss is the current parent of s */
508 for (ss = Sparent [s] ; Merged [ss] != EMPTY ; ss = snext)
509 {
510 snext = Merged [ss] ;
511 PRINT2 (("ss "ID" is dead, merged into snext "ID"\n", ss, snext)) ;
512 Merged [ss] = sparent ;
513 }
514
515 /* if s+1 is not the current parent of s, do not merge */
516 if (sparent != s+1)
517 {
518 continue ;
519 }
520
521 nscol0 = Nscol [s] ; /* # of columns in s */
522 nscol1 = Nscol [s+1] ; /* # of columns in s+1 */
523 ns = nscol0 + nscol1 ;
524 PRINT2 (("ns "ID" nscol0 "ID" nscol1 "ID"\n", ns, nscol0, nscol1)) ;
525
526 totzeros = Zeros [s+1] ; /* current # of zeros in s+1 */
527 lnz1 = (double) (Snz [s+1]) ; /* # entries in leading column of s+1 */
528
529 /* determine if supernodes s and s+1 should merge */
530 if (ns <= nrelax0)
531 {
532 PRINT2 (("ns is tiny ("ID"), so go ahead and merge\n", ns)) ;
533 merge = TRUE ;
534 }
535 else
536 {
537 /* use double to avoid integer overflow */
538 double lnz0 = Snz [s] ; /* # entries in leading column of s */
539 double xnewzeros = nscol0 * (lnz1 + nscol0 - lnz0) ;
540
541 /* use Int for the final update of Zeros [s] below */
542 newzeros = nscol0 * (Snz [s+1] + nscol0 - Snz [s]) ;
543 ASSERT (newzeros == xnewzeros) ;
544
545 PRINT2 (("lnz0 %g lnz1 %g xnewzeros %g\n", lnz0, lnz1, xnewzeros)) ;
546 if (xnewzeros == 0)
547 {
548 /* no new zeros, so go ahead and merge */
549 PRINT2 (("no new fillin, so go ahead and merge\n")) ;
550 merge = TRUE ;
551 }
552 else
553 {
554 /* # of zeros if merged */
555 double xtotzeros = ((double) totzeros) + xnewzeros ;
556
557 /* xtotsize: total size of merged supernode, if merged: */
558 double xns = (double) ns ;
559 double xtotsize = (xns * (xns+1) / 2) + xns * (lnz1 - nscol1) ;
560 double z = xtotzeros / xtotsize ;
561
562 Int totsize ;
563 totsize = (ns * (ns+1) / 2) + ns * (Snz [s+1] - nscol1) ;
564
565 PRINT2 (("oldzeros "ID" newzeros "ID" xtotsize %g z %g\n",
566 Zeros [s+1], newzeros, xtotsize, z)) ;
567
568 /* use Int for the final update of Zeros [s] below */
569 totzeros += newzeros ;
570
571 /* do not merge if supernode would become too big
572 * (Int overflow). Continue computing; not (yet) an error. */
573 /* fl.pt. compare, but no NaN's can occur here */
574 merge = ((ns <= nrelax1 && z < zrelax0) ||
575 (ns <= nrelax2 && z < zrelax1) ||
576 (z < zrelax2)) &&
577 (xtotsize < Int_max / sizeof (double)) ;
578
579 }
580 }
581
582 #ifdef GPU_BLAS
583 if ( for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY && L->useGPU ) {
584 /* Ensure that the aggregated supernode fits in the device
585 supernode buffers */
586 double xns = (double) ns;
587 if ( ((xns * xns) + xns * (lnz1 - nscol1))*sizeof(double)*2 >=
588 Common->devBuffSize ) {
589 merge = FALSE;
590 }
591 }
592 #endif
593
594 if (merge)
595 {
596 PRINT1 (("Merge node s ("ID") and s+1 ("ID")\n", s, s+1)) ;
597 Zeros [s] = totzeros ;
598 Merged [s+1] = s ;
599 Snz [s] = nscol0 + Snz [s+1] ;
600 Nscol [s] += Nscol [s+1] ;
601 }
602 }
603
604 /* contents of Wj no longer needed for Zeros ] */
605 /* contents of Wi no longer needed for Nscol ] */
606 /* contents of Sparent no longer needed (recomputed below) */
607
608 /* ---------------------------------------------------------------------- */
609 /* construct the relaxed supernode list */
610 /* ---------------------------------------------------------------------- */
611
612 nsuper = 0 ;
613 for (s = 0 ; s < nfsuper ; s++)
614 {
615 if (Merged [s] == EMPTY)
616 {
617 PRINT1 (("live supernode: "ID" snz "ID"\n", s, Snz [s])) ;
618 Super [nsuper] = Super [s] ;
619 Snz [nsuper] = Snz [s] ;
620 nsuper++ ;
621 }
622 }
623 Super [nsuper] = n ;
624 PRINT1 (("Fundamental supernodes: "ID" relaxed "ID"\n", nfsuper, nsuper)) ;
625
626 /* Merged no longer needed ] */
627
628 /* ---------------------------------------------------------------------- */
629 /* find the mapping of relaxed nodes to supernodes */
630 /* ---------------------------------------------------------------------- */
631
632 /* use Wj as workspace for SuperMap { */
633
634 /* SuperMap [k] = s if column k is contained in supernode s */
635 for (s = 0 ; s < nsuper ; s++)
636 {
637 for (k = Super [s] ; k < Super [s+1] ; k++)
638 {
639 SuperMap [k] = s ;
640 }
641 }
642
643 /* ---------------------------------------------------------------------- */
644 /* construct the relaxed supernodal etree */
645 /* ---------------------------------------------------------------------- */
646
647 for (s = 0 ; s < nsuper ; s++)
648 {
649 j = Super [s+1] - 1 ; /* last node in supernode s */
650 parent = Parent [j] ; /* parent of last node */
651 Sparent [s] = (parent == EMPTY) ? EMPTY : SuperMap [parent] ;
652 PRINT1 (("new Sparent ["ID"] = "ID"\n", s, Sparent [s])) ;
653 }
654
655 /* ---------------------------------------------------------------------- */
656 /* determine the size of L->s and L->x */
657 /* ---------------------------------------------------------------------- */
658
659 ssize = 0 ;
660 xsize = 0 ;
661 xxsize = 0 ;
662 find_xsize = for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
663 for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU ;
664 for (s = 0 ; s < nsuper ; s++)
665 {
666 nscol = Super [s+1] - Super [s] ;
667 nsrow = Snz [s] ;
668 ASSERT (nscol > 0) ;
669 ssize += nsrow ;
670 if (find_xsize)
671 {
672 xsize += nscol * nsrow ;
673 /* also compute xsize in double to guard against Int overflow */
674 xxsize += ((double) nscol) * ((double) nsrow) ;
675 }
676 if (ssize < 0 ||(find_xsize && xxsize > Int_max))
677 {
678 /* Int overflow, clear workspace and return.
679 QR factorization will not use xxsize, so that error is ignored.
680 For Cholesky factorization, however, memory of space xxsize
681 will be allocated, so this is a failure. Both QR and Cholesky
682 fail if ssize overflows. */
683 ERROR (CHOLMOD_TOO_LARGE, "problem too large") ;
684 FREE_WORKSPACE ;
685 return (FALSE) ;
686 }
687 ASSERT (ssize > 0) ;
688 ASSERT (IMPLIES (find_xsize, xsize > 0)) ;
689 }
690 xsize = MAX (1, xsize) ;
691 ssize = MAX (1, ssize) ;
692 PRINT1 (("ix sizes: "ID" "ID" nsuper "ID"\n", ssize, xsize, nsuper)) ;
693
694 /* ---------------------------------------------------------------------- */
695 /* allocate L (all except real part L->x) */
696 /* ---------------------------------------------------------------------- */
697
698 L->ssize = ssize ;
699 L->xsize = xsize ;
700 L->nsuper = nsuper ;
701
702 CHOLMOD(change_factor) (CHOLMOD_PATTERN, TRUE, TRUE, TRUE, TRUE, L, Common);
703
704 if (Common->status < CHOLMOD_OK)
705 {
706 /* out of memory; L is still a valid simplicial symbolic factor */
707 FREE_WORKSPACE ;
708 return (FALSE) ;
709 }
710
711 DEBUG (CHOLMOD(dump_factor) (L, "L to symbolic super", Common)) ;
712 ASSERT (L->is_ll && L->xtype == CHOLMOD_PATTERN && L->is_super) ;
713
714 Lpi = L->pi ;
715 Lpx = L->px ;
716 Ls = L->s ;
717 Ls [0] = 0 ; /* flag for cholmod_check_factor; supernodes are defined */
718 Lsuper = L->super ;
719
720 /* copy the list of relaxed supernodes into the final list in L */
721 for (s = 0 ; s <= nsuper ; s++)
722 {
723 Lsuper [s] = Super [s] ;
724 }
725
726 /* Head no longer needed as workspace for fundamental Super list ) */
727
728 Super = Lsuper ; /* Super is now the list of relaxed supernodes */
729
730 /* ---------------------------------------------------------------------- */
731 /* construct column pointers of relaxed supernodal pattern (L->pi) */
732 /* ---------------------------------------------------------------------- */
733
734 p = 0 ;
735 for (s = 0 ; s < nsuper ; s++)
736 {
737 Lpi [s] = p ;
738 p += Snz [s] ;
739 PRINT1 (("Snz ["ID"] = "ID", Super ["ID"] = "ID"\n",
740 s, Snz [s], s, Super[s])) ;
741 }
742 Lpi [nsuper] = p ;
743 ASSERT ((Int) (L->ssize) == MAX (1,p)) ;
744
745 /* ---------------------------------------------------------------------- */
746 /* construct pointers for supernodal values (L->px) */
747 /* ---------------------------------------------------------------------- */
748
749 if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
750 for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
751 {
752 Lpx [0] = 0 ;
753 p = 0 ;
754 for (s = 0 ; s < nsuper ; s++)
755 {
756 nscol = Super [s+1] - Super [s] ; /* number of columns in s */
757 nsrow = Snz [s] ; /* # of rows, incl triangular part*/
758 Lpx [s] = p ; /* pointer to numerical part of s */
759 p += nscol * nsrow ;
760 }
761 Lpx [s] = p ;
762 ASSERT ((Int) (L->xsize) == MAX (1,p)) ;
763 }
764 else
765 {
766 /* L->px is not needed for non-GPU accelerated QR factorization (it may
767 * lead to Int overflow, anyway, if xsize caused Int overflow above).
768 * Use a magic number to tell cholmod_check_factor to ignore Lpx. */
769 Lpx [0] = 123456 ;
770 }
771
772 /* Snz no longer needed ] */
773
774 /* ---------------------------------------------------------------------- */
775 /* symbolic analysis to construct the relaxed supernodal pattern (L->s) */
776 /* ---------------------------------------------------------------------- */
777
778 Lpi2 = Wi ; /* copy Lpi into Lpi2, using Wi as workspace for Lpi2 [ */
779 for (s = 0 ; s < nsuper ; s++)
780 {
781 Lpi2 [s] = Lpi [s] ;
782 }
783
784 Asorted = A->sorted ;
785
786 for (s = 0 ; s < nsuper ; s++)
787 {
788 /* sth supernode is in columns k1 to k2-1.
789 * compute nonzero pattern of L (k1:k2-1,:). */
790
791 /* place rows k1 to k2-1 in leading column of supernode s */
792 k1 = Super [s] ;
793 k2 = Super [s+1] ;
794 PRINT1 (("=========>>> Supernode "ID" k1 "ID" k2-1 "ID"\n",
795 s, k1, k2-1)) ;
796 for (k = k1 ; k < k2 ; k++)
797 {
798 Ls [Lpi2 [s]++] = k ;
799 }
800
801 /* compute nonzero pattern each row k1 to k2-1 */
802 for (k = k1 ; k < k2 ; k++)
803 {
804 /* compute row k of L. In the symmetric case, the pattern of L(k,:)
805 * is the set of nodes reachable in the supernodal etree from any
806 * row i in the nonzero pattern of A(0:k,k). In the unsymmetric
807 * case, the pattern of the kth column of A*A' is the set union
808 * of all columns A(0:k,j) for each nonzero F(j,k). */
809
810 /* clear the Flag array and mark the current supernode */
811 /* mark = CHOLMOD(clear_flag) (Common) ; */
812 CHOLMOD_CLEAR_FLAG (Common) ;
813 mark = Common->mark ;
814 Flag [s] = mark ;
815 ASSERT (s == SuperMap [k]) ;
816
817 /* traverse the row subtree for each nonzero in A or AA' */
818 if (stype != 0)
819 {
820 subtree (k, k, Ap, Ai, Anz, SuperMap, Sparent, mark,
821 Asorted, k1, Flag, Ls, Lpi2) ;
822 }
823 else
824 {
825 /* for each j nonzero in F (:,k) do */
826 p = Fp [k] ;
827 pend = (packed) ? (Fp [k+1]) : (p + Fnz [k]) ;
828 for ( ; p < pend ; p++)
829 {
830 subtree (Fj [p], k, Ap, Ai, Anz, SuperMap, Sparent, mark,
831 Asorted, k1, Flag, Ls, Lpi2) ;
832 }
833 }
834 }
835 }
836 #ifndef NDEBUG
837 for (s = 0 ; s < nsuper ; s++)
838 {
839 PRINT1 (("Lpi2[s] "ID" Lpi[s+1] "ID"\n", Lpi2 [s], Lpi [s+1])) ;
840 ASSERT (Lpi2 [s] == Lpi [s+1]) ;
841 CHOLMOD(dump_super) (s, Super, Lpi, Ls, NULL, NULL, 0, Common) ;
842 }
843 #endif
844
845 /* contents of Wi no longer needed for Lpi2 ] */
846 /* Sparent no longer needed ] */
847
848 /* ---------------------------------------------------------------------- */
849 /* determine the largest update matrix (L->maxcsize) */
850 /* ---------------------------------------------------------------------- */
851
852 /* maxcsize could be determined before L->s is allocated and defined, which
853 * would mean that all memory requirements for both the symbolic and numeric
854 * factorizations could be computed using O(nnz(A)+O(n)) space. However, it
855 * would require a lot of extra work. The analysis phase, above, would need
856 * to be duplicated, but with Ls not kept; instead, the algorithm would keep
857 * track of the current s and slast for each supernode d, and update them
858 * when a new row index appears in supernode d. An alternative would be to
859 * do this computation only if the allocation of L->s failed, in which case
860 * the following code would be skipped.
861 *
862 * The csize for a supernode is the size of its largest contribution to
863 * a subsequent ancestor supernode. For example, suppose the rows of #'s
864 * in the figure below correspond to the columns of a subsequent supernode,
865 * and the dots are the entries in that ancestore.
866 *
867 * c
868 * c c
869 * c c c
870 * x x x
871 * x x x
872 * # # # .
873 * # # # . .
874 * * * * . .
875 * * * * . .
876 * * * * . .
877 * . .
878 *
879 * Then for this update, the csize is 3-by-2, or 6, because there are 3
880 * rows of *'s which is the number of rows in the update, and there are
881 * 2 rows of #'s, which is the number columns in the update. The csize
882 * of a supernode is the largest such contribution for any ancestor
883 * supernode. maxcsize, for the whole matrix, has a rough upper bound of
884 * the maximum size of any supernode. This bound is loose, because the
885 * the contribution must be less than the size of the ancestor supernodal
886 * that it's updating. maxcsize of a completely dense matrix, with one
887 * supernode, is zero.
888 *
889 * maxesize is the column dimension for the workspace E needed for the
890 * solve. E is of size nrhs-by-maxesize, where the nrhs is the number of
891 * columns in the right-hand-side. The maxesize is the largest esize of
892 * any supernode. The esize of a supernode is the number of row indices
893 * it contains, excluding the column indices of the supernode itself.
894 * For the following example, esize is 4:
895 *
896 * c
897 * c c
898 * c c c
899 * x x x
900 * x x x
901 * x x x
902 * x x x
903 *
904 * maxesize can be no bigger than n.
905 */
906
907 maxcsize = 1 ;
908 maxesize = 1 ;
909
910 /* Do not need to guard csize against Int overflow since xsize is OK. */
911
912 if (for_whom == CHOLMOD_ANALYZE_FOR_CHOLESKY ||
913 for_whom == CHOLMOD_ANALYZE_FOR_SPQRGPU)
914 {
915 /* this is not needed for non-GPU accelerated QR factorization */
916 for (d = 0 ; d < nsuper ; d++)
917 {
918 nscol = Super [d+1] - Super [d] ;
919 p = Lpi [d] + nscol ;
920 plast = p ;
921 pend = Lpi [d+1] ;
922 esize = pend - p ;
923 maxesize = MAX (maxesize, esize) ;
924 slast = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
925 for ( ; p <= pend ; p++)
926 {
927 s = (p == pend) ? (EMPTY) : (SuperMap [Ls [p]]) ;
928 if (s != slast)
929 {
930 /* row i is the start of a new supernode */
931 ndrow1 = p - plast ;
932 ndrow2 = pend - plast ;
933 csize = ndrow2 * ndrow1 ;
934 PRINT1 (("Supernode "ID" ancestor "ID" C: "ID"-by-"ID
935 " csize "ID"\n", d, slast, ndrow1, ndrow2, csize)) ;
936 maxcsize = MAX (maxcsize, csize) ;
937 plast = p ;
938 slast = s ;
939 }
940 }
941 }
942 PRINT1 (("max csize "ID"\n", maxcsize)) ;
943 }
944
945 /* Wj no longer needed for SuperMap } */
946
947 L->maxcsize = maxcsize ;
948 L->maxesize = maxesize ;
949 L->is_super = TRUE ;
950 ASSERT (L->xtype == CHOLMOD_PATTERN && L->is_ll) ;
951
952 /* ---------------------------------------------------------------------- */
953 /* supernodal symbolic factorization is complete */
954 /* ---------------------------------------------------------------------- */
955
956 FREE_WORKSPACE ;
957 return (TRUE) ;
958 }
959
960 /* ========================================================================== */
961 /* === cholmod_super_symbolic =============================================== */
962 /* ========================================================================== */
963
964 /* Analyzes A, AA', or A(:,f)*A(:,f)' in preparation for a supernodal numeric
965 * factorization. The user need not call this directly; cholmod_analyze is
966 * a "simple" wrapper for this routine.
967 *
968 * This function does all the analysis for a supernodal Cholesky factorization.
969 *
970 * workspace: Flag (nrow), Head (nrow), Iwork (2*nrow),
971 * and temporary space of size 3*nfsuper*sizeof(Int), where nfsuper <= n
972 * is the number of fundamental supernodes.
973 */
974
CHOLMOD(super_symbolic)975 int CHOLMOD(super_symbolic)
976 (
977 /* ---- input ---- */
978 cholmod_sparse *A, /* matrix to analyze */
979 cholmod_sparse *F, /* F = A' or A(:,f)' */
980 Int *Parent, /* elimination tree */
981 /* ---- in/out --- */
982 cholmod_factor *L, /* simplicial symbolic on input,
983 * supernodal symbolic on output */
984 /* --------------- */
985 cholmod_common *Common
986 )
987 {
988 return (CHOLMOD(super_symbolic2) (CHOLMOD_ANALYZE_FOR_CHOLESKY,
989 A, F, Parent, L, Common)) ;
990 }
991 #endif
992 #endif
993