1 /* ========================================================================== */
2 /* === Supernodal/t_cholmod_super_numeric =================================== */
3 /* ========================================================================== */
4
5 /* -----------------------------------------------------------------------------
6 * CHOLMOD/Supernodal Module. Copyright (C) 2005-2012, Timothy A. Davis
7 * http://www.suitesparse.com
8 * -------------------------------------------------------------------------- */
9
10 /* Template routine for cholmod_super_numeric. All xtypes supported, except
11 * that a zomplex A and F result in a complex L (there is no supernodal
12 * zomplex L).
13 */
14
15 /* ========================================================================== */
16 /* === complex arithmetic =================================================== */
17 /* ========================================================================== */
18
19 #include "cholmod_template.h"
20
21 #undef L_ENTRY
22 #undef L_CLEAR
23 #undef L_ASSIGN
24 #undef L_MULTADD
25 #undef L_ASSEMBLE
26 #undef L_ASSEMBLESUB
27
28 #ifdef REAL
29
30 /* -------------------------------------------------------------------------- */
31 /* A, F, and L are all real */
32 /* -------------------------------------------------------------------------- */
33
34 #define L_ENTRY 1
35 #define L_CLEAR(Lx,p) Lx [p] = 0
36 #define L_ASSIGN(Lx,q, Ax,Az,p) Lx [q] = Ax [p]
37 #define L_MULTADD(Lx,q, Ax,Az,p, f) Lx [q] += Ax [p] * f [0]
38 #define L_ASSEMBLE(Lx,q,b) Lx [q] += b [0]
39 #define L_ASSEMBLESUB(Lx,q,C,p) Lx [q] -= C [p]
40
41 #else
42
43 /* -------------------------------------------------------------------------- */
44 /* A and F are complex or zomplex, L and C are complex */
45 /* -------------------------------------------------------------------------- */
46
47 #define L_ENTRY 2
48 #define L_CLEAR(Lx,p) Lx [2*(p)] = 0 ; Lx [2*(p)+1] = 0
49 #define L_ASSEMBLE(Lx,q,b) Lx [2*(q)] += b [0] ;
50 #define L_ASSEMBLESUB(Lx,q,C,p) \
51 Lx [2*(q) ] -= C [2*(p) ] ; \
52 Lx [2*(q)+1] -= C [2*(p)+1] ;
53
54 #ifdef COMPLEX
55
56 /* -------------------------------------------------------------------------- */
57 /* A, F, L, and C are all complex */
58 /* -------------------------------------------------------------------------- */
59
60 #define L_ASSIGN(Lx,q, Ax,Az,p) \
61 Lx [2*(q) ] = Ax [2*(p) ] ; \
62 Lx [2*(q)+1] = Ax [2*(p)+1]
63
64 #define L_MULTADD(Lx,q, Ax,Az,p, f) \
65 Lx [2*(q) ] += Ax [2*(p) ] * f [0] - Ax [2*(p)+1] * f [1] ; \
66 Lx [2*(q)+1] += Ax [2*(p)+1] * f [0] + Ax [2*(p) ] * f [1]
67
68 #else
69
70 /* -------------------------------------------------------------------------- */
71 /* A and F are zomplex, L and C is complex */
72 /* -------------------------------------------------------------------------- */
73
74 #define L_ASSIGN(Lx,q, Ax,Az,p) \
75 Lx [2*(q) ] = Ax [p] ; \
76 Lx [2*(q)+1] = Az [p] ;
77
78 #define L_MULTADD(Lx,q, Ax,Az,p, f) \
79 Lx [2*(q) ] += Ax [p] * f [0] - Az [p] * f [1] ; \
80 Lx [2*(q)+1] += Az [p] * f [0] + Ax [p] * f [1]
81
82 #endif
83 #endif
84
85
86 /* ========================================================================== */
87 /* === t_cholmod_super_numeric ============================================== */
88 /* ========================================================================== */
89
90 /* This function returns FALSE only if integer overflow occurs in the BLAS.
91 * It returns TRUE otherwise whether or not the matrix is positive definite. */
92
TEMPLATE(cholmod_super_numeric)93 static int TEMPLATE (cholmod_super_numeric)
94 (
95 /* ---- input ---- */
96 cholmod_sparse *A, /* matrix to factorize */
97 cholmod_sparse *F, /* F = A' or A(:,f)' */
98 double beta [2], /* beta*I is added to diagonal of matrix to factorize */
99 /* ---- in/out --- */
100 cholmod_factor *L, /* factorization */
101 /* -- workspace -- */
102 cholmod_dense *Cwork, /* size (L->maxcsize)-by-1 */
103 /* --------------- */
104 cholmod_common *Common
105 )
106 {
107 double one [2], zero [2], tstart ;
108 double *Lx, *Ax, *Fx, *Az, *Fz, *C ;
109 Int *Super, *Head, *Ls, *Lpi, *Lpx, *Map, *SuperMap, *RelativeMap, *Next,
110 *Lpos, *Fp, *Fi, *Fnz, *Ap, *Ai, *Anz, *Iwork, *Next_save, *Lpos_save,
111 *Previous;
112 Int nsuper, n, j, i, k, s, p, pend, k1, k2, nscol, psi, psx, psend, nsrow,
113 pj, d, kd1, kd2, info, ndcol, ndrow, pdi, pdx, pdend, pdi1, pdi2, pdx1,
114 ndrow1, ndrow2, px, dancestor, sparent, dnext, nsrow2, ndrow3, pk, pf,
115 pfend, stype, Apacked, Fpacked, q, imap, repeat_supernode, nscol2, ss,
116 tail, nscol_new = 0;
117
118 /* ---------------------------------------------------------------------- */
119 /* declarations for the GPU */
120 /* ---------------------------------------------------------------------- */
121
122 /* these variables are not used if the GPU module is not installed */
123
124 #ifdef GPU_BLAS
125 Int ndescendants, mapCreatedOnGpu, supernodeUsedGPU,
126 idescendant, dlarge, dsmall, skips ;
127 int iHostBuff, iDevBuff, useGPU, GPUavailable ;
128 cholmod_gpu_pointers *gpu_p, gpu_pointer_struct ;
129 gpu_p = &gpu_pointer_struct ;
130 #endif
131
132 /* ---------------------------------------------------------------------- */
133 /* guard against integer overflow in the BLAS */
134 /* ---------------------------------------------------------------------- */
135
136 /* If integer overflow occurs in the BLAS, Common->status is set to
137 * CHOLMOD_TOO_LARGE, and the contents of Lx are undefined. */
138 Common->blas_ok = TRUE ;
139
140 /* ---------------------------------------------------------------------- */
141 /* get inputs */
142 /* ---------------------------------------------------------------------- */
143
144 nsuper = L->nsuper ;
145 n = L->n ;
146
147 C = Cwork->x ; /* workspace of size L->maxcsize */
148
149 one [0] = 1.0 ; /* ALPHA for *syrk, *herk, *gemm, and *trsm */
150 one [1] = 0. ;
151 zero [0] = 0. ; /* BETA for *syrk, *herk, and *gemm */
152 zero [1] = 0. ;
153
154 /* Iwork must be of size 2n + 5*nsuper, allocated in the caller,
155 * cholmod_super_numeric. The memory cannot be allocated here because the
156 * cholmod_super_numeric initializes SuperMap, and cholmod_allocate_work
157 * does not preserve existing workspace if the space needs to be increase
158 * in size. */
159
160 /* allocate integer workspace */
161 Iwork = Common->Iwork ;
162 SuperMap = Iwork ; /* size n (i/i/l) */
163 RelativeMap = Iwork + n ; /* size n (i/i/l) */
164 Next = Iwork + 2*((size_t) n) ; /* size nsuper*/
165 Lpos = Iwork + 2*((size_t) n) + nsuper ; /* size nsuper*/
166 Next_save = Iwork + 2*((size_t) n) + 2*((size_t) nsuper) ;/* size nsuper*/
167 Lpos_save = Iwork + 2*((size_t) n) + 3*((size_t) nsuper) ;/* size nsuper*/
168 Previous = Iwork + 2*((size_t) n) + 4*((size_t) nsuper) ;/* size nsuper*/
169
170 Map = Common->Flag ; /* size n, use Flag as workspace for Map array */
171 Head = Common->Head ; /* size n+1, only Head [0..nsuper-1] used */
172
173 Ls = L->s ;
174 Lpi = L->pi ;
175 Lpx = L->px ;
176
177 Super = L->super ;
178
179 Lx = L->x ;
180
181 #ifdef GPU_BLAS
182 /* local copy of useGPU */
183 if ( (Common->useGPU == 1) && L->useGPU)
184 {
185 /* Initialize the GPU. If not found, don't use it. */
186 useGPU = TEMPLATE2 (CHOLMOD (gpu_init))
187 (C, L, Common, nsuper, n, Lpi[nsuper]-Lpi[0], gpu_p) ;
188 }
189 else
190 {
191 useGPU = 0;
192 }
193 /* fprintf (stderr, "local useGPU %d\n", useGPU) ; */
194 #endif
195
196 #ifndef NTIMER
197 /* clear GPU / CPU statistics */
198 Common->CHOLMOD_CPU_GEMM_CALLS = 0 ;
199 Common->CHOLMOD_CPU_SYRK_CALLS = 0 ;
200 Common->CHOLMOD_CPU_TRSM_CALLS = 0 ;
201 Common->CHOLMOD_CPU_POTRF_CALLS = 0 ;
202 Common->CHOLMOD_GPU_GEMM_CALLS = 0 ;
203 Common->CHOLMOD_GPU_SYRK_CALLS = 0 ;
204 Common->CHOLMOD_GPU_TRSM_CALLS = 0 ;
205 Common->CHOLMOD_GPU_POTRF_CALLS = 0 ;
206 Common->CHOLMOD_CPU_GEMM_TIME = 0 ;
207 Common->CHOLMOD_CPU_SYRK_TIME = 0 ;
208 Common->CHOLMOD_CPU_TRSM_TIME = 0 ;
209 Common->CHOLMOD_CPU_POTRF_TIME = 0 ;
210 Common->CHOLMOD_GPU_GEMM_TIME = 0 ;
211 Common->CHOLMOD_GPU_SYRK_TIME = 0 ;
212 Common->CHOLMOD_GPU_TRSM_TIME = 0 ;
213 Common->CHOLMOD_GPU_POTRF_TIME = 0 ;
214 Common->CHOLMOD_ASSEMBLE_TIME = 0 ;
215 Common->CHOLMOD_ASSEMBLE_TIME2 = 0 ;
216 #endif
217
218 stype = A->stype ;
219
220 if (stype != 0)
221 {
222 /* F not accessed */
223 Fp = NULL ;
224 Fi = NULL ;
225 Fx = NULL ;
226 Fz = NULL ;
227 Fnz = NULL ;
228 Fpacked = TRUE ;
229 }
230 else
231 {
232 Fp = F->p ;
233 Fi = F->i ;
234 Fx = F->x ;
235 Fz = F->z ;
236 Fnz = F->nz ;
237 Fpacked = F->packed ;
238 }
239
240 Ap = A->p ;
241 Ai = A->i ;
242 Ax = A->x ;
243 Az = A->z ;
244 Anz = A->nz ;
245 Apacked = A->packed ;
246
247 /* clear the Map so that changes in the pattern of A can be detected */
248
249 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
250 if ( n > 128 ) schedule (static)
251
252 for (i = 0 ; i < n ; i++)
253 {
254 Map [i] = EMPTY ;
255 }
256
257 /* If the matrix is not positive definite, the supernode s containing the
258 * first zero or negative diagonal entry of L is repeated (but factorized
259 * only up to just before the problematic diagonal entry). The purpose is
260 * to provide MATLAB with [R,p]=chol(A); columns 1 to p-1 of L=R' are
261 * required, where L(p,p) is the problematic diagonal entry. The
262 * repeat_supernode flag tells us whether this is the repeated supernode.
263 * Once supernode s is repeated, the factorization is terminated. */
264 repeat_supernode = FALSE ;
265
266 #ifdef GPU_BLAS
267 if ( useGPU )
268 {
269 /* Case of GPU, zero all supernodes at one time for better performance*/
270 TEMPLATE2 (CHOLMOD (gpu_clear_memory))(Lx, L->xsize,
271 CHOLMOD_OMP_NUM_THREADS);
272 }
273 #endif
274
275 /* ---------------------------------------------------------------------- */
276 /* supernodal numerical factorization */
277 /* ---------------------------------------------------------------------- */
278
279 for (s = 0 ; s < nsuper ; s++)
280 {
281
282 /* ------------------------------------------------------------------ */
283 /* get the size of supernode s */
284 /* ------------------------------------------------------------------ */
285
286 k1 = Super [s] ; /* s contains columns k1 to k2-1 of L */
287 k2 = Super [s+1] ;
288 nscol = k2 - k1 ; /* # of columns in all of s */
289 psi = Lpi [s] ; /* pointer to first row of s in Ls */
290 psx = Lpx [s] ; /* pointer to first row of s in Lx */
291 psend = Lpi [s+1] ; /* pointer just past last row of s in Ls */
292 nsrow = psend - psi ; /* # of rows in all of s */
293
294 PRINT1 (("====================================================\n"
295 "S "ID" k1 "ID" k2 "ID" nsrow "ID" nscol "ID" psi "ID" psend "
296 ""ID" psx "ID"\n", s, k1, k2, nsrow, nscol, psi, psend, psx)) ;
297 /* ------------------------------------------------------------------ */
298 /* zero the supernode s */
299 /* ------------------------------------------------------------------ */
300
301 ASSERT ((size_t) (psx + nsrow*nscol) <= L->xsize) ;
302
303 pend = psx + nsrow * nscol ; /* s is nsrow-by-nscol */
304
305 #ifdef GPU_BLAS
306 if ( !useGPU )
307 #endif
308 {
309 /* Case of no GPU, zero individual supernodes */
310
311 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
312 schedule (static) if ( pend - psx > 1024 )
313
314 for (p = psx ; p < pend ; p++) {
315 L_CLEAR (Lx,p);
316 }
317 }
318
319 /* ------------------------------------------------------------------ */
320 /* construct the scattered Map for supernode s */
321 /* ------------------------------------------------------------------ */
322
323 /* If row i is the kth row in s, then Map [i] = k. Similarly, if
324 * column j is the kth column in s, then Map [j] = k. */
325
326 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
327 if ( nsrow > 128 )
328
329 for (k = 0 ; k < nsrow ; k++)
330 {
331 PRINT1 ((" "ID" map "ID"\n", Ls [psi+k], k)) ;
332 Map [Ls [psi + k]] = k ;
333 }
334
335 /* ------------------------------------------------------------------ */
336 /* when using GPU, reorder supernodes by levels.*/
337 /* (all supernodes in a level are independent) */
338 /* ------------------------------------------------------------------ */
339
340 #ifdef GPU_BLAS
341 if ( useGPU )
342 {
343 TEMPLATE2 (CHOLMOD (gpu_reorder_descendants))
344 ( Common, Super, &s, Lpi, Lpos, Head, Next, Previous,
345 &ndescendants, &tail, &mapCreatedOnGpu, gpu_p ) ;
346 }
347 #endif
348
349 /* ------------------------------------------------------------------ */
350 /* copy matrix into supernode s (lower triangular part only) */
351 /* ------------------------------------------------------------------ */
352
353 pk = psx ;
354
355 #pragma omp parallel for private ( p, pend, pfend, pf, i, j, imap, q ) \
356 num_threads(CHOLMOD_OMP_NUM_THREADS) if ( k2-k1 > 64 )
357
358 for (k = k1 ; k < k2 ; k++)
359 {
360 if (stype != 0)
361 {
362 /* copy the kth column of A into the supernode */
363 p = Ap [k] ;
364 pend = (Apacked) ? (Ap [k+1]) : (p + Anz [k]) ;
365 for ( ; p < pend ; p++)
366 {
367 /* row i of L is located in row Map [i] of s */
368 i = Ai [p] ;
369 if (i >= k)
370 {
371 /* This test is here simply to avoid a segfault. If
372 * the test is false, the numeric factorization of A
373 * is undefined. It does not detect all invalid
374 * entries, only some of them (when debugging is
375 * enabled, and Map is cleared after each step, then
376 * all entries not in the pattern of L are detected). */
377 imap = Map [i] ;
378 if (imap >= 0 && imap < nsrow)
379 {
380 /* Lx [Map [i] + pk] = Ax [p] ; */
381 L_ASSIGN (Lx,(imap+(psx+(k-k1)*nsrow)), Ax,Az,p) ;
382 }
383 }
384 }
385 }
386 else
387 {
388 double fjk[2];
389 /* copy the kth column of A*F into the supernode */
390 pf = Fp [k] ;
391 pfend = (Fpacked) ? (Fp [k+1]) : (p + Fnz [k]) ;
392 for ( ; pf < pfend ; pf++)
393 {
394 j = Fi [pf] ;
395
396 /* fjk = Fx [pf] ; */
397 L_ASSIGN (fjk,0, Fx,Fz,pf) ;
398
399 p = Ap [j] ;
400 pend = (Apacked) ? (Ap [j+1]) : (p + Anz [j]) ;
401 for ( ; p < pend ; p++)
402 {
403 i = Ai [p] ;
404 if (i >= k)
405 {
406 /* See the discussion of imap above. */
407 imap = Map [i] ;
408 if (imap >= 0 && imap < nsrow)
409 {
410 /* Lx [Map [i] + pk] += Ax [p] * fjk ; */
411 L_MULTADD (Lx,(imap+(psx+(k-k1)*nsrow)),
412 Ax,Az,p, fjk) ;
413 }
414 }
415 }
416 }
417 }
418 }
419
420 /* add beta to the diagonal of the supernode, if nonzero */
421 if (beta [0] != 0.0)
422 {
423 /* note that only the real part of beta is used */
424 pk = psx ;
425 for (k = k1 ; k < k2 ; k++)
426 {
427 /* Lx [pk] += beta [0] ; */
428 L_ASSEMBLE (Lx,pk, beta) ;
429 pk += nsrow + 1 ; /* advance to the next diagonal entry */
430 }
431 }
432
433 PRINT1 (("Supernode with just A: repeat: "ID"\n", repeat_supernode)) ;
434 DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
435 Common)) ;
436 PRINT1 (("\n\n")) ;
437
438 /* ------------------------------------------------------------------ */
439 /* save/restore the list of supernodes */
440 /* ------------------------------------------------------------------ */
441
442 if (!repeat_supernode)
443 {
444 /* Save the list of pending descendants in case s is not positive
445 * definite. Also save Lpos for each descendant d, so that we can
446 * find which part of d is used to update s. */
447 for (d = Head [s] ; d != EMPTY ; d = Next [d])
448 {
449 Lpos_save [d] = Lpos [d] ;
450 Next_save [d] = Next [d] ;
451 }
452 }
453 else
454 {
455 for (d = Head [s] ; d != EMPTY ; d = Next [d])
456 {
457 Lpos [d] = Lpos_save [d] ;
458 Next [d] = Next_save [d] ;
459 }
460 }
461
462 /* ------------------------------------------------------------------ */
463 /* update supernode s with each pending descendant d */
464 /* ------------------------------------------------------------------ */
465
466 #ifndef NDEBUG
467 for (d = Head [s] ; d != EMPTY ; d = Next [d])
468 {
469 PRINT1 (("\nWill update "ID" with Child: "ID"\n", s, d)) ;
470 DEBUG (CHOLMOD(dump_super) (d, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
471 Common)) ;
472 }
473 PRINT1 (("\nNow factorizing supernode "ID":\n", s)) ;
474 #endif
475
476 #ifdef GPU_BLAS
477 /* initialize the buffer counter */
478 if ( useGPU ) {
479 Common->ibuffer = 0;
480 supernodeUsedGPU = 0;
481 idescendant = 0;
482 d = Head[s];
483 dnext = d;
484 dlarge = Next[d];
485 dsmall = tail;
486 GPUavailable = 1;
487 skips = 0;
488 }
489 else
490 {
491 dnext = Head[s];
492 }
493 #else
494 /* GPU module not installed */
495 dnext = Head[s];
496 #endif
497
498 while
499
500 #ifdef GPU_BLAS
501 ( (!useGPU && (dnext != EMPTY))
502 || (useGPU && (idescendant < ndescendants)))
503 #else
504
505 ( dnext != EMPTY )
506 #endif
507 {
508
509 #ifdef GPU_BLAS
510
511 if ( useGPU ) {
512
513 /* Conditionally select the next descendant supernode to
514 * assemble.
515 * + first, select the largest descendant
516 * + subsequently, if gpu host buffers are available, select
517 * the largest remaining descendant for assembly on the GPU
518 * + otherwise select the smallest remaining descendant for
519 * assembly on the CPU
520 *
521 * The objective is to keep the GPU busy assembling the largest
522 * descendants, and simultaneously keep the CPU busy assembling
523 * the smallest descendants.
524 *
525 * As this is called for every descendent supernode, moving
526 * this code to t_cholmod_gpu incurs substantial overhead -
527 * ~20 GF/s on audikw_1 - so it is being left here.
528 */
529
530 iHostBuff =
531 (Common->ibuffer) % CHOLMOD_HOST_SUPERNODE_BUFFERS;
532 cudaError_t cuErr;
533
534 if ( idescendant > 0 ) {
535 if ( GPUavailable == -1 || skips > 0) {
536 d = dsmall;
537 dsmall = Previous[dsmall];
538 skips--;
539 }
540 else {
541 cuErr = cudaEventQuery
542 ( Common->updateCBuffersFree[iHostBuff] );
543 if ( cuErr == cudaSuccess ) {
544 /* buffers are available, so assemble a large
545 * descendant (anticipating that this will be
546 * assembled on the GPU) */
547 d = dlarge;
548 dlarge = Next[dlarge];
549 GPUavailable = 1;
550 skips = 0;
551 }
552 else {
553 /* buffers are not available, so the GPU is busy,
554 * so assemble a small descendant (anticipating
555 * that it will be assembled on the host) */
556 d = dsmall;
557 dsmall = Previous[dsmall];
558 GPUavailable = 0;
559
560 /* if the GPUs are busy, then do this many
561 * supernodes on the CPU before querying GPUs
562 * again. */
563 skips = CHOLMOD_GPU_SKIP;
564 }
565 }
566 }
567
568 idescendant++;
569
570 }
571 else
572 {
573 d = dnext;
574 }
575 #else
576 /* GPU module not installed at compile time */
577 d = dnext ;
578 #endif
579 /* -------------------------------------------------------------- */
580 /* get the size of supernode d */
581 /* -------------------------------------------------------------- */
582
583 kd1 = Super [d] ; /* d contains cols kd1 to kd2-1 of L */
584 kd2 = Super [d+1] ;
585 ndcol = kd2 - kd1 ; /* # of columns in all of d */
586 pdi = Lpi [d] ; /* pointer to first row of d in Ls */
587 pdx = Lpx [d] ; /* pointer to first row of d in Lx */
588 pdend = Lpi [d+1] ; /* pointer just past last row of d in Ls */
589 ndrow = pdend - pdi ; /* # rows in all of d */
590
591 PRINT1 (("Child: ")) ;
592 DEBUG (CHOLMOD(dump_super) (d, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
593 Common)) ;
594
595 /* -------------------------------------------------------------- */
596 /* find the range of rows of d that affect rows k1 to k2-1 of s */
597 /* -------------------------------------------------------------- */
598
599 p = Lpos [d] ; /* offset of 1st row of d affecting s */
600 pdi1 = pdi + p ; /* ptr to 1st row of d affecting s in Ls */
601 pdx1 = pdx + p ; /* ptr to 1st row of d affecting s in Lx */
602
603 /* there must be at least one row remaining in d to update s */
604 ASSERT (pdi1 < pdend) ;
605 PRINT1 (("Lpos[d] "ID" pdi1 "ID" Ls[pdi1] "ID"\n",
606 Lpos[d], pdi1, Ls [pdi1])) ;
607 ASSERT (Ls [pdi1] >= k1 && Ls [pdi1] < k2) ;
608
609 for (pdi2 = pdi1 ; pdi2 < pdend && Ls [pdi2] < k2 ; pdi2++) ;
610 ndrow1 = pdi2 - pdi1 ; /* # rows in first part of d */
611 ndrow2 = pdend - pdi1 ; /* # rows in remaining d */
612
613 /* rows Ls [pdi1 ... pdi2-1] are in the range k1 to k2-1. Since d
614 * affects s, this set cannot be empty. */
615 ASSERT (pdi1 < pdi2 && pdi2 <= pdend) ;
616 PRINT1 (("ndrow1 "ID" ndrow2 "ID"\n", ndrow1, ndrow2)) ;
617 DEBUG (for (p = pdi1 ; p < pdi2 ; p++)
618 PRINT1 (("Ls["ID"] "ID"\n", p, Ls[p]))) ;
619
620 /* -------------------------------------------------------------- */
621 /* construct the update matrix C for this supernode d */
622 /* -------------------------------------------------------------- */
623
624 /* C = L (k1:n-1, kd1:kd2-1) * L (k1:k2-1, kd1:kd2-1)', except
625 * that k1:n-1 refers to all of the rows in L, but many of the
626 * rows are all zero. Supernode d holds columns kd1 to kd2-1 of L.
627 * Nonzero rows in the range k1:k2-1 are in the list
628 * Ls [pdi1 ... pdi2-1], of size ndrow1. Nonzero rows in the range
629 * k2:n-1 are in the list Ls [pdi2 ... pdend], of size ndrow2. Let
630 * L1 = L (Ls [pdi1 ... pdi2-1], kd1:kd2-1), and let
631 * L2 = L (Ls [pdi2 ... pdend], kd1:kd2-1). C is ndrow2-by-ndrow1.
632 * Let C1 be the first ndrow1 rows of C and let C2 be the last
633 * ndrow2-ndrow1 rows of C. Only the lower triangular part of C1
634 * needs to be computed since C1 is symmetric.
635 */
636
637 /* maxcsize is the largest size of C for all pairs (d,s) */
638 ASSERT (ndrow2 * ndrow1 <= ((Int) L->maxcsize)) ;
639
640 /* compute leading ndrow1-by-ndrow1 lower triangular block of C,
641 * C1 = L1*L1' */
642
643 ndrow3 = ndrow2 - ndrow1 ; /* number of rows of C2 */
644 ASSERT (ndrow3 >= 0) ;
645
646
647 #ifdef GPU_BLAS
648 if ( useGPU ) {
649 /* set up GPU to assemble new supernode */
650 if ( GPUavailable == 1) {
651 if ( ndrow2 * L_ENTRY >= CHOLMOD_ND_ROW_LIMIT &&
652 ndcol * L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) {
653 if ( ! mapCreatedOnGpu ) {
654 TEMPLATE2 ( CHOLMOD (gpu_initialize_supernode))
655 ( Common, nscol, nsrow, psi, gpu_p );
656 mapCreatedOnGpu = 1;
657 }
658 }
659 else {
660 /* we've reached the limit of GPU-eligible descendants
661 * flag to stop stop performing cudaEventQueries */
662 GPUavailable = -1;
663 }
664 }
665 }
666 #endif
667
668 #ifdef GPU_BLAS
669 if ( !useGPU
670 || GPUavailable!=1
671 || !TEMPLATE2 (CHOLMOD (gpu_updateC)) (ndrow1, ndrow2, ndrow,
672 ndcol, nsrow, pdx1, pdi1, Lx, C, Common, gpu_p))
673 #endif
674 {
675 /* GPU not installed, or not used */
676 #ifndef NTIMER
677
678 Common->CHOLMOD_CPU_SYRK_CALLS++ ;
679 tstart = SuiteSparse_time () ;
680 #endif
681 #ifdef REAL
682 BLAS_dsyrk ("L", "N",
683 ndrow1, ndcol, /* N, K: L1 is ndrow1-by-ndcol*/
684 one, /* ALPHA: 1 */
685 Lx + L_ENTRY*pdx1, ndrow, /* A, LDA: L1, ndrow */
686 zero, /* BETA: 0 */
687 C, ndrow2) ; /* C, LDC: C1 */
688 #else
689 BLAS_zherk ("L", "N",
690 ndrow1, ndcol, /* N, K: L1 is ndrow1-by-ndcol*/
691 one, /* ALPHA: 1 */
692 Lx + L_ENTRY*pdx1, ndrow, /* A, LDA: L1, ndrow */
693 zero, /* BETA: 0 */
694 C, ndrow2) ; /* C, LDC: C1 */
695 #endif
696 #ifndef NTIMER
697 Common->CHOLMOD_CPU_SYRK_TIME += SuiteSparse_time () - tstart ;
698 #endif
699 /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C,
700 * C2 = L2*L1' */
701 if (ndrow3 > 0)
702 {
703 #ifndef NTIMER
704 Common->CHOLMOD_CPU_GEMM_CALLS++ ;
705 tstart = SuiteSparse_time () ;
706 #endif
707 #ifdef REAL
708 BLAS_dgemm ("N", "C",
709 ndrow3, ndrow1, ndcol, /* M, N, K */
710 one, /* ALPHA: 1 */
711 Lx + L_ENTRY*(pdx1 + ndrow1), /* A, LDA: L2 */
712 ndrow, /* ndrow */
713 Lx + L_ENTRY*pdx1, /* B, LDB: L1 */
714 ndrow, /* ndrow */
715 zero, /* BETA: 0 */
716 C + L_ENTRY*ndrow1, /* C, LDC: C2 */
717 ndrow2) ;
718 #else
719 BLAS_zgemm ("N", "C",
720 ndrow3, ndrow1, ndcol, /* M, N, K */
721 one, /* ALPHA: 1 */
722 Lx + L_ENTRY*(pdx1 + ndrow1), /* A, LDA: L2 */
723 ndrow, /* ndrow */
724 Lx + L_ENTRY*pdx1, /* B, LDB: L1, ndrow */
725 ndrow,
726 zero, /* BETA: 0 */
727 C + L_ENTRY*ndrow1, /* C, LDC: C2 */
728 ndrow2) ;
729 #endif
730 #ifndef NTIMER
731 Common->CHOLMOD_CPU_GEMM_TIME +=
732 SuiteSparse_time () - tstart ;
733 #endif
734 }
735
736 /* ---------------------------------------------------------- */
737 /* construct relative map to assemble d into s */
738 /* ---------------------------------------------------------- */
739
740 DEBUG (CHOLMOD(dump_real) ("C", C, ndrow2, ndrow1, TRUE,
741 L_ENTRY, Common)) ;
742
743 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
744 if ( ndrow2 > 64 )
745
746 for (i = 0 ; i < ndrow2 ; i++)
747 {
748 RelativeMap [i] = Map [Ls [pdi1 + i]] ;
749 ASSERT (RelativeMap [i] >= 0 && RelativeMap [i] < nsrow) ;
750 }
751
752 /* ---------------------------------------------------------- */
753 /* assemble C into supernode s using the relative map */
754 /* ---------------------------------------------------------- */
755
756 #pragma omp parallel for private ( j, i, px, q ) \
757 num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndrow1 > 64 )
758
759 for (j = 0 ; j < ndrow1 ; j++) /* cols k1:k2-1 */
760 {
761 ASSERT (RelativeMap [j] == Map [Ls [pdi1 + j]]) ;
762 ASSERT (RelativeMap [j] >= 0 && RelativeMap [j] < nscol) ;
763 px = psx + RelativeMap [j] * nsrow ;
764 for (i = j ; i < ndrow2 ; i++) /* rows k1:n-1 */
765 {
766 ASSERT (RelativeMap [i] == Map [Ls [pdi1 + i]]) ;
767 ASSERT (RelativeMap [i] >= j && RelativeMap[i] < nsrow);
768 /* Lx [px + RelativeMap [i]] -= C [i + pj] ; */
769 q = px + RelativeMap [i] ;
770 L_ASSEMBLESUB (Lx,q, C, i+ndrow2*j) ;
771 }
772 }
773
774 }
775 #ifdef GPU_BLAS
776 else
777 {
778 supernodeUsedGPU = 1; /* GPU was used for this supernode*/
779 Common->ibuffer++; /* gpu_updateC is asynchronous, so use
780 * the next host buffer for the next
781 * supernode */
782 Common->ibuffer = Common->ibuffer%
783 (CHOLMOD_HOST_SUPERNODE_BUFFERS*CHOLMOD_DEVICE_STREAMS);
784 }
785 #endif
786
787 /* -------------------------------------------------------------- */
788 /* prepare this supernode d for its next ancestor */
789 /* -------------------------------------------------------------- */
790
791 dnext = Next [d] ;
792
793 if (!repeat_supernode)
794 {
795 /* If node s is being repeated, Head [dancestor] has already
796 * been cleared (set to EMPTY). It must remain EMPTY. The
797 * dancestor will not be factorized since the factorization
798 * terminates at node s. */
799 Lpos [d] = pdi2 - pdi ;
800 if (Lpos [d] < ndrow)
801 {
802 dancestor = SuperMap [Ls [pdi2]] ;
803 ASSERT (dancestor > s && dancestor < nsuper) ;
804 /* place d in the link list of its next ancestor */
805 Next [d] = Head [dancestor] ;
806 Head [dancestor] = d ;
807 }
808 }
809
810 } /* end of descendant supernode loop */
811
812 #ifdef GPU_BLAS
813 if ( useGPU ) {
814 iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
815 iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;
816
817 /* combine updates assembled on the GPU with updates
818 * assembled on the CPU */
819 TEMPLATE2 ( CHOLMOD (gpu_final_assembly ))
820 ( Common, Lx, psx, nscol, nsrow, supernodeUsedGPU,
821 &iHostBuff, &iDevBuff, gpu_p );
822 }
823 #endif
824
825 PRINT1 (("\nSupernode with contributions A: repeat: "ID"\n",
826 repeat_supernode)) ;
827 DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
828 Common)) ;
829 PRINT1 (("\n\n")) ;
830
831 /* ------------------------------------------------------------------ */
832 /* factorize diagonal block of supernode s in LL' */
833 /* ------------------------------------------------------------------ */
834
835 /* The current supernode s is ready to factorize. It has been updated
836 * by all descendant supernodes. Let S = the current supernode, which
837 * holds rows k1:n-1 and columns k1:k2-1 of the updated matrix. It
838 * splits into two parts: the square diagonal block S1, and the
839 * rectangular part S2. Here, S1 is factorized into L1*L1' and
840 * overwritten by L1.
841 *
842 * If supernode s is being repeated, only factorize it up to but not
843 * including the column containing the problematic entry.
844 */
845
846 nscol2 = (repeat_supernode) ? (nscol_new) : (nscol) ;
847
848 #ifdef GPU_BLAS
849 if ( !useGPU
850 || !supernodeUsedGPU
851 || !TEMPLATE2 (CHOLMOD (gpu_lower_potrf))(nscol2, nsrow, psx, Lx,
852 &info, Common, gpu_p))
853 #endif
854 {
855 /* Note that the GPU will not be used for the triangular solve */
856 #ifdef GPU_BLAS
857 supernodeUsedGPU = 0;
858 #endif
859 #ifndef NTIMER
860 Common->CHOLMOD_CPU_POTRF_CALLS++ ;
861 tstart = SuiteSparse_time () ;
862 #endif
863 #ifdef REAL
864 LAPACK_dpotrf ("L",
865 nscol2, /* N: nscol2 */
866 Lx + L_ENTRY*psx, nsrow, /* A, LDA: S1, nsrow */
867 info) ; /* INFO */
868 #else
869 LAPACK_zpotrf ("L",
870 nscol2, /* N: nscol2 */
871 Lx + L_ENTRY*psx, nsrow, /* A, LDA: S1, nsrow */
872 info) ; /* INFO */
873 #endif
874 #ifndef NTIMER
875 Common->CHOLMOD_CPU_POTRF_TIME += SuiteSparse_time ()- tstart ;
876 #endif
877 }
878
879 /* ------------------------------------------------------------------ */
880 /* check if the matrix is not positive definite */
881 /* ------------------------------------------------------------------ */
882
883 if (repeat_supernode)
884 {
885 /* the leading part has been refactorized; it must have succeeded */
886 info = 0 ;
887
888 /* zero out the rest of this supernode */
889 p = psx + nsrow * nscol_new ;
890 pend = psx + nsrow * nscol ; /* s is nsrow-by-nscol */
891 for ( ; p < pend ; p++)
892 {
893 /* Lx [p] = 0 ; */
894 L_CLEAR (Lx,p) ;
895 }
896 }
897
898 /* info is set to one in LAPACK_*potrf if blas_ok is FALSE. It is
899 * set to zero in dpotrf/zpotrf if the factorization was successful. */
900 if (CHECK_BLAS_INT && !Common->blas_ok)
901 {
902 ERROR (CHOLMOD_TOO_LARGE, "problem too large for the BLAS") ;
903 }
904
905 if (info != 0)
906 {
907 /* Matrix is not positive definite. dpotrf/zpotrf do NOT report an
908 * error if the diagonal of L has NaN's, only if it has a zero. */
909 if (Common->status == CHOLMOD_OK)
910 {
911 ERROR (CHOLMOD_NOT_POSDEF, "matrix not positive definite") ;
912 }
913
914 /* L->minor is the column of L that contains a zero or negative
915 * diagonal term. */
916 L->minor = k1 + info - 1 ;
917
918 /* clear the link lists of all subsequent supernodes */
919 for (ss = s+1 ; ss < nsuper ; ss++)
920 {
921 Head [ss] = EMPTY ;
922 }
923
924 /* zero this supernode, and all remaining supernodes */
925 pend = L->xsize ;
926 for (p = psx ; p < pend ; p++)
927 {
928 /* Lx [p] = 0. ; */
929 L_CLEAR (Lx,p) ;
930 }
931
932 /* If L is indefinite, it still contains useful information.
933 * Supernodes 0 to s-1 are valid, similar to MATLAB [R,p]=chol(A),
934 * where the 1-based p is identical to the 0-based L->minor. Since
935 * L->minor is in the current supernode s, it and any columns to the
936 * left of it in supernode s are also all zero. This differs from
937 * [R,p]=chol(A), which contains nonzero rows 1 to p-1. Fix this
938 * by setting repeat_supernode to TRUE, and repeating supernode s.
939 *
940 * If Common->quick_return_if_not_posdef is true, then the entire
941 * supernode s is not factorized; it is left as all zero.
942 */
943
944 if (info == 1 || Common->quick_return_if_not_posdef)
945 {
946 /* If the first column of supernode s contains a zero or
947 * negative diagonal entry, then it is already properly set to
948 * zero. Also, info will be 1 if integer overflow occured in
949 * the BLAS. */
950 Head [s] = EMPTY ;
951 #ifdef GPU_BLAS
952 if ( useGPU ) {
953 CHOLMOD (gpu_end) (Common) ;
954 }
955 #endif
956 return (Common->status >= CHOLMOD_OK) ;
957 }
958 else
959 {
960 /* Repeat supernode s, but only factorize it up to but not
961 * including the column containing the problematic diagonal
962 * entry. */
963 repeat_supernode = TRUE ;
964 s-- ;
965 nscol_new = info - 1 ;
966 continue ;
967 }
968 }
969
970 /* ------------------------------------------------------------------ */
971 /* compute the subdiagonal block and prepare supernode for its parent */
972 /* ------------------------------------------------------------------ */
973
974 nsrow2 = nsrow - nscol2 ;
975 if (nsrow2 > 0)
976 {
977 /* The current supernode is columns k1 to k2-1 of L. Let L1 be the
978 * diagonal block (factorized by dpotrf/zpotrf above; rows/cols
979 * k1:k2-1), and L2 be rows k2:n-1 and columns k1:k2-1 of L. The
980 * triangular system to solve is L2*L1' = S2, where S2 is
981 * overwritten with L2. More precisely, L2 = S2 / L1' in MATLAB
982 * notation.
983 */
984
985 #ifdef GPU_BLAS
986 if ( !useGPU
987 || !supernodeUsedGPU
988 || !TEMPLATE2 (CHOLMOD(gpu_triangular_solve))
989 (nsrow2, nscol2, nsrow, psx, Lx, Common, gpu_p))
990 #endif
991 {
992 #ifndef NTIMER
993 Common->CHOLMOD_CPU_TRSM_CALLS++ ;
994 tstart = SuiteSparse_time () ;
995 #endif
996 #ifdef REAL
997 BLAS_dtrsm ("R", "L", "C", "N",
998 nsrow2, nscol2, /* M, N */
999 one, /* ALPHA: 1 */
1000 Lx + L_ENTRY*psx, nsrow, /* A, LDA: L1, nsrow */
1001 Lx + L_ENTRY*(psx + nscol2), /* B, LDB, L2, nsrow */
1002 nsrow) ;
1003 #else
1004 BLAS_ztrsm ("R", "L", "C", "N",
1005 nsrow2, nscol2, /* M, N */
1006 one, /* ALPHA: 1 */
1007 Lx + L_ENTRY*psx, nsrow, /* A, LDA: L1, nsrow */
1008 Lx + L_ENTRY*(psx + nscol2), /* B, LDB, L2, nsrow */
1009 nsrow) ;
1010 #endif
1011 #ifndef NTIMER
1012 Common->CHOLMOD_CPU_TRSM_TIME += SuiteSparse_time () - tstart ;
1013 #endif
1014 }
1015
1016 if (CHECK_BLAS_INT && !Common->blas_ok)
1017 {
1018 ERROR (CHOLMOD_TOO_LARGE, "problem too large for the BLAS") ;
1019 }
1020
1021 if (!repeat_supernode)
1022 {
1023 /* Lpos [s] is offset of first row of s affecting its parent */
1024 Lpos [s] = nscol ;
1025 sparent = SuperMap [Ls [psi + nscol]] ;
1026 ASSERT (sparent != EMPTY) ;
1027 ASSERT (Ls [psi + nscol] >= Super [sparent]) ;
1028 ASSERT (Ls [psi + nscol] < Super [sparent+1]) ;
1029 ASSERT (SuperMap [Ls [psi + nscol]] == sparent) ;
1030 ASSERT (sparent > s && sparent < nsuper) ;
1031 /* place s in link list of its parent */
1032 Next [s] = Head [sparent] ;
1033 Head [sparent] = s ;
1034 }
1035 }
1036 else
1037 {
1038 #ifdef GPU_BLAS
1039 TEMPLATE2 ( CHOLMOD (gpu_copy_supernode) )
1040 ( Common, Lx, psx, nscol, nscol2, nsrow,
1041 supernodeUsedGPU, iHostBuff, gpu_p);
1042 #endif
1043 }
1044
1045 Head [s] = EMPTY ; /* link list for supernode s no longer needed */
1046
1047 /* clear the Map (debugging only, to detect changes in pattern of A) */
1048 DEBUG (for (k = 0 ; k < nsrow ; k++) Map [Ls [psi + k]] = EMPTY) ;
1049 DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
1050 Common)) ;
1051
1052 if (repeat_supernode)
1053 {
1054 /* matrix is not positive definite; finished clean-up for supernode
1055 * containing negative diagonal */
1056
1057 #ifdef GPU_BLAS
1058 if ( useGPU )
1059 {
1060 CHOLMOD (gpu_end) (Common) ;
1061 }
1062 #endif
1063 return (Common->status >= CHOLMOD_OK) ;
1064 }
1065 }
1066
1067 /* success; matrix is positive definite */
1068 L->minor = n ;
1069
1070 #ifdef GPU_BLAS
1071 if ( useGPU )
1072 {
1073 CHOLMOD (gpu_end) (Common) ;
1074 }
1075 #endif
1076
1077 return (Common->status >= CHOLMOD_OK) ;
1078
1079 }
1080
1081 #undef PATTERN
1082 #undef REAL
1083 #undef COMPLEX
1084 #undef ZOMPLEX
1085