1 /* ========================================================================== */
2 /* === Supernodal/t_cholmod_super_numeric =================================== */
3 /* ========================================================================== */
4 
5 /* -----------------------------------------------------------------------------
6  * CHOLMOD/Supernodal Module.  Copyright (C) 2005-2012, Timothy A. Davis
7  * http://www.suitesparse.com
8  * -------------------------------------------------------------------------- */
9 
10 /* Template routine for cholmod_super_numeric.  All xtypes supported, except
11  * that a zomplex A and F result in a complex L (there is no supernodal
12  * zomplex L).
13  */
14 
15 /* ========================================================================== */
16 /* === complex arithmetic =================================================== */
17 /* ========================================================================== */
18 
19 #include "cholmod_template.h"
20 
21 #undef L_ENTRY
22 #undef L_CLEAR
23 #undef L_ASSIGN
24 #undef L_MULTADD
25 #undef L_ASSEMBLE
26 #undef L_ASSEMBLESUB
27 
28 #ifdef REAL
29 
30 /* -------------------------------------------------------------------------- */
31 /* A, F, and L are all real */
32 /* -------------------------------------------------------------------------- */
33 
34 #define L_ENTRY 1
35 #define L_CLEAR(Lx,p)               Lx [p] = 0
36 #define L_ASSIGN(Lx,q, Ax,Az,p)     Lx [q] = Ax [p]
37 #define L_MULTADD(Lx,q, Ax,Az,p, f) Lx [q] += Ax [p] * f [0]
38 #define L_ASSEMBLE(Lx,q,b)          Lx [q] += b [0]
39 #define L_ASSEMBLESUB(Lx,q,C,p)     Lx [q] -= C [p]
40 
41 #else
42 
43 /* -------------------------------------------------------------------------- */
44 /* A and F are complex or zomplex, L and C are complex */
45 /* -------------------------------------------------------------------------- */
46 
47 #define L_ENTRY 2
48 #define L_CLEAR(Lx,p)               Lx [2*(p)] = 0 ; Lx [2*(p)+1] = 0
49 #define L_ASSEMBLE(Lx,q,b)          Lx [2*(q)] += b [0] ;
50 #define L_ASSEMBLESUB(Lx,q,C,p)                 \
51     Lx [2*(q)  ] -= C [2*(p)  ] ;               \
52     Lx [2*(q)+1] -= C [2*(p)+1] ;
53 
54 #ifdef COMPLEX
55 
56 /* -------------------------------------------------------------------------- */
57 /* A, F, L, and C are all complex */
58 /* -------------------------------------------------------------------------- */
59 
60 #define L_ASSIGN(Lx,q, Ax,Az,p)                 \
61     Lx [2*(q)  ] = Ax [2*(p)  ] ;               \
62     Lx [2*(q)+1] = Ax [2*(p)+1]
63 
64 #define L_MULTADD(Lx,q, Ax,Az,p, f)                                     \
65     Lx [2*(q)  ] += Ax [2*(p)  ] * f [0] - Ax [2*(p)+1] * f [1] ;       \
66     Lx [2*(q)+1] += Ax [2*(p)+1] * f [0] + Ax [2*(p)  ] * f [1]
67 
68 #else
69 
70 /* -------------------------------------------------------------------------- */
71 /* A and F are zomplex, L and C is complex */
72 /* -------------------------------------------------------------------------- */
73 
74 #define L_ASSIGN(Lx,q, Ax,Az,p)                 \
75     Lx [2*(q)  ] = Ax [p] ;                     \
76     Lx [2*(q)+1] = Az [p] ;
77 
78 #define L_MULTADD(Lx,q, Ax,Az,p, f)                     \
79     Lx [2*(q)  ] += Ax [p] * f [0] - Az [p] * f [1] ;   \
80     Lx [2*(q)+1] += Az [p] * f [0] + Ax [p] * f [1]
81 
82 #endif
83 #endif
84 
85 
86 /* ========================================================================== */
87 /* === t_cholmod_super_numeric ============================================== */
88 /* ========================================================================== */
89 
90 /* This function returns FALSE only if integer overflow occurs in the BLAS.
91  * It returns TRUE otherwise whether or not the matrix is positive definite. */
92 
TEMPLATE(cholmod_super_numeric)93 static int TEMPLATE (cholmod_super_numeric)
94 (
95     /* ---- input ---- */
96     cholmod_sparse *A,  /* matrix to factorize */
97     cholmod_sparse *F,  /* F = A' or A(:,f)' */
98     double beta [2],    /* beta*I is added to diagonal of matrix to factorize */
99     /* ---- in/out --- */
100     cholmod_factor *L,  /* factorization */
101     /* -- workspace -- */
102     cholmod_dense *Cwork,       /* size (L->maxcsize)-by-1 */
103     /* --------------- */
104     cholmod_common *Common
105     )
106 {
107     double one [2], zero [2], tstart ;
108     double *Lx, *Ax, *Fx, *Az, *Fz, *C ;
109     Int *Super, *Head, *Ls, *Lpi, *Lpx, *Map, *SuperMap, *RelativeMap, *Next,
110         *Lpos, *Fp, *Fi, *Fnz, *Ap, *Ai, *Anz, *Iwork, *Next_save, *Lpos_save,
111         *Previous;
112     Int nsuper, n, j, i, k, s, p, pend, k1, k2, nscol, psi, psx, psend, nsrow,
113         pj, d, kd1, kd2, info, ndcol, ndrow, pdi, pdx, pdend, pdi1, pdi2, pdx1,
114         ndrow1, ndrow2, px, dancestor, sparent, dnext, nsrow2, ndrow3, pk, pf,
115         pfend, stype, Apacked, Fpacked, q, imap, repeat_supernode, nscol2, ss,
116         tail, nscol_new = 0;
117 
118     /* ---------------------------------------------------------------------- */
119     /* declarations for the GPU */
120     /* ---------------------------------------------------------------------- */
121 
122     /* these variables are not used if the GPU module is not installed */
123 
124 #ifdef GPU_BLAS
125     Int ndescendants, mapCreatedOnGpu, supernodeUsedGPU,
126         idescendant, dlarge, dsmall, skips ;
127     int iHostBuff, iDevBuff, useGPU, GPUavailable ;
128     cholmod_gpu_pointers *gpu_p, gpu_pointer_struct ;
129     gpu_p = &gpu_pointer_struct ;
130 #endif
131 
132     /* ---------------------------------------------------------------------- */
133     /* guard against integer overflow in the BLAS */
134     /* ---------------------------------------------------------------------- */
135 
136     /* If integer overflow occurs in the BLAS, Common->status is set to
137      * CHOLMOD_TOO_LARGE, and the contents of Lx are undefined. */
138     Common->blas_ok = TRUE ;
139 
140     /* ---------------------------------------------------------------------- */
141     /* get inputs */
142     /* ---------------------------------------------------------------------- */
143 
144     nsuper = L->nsuper ;
145     n = L->n ;
146 
147     C = Cwork->x ;      /* workspace of size L->maxcsize */
148 
149     one [0] =  1.0 ;    /* ALPHA for *syrk, *herk, *gemm, and *trsm */
150     one [1] =  0. ;
151     zero [0] = 0. ;     /* BETA for *syrk, *herk, and *gemm */
152     zero [1] = 0. ;
153 
154     /* Iwork must be of size 2n + 5*nsuper, allocated in the caller,
155      * cholmod_super_numeric.  The memory cannot be allocated here because the
156      * cholmod_super_numeric initializes SuperMap, and cholmod_allocate_work
157      * does not preserve existing workspace if the space needs to be increase
158      * in size. */
159 
160     /* allocate integer workspace */
161     Iwork = Common->Iwork ;
162     SuperMap    = Iwork ;                                   /* size n (i/i/l) */
163     RelativeMap = Iwork + n ;                               /* size n (i/i/l) */
164     Next        = Iwork + 2*((size_t) n) ;                  /* size nsuper*/
165     Lpos        = Iwork + 2*((size_t) n) + nsuper ;         /* size nsuper*/
166     Next_save   = Iwork + 2*((size_t) n) + 2*((size_t) nsuper) ;/* size nsuper*/
167     Lpos_save   = Iwork + 2*((size_t) n) + 3*((size_t) nsuper) ;/* size nsuper*/
168     Previous    = Iwork + 2*((size_t) n) + 4*((size_t) nsuper) ;/* size nsuper*/
169 
170     Map  = Common->Flag ;   /* size n, use Flag as workspace for Map array */
171     Head = Common->Head ;   /* size n+1, only Head [0..nsuper-1] used */
172 
173     Ls = L->s ;
174     Lpi = L->pi ;
175     Lpx = L->px ;
176 
177     Super = L->super ;
178 
179     Lx = L->x ;
180 
181 #ifdef GPU_BLAS
182     /* local copy of useGPU */
183     if ( (Common->useGPU == 1) && L->useGPU)
184     {
185         /* Initialize the GPU.  If not found, don't use it. */
186         useGPU = TEMPLATE2 (CHOLMOD (gpu_init))
187             (C, L, Common, nsuper, n, Lpi[nsuper]-Lpi[0], gpu_p) ;
188     }
189     else
190     {
191         useGPU = 0;
192     }
193     /* fprintf (stderr, "local useGPU %d\n", useGPU) ; */
194 #endif
195 
196 #ifndef NTIMER
197     /* clear GPU / CPU statistics */
198     Common->CHOLMOD_CPU_GEMM_CALLS  = 0 ;
199     Common->CHOLMOD_CPU_SYRK_CALLS  = 0 ;
200     Common->CHOLMOD_CPU_TRSM_CALLS  = 0 ;
201     Common->CHOLMOD_CPU_POTRF_CALLS = 0 ;
202     Common->CHOLMOD_GPU_GEMM_CALLS  = 0 ;
203     Common->CHOLMOD_GPU_SYRK_CALLS  = 0 ;
204     Common->CHOLMOD_GPU_TRSM_CALLS  = 0 ;
205     Common->CHOLMOD_GPU_POTRF_CALLS = 0 ;
206     Common->CHOLMOD_CPU_GEMM_TIME   = 0 ;
207     Common->CHOLMOD_CPU_SYRK_TIME   = 0 ;
208     Common->CHOLMOD_CPU_TRSM_TIME   = 0 ;
209     Common->CHOLMOD_CPU_POTRF_TIME  = 0 ;
210     Common->CHOLMOD_GPU_GEMM_TIME   = 0 ;
211     Common->CHOLMOD_GPU_SYRK_TIME   = 0 ;
212     Common->CHOLMOD_GPU_TRSM_TIME   = 0 ;
213     Common->CHOLMOD_GPU_POTRF_TIME  = 0 ;
214     Common->CHOLMOD_ASSEMBLE_TIME   = 0 ;
215     Common->CHOLMOD_ASSEMBLE_TIME2  = 0 ;
216 #endif
217 
218     stype = A->stype ;
219 
220     if (stype != 0)
221     {
222         /* F not accessed */
223         Fp = NULL ;
224         Fi = NULL ;
225         Fx = NULL ;
226         Fz = NULL ;
227         Fnz = NULL ;
228         Fpacked = TRUE ;
229     }
230     else
231     {
232         Fp = F->p ;
233         Fi = F->i ;
234         Fx = F->x ;
235         Fz = F->z ;
236         Fnz = F->nz ;
237         Fpacked = F->packed ;
238     }
239 
240     Ap = A->p ;
241     Ai = A->i ;
242     Ax = A->x ;
243     Az = A->z ;
244     Anz = A->nz ;
245     Apacked = A->packed ;
246 
247     /* clear the Map so that changes in the pattern of A can be detected */
248 
249 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
250     if ( n > 128 ) schedule (static)
251 
252     for (i = 0 ; i < n ; i++)
253     {
254         Map [i] = EMPTY ;
255     }
256 
257     /* If the matrix is not positive definite, the supernode s containing the
258      * first zero or negative diagonal entry of L is repeated (but factorized
259      * only up to just before the problematic diagonal entry). The purpose is
260      * to provide MATLAB with [R,p]=chol(A); columns 1 to p-1 of L=R' are
261      * required, where L(p,p) is the problematic diagonal entry.  The
262      * repeat_supernode flag tells us whether this is the repeated supernode.
263      * Once supernode s is repeated, the factorization is terminated. */
264     repeat_supernode = FALSE ;
265 
266 #ifdef GPU_BLAS
267     if ( useGPU )
268     {
269         /* Case of GPU, zero all supernodes at one time for better performance*/
270         TEMPLATE2 (CHOLMOD (gpu_clear_memory))(Lx, L->xsize,
271             CHOLMOD_OMP_NUM_THREADS);
272     }
273 #endif
274 
275     /* ---------------------------------------------------------------------- */
276     /* supernodal numerical factorization */
277     /* ---------------------------------------------------------------------- */
278 
279     for (s = 0 ; s < nsuper ; s++)
280     {
281 
282         /* ------------------------------------------------------------------ */
283         /* get the size of supernode s */
284         /* ------------------------------------------------------------------ */
285 
286         k1 = Super [s] ;            /* s contains columns k1 to k2-1 of L */
287         k2 = Super [s+1] ;
288         nscol = k2 - k1 ;           /* # of columns in all of s */
289         psi = Lpi [s] ;             /* pointer to first row of s in Ls */
290         psx = Lpx [s] ;             /* pointer to first row of s in Lx */
291         psend = Lpi [s+1] ;         /* pointer just past last row of s in Ls */
292         nsrow = psend - psi ;       /* # of rows in all of s */
293 
294         PRINT1 (("====================================================\n"
295                  "S "ID" k1 "ID" k2 "ID" nsrow "ID" nscol "ID" psi "ID" psend "
296                  ""ID" psx "ID"\n", s, k1, k2, nsrow, nscol, psi, psend, psx)) ;
297         /* ------------------------------------------------------------------ */
298         /* zero the supernode s */
299         /* ------------------------------------------------------------------ */
300 
301         ASSERT ((size_t) (psx + nsrow*nscol) <= L->xsize) ;
302 
303         pend = psx + nsrow * nscol ;        /* s is nsrow-by-nscol */
304 
305 #ifdef GPU_BLAS
306         if ( !useGPU )
307 #endif
308         {
309             /* Case of no GPU, zero individual supernodes */
310 
311 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
312     schedule (static) if ( pend - psx > 1024 )
313 
314             for (p = psx ; p < pend ; p++) {
315                 L_CLEAR (Lx,p);
316             }
317         }
318 
319         /* ------------------------------------------------------------------ */
320         /* construct the scattered Map for supernode s */
321         /* ------------------------------------------------------------------ */
322 
323         /* If row i is the kth row in s, then Map [i] = k.  Similarly, if
324          * column j is the kth column in s, then  Map [j] = k. */
325 
326 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
327     if ( nsrow > 128 )
328 
329         for (k = 0 ; k < nsrow ; k++)
330         {
331             PRINT1 (("  "ID" map "ID"\n", Ls [psi+k], k)) ;
332             Map [Ls [psi + k]] = k ;
333         }
334 
335         /* ------------------------------------------------------------------ */
336         /* when using GPU, reorder supernodes by levels.*/
337         /* (all supernodes in a level are independent) */
338         /* ------------------------------------------------------------------ */
339 
340 #ifdef GPU_BLAS
341         if ( useGPU )
342         {
343             TEMPLATE2 (CHOLMOD (gpu_reorder_descendants))
344                 ( Common, Super, &s, Lpi, Lpos, Head, Next, Previous,
345                   &ndescendants, &tail, &mapCreatedOnGpu, gpu_p ) ;
346         }
347 #endif
348 
349         /* ------------------------------------------------------------------ */
350         /* copy matrix into supernode s (lower triangular part only) */
351         /* ------------------------------------------------------------------ */
352 
353         pk = psx ;
354 
355 #pragma omp parallel for private ( p, pend, pfend, pf, i, j, imap, q )  \
356     num_threads(CHOLMOD_OMP_NUM_THREADS) if ( k2-k1 > 64 )
357 
358         for (k = k1 ; k < k2 ; k++)
359         {
360             if (stype != 0)
361             {
362                 /* copy the kth column of A into the supernode */
363                 p = Ap [k] ;
364                 pend = (Apacked) ? (Ap [k+1]) : (p + Anz [k]) ;
365                 for ( ; p < pend ; p++)
366                 {
367                     /* row i of L is located in row Map [i] of s */
368                     i = Ai [p] ;
369                     if (i >= k)
370                     {
371                         /* This test is here simply to avoid a segfault.  If
372                          * the test is false, the numeric factorization of A
373                          * is undefined.  It does not detect all invalid
374                          * entries, only some of them (when debugging is
375                          * enabled, and Map is cleared after each step, then
376                          * all entries not in the pattern of L are detected). */
377                         imap = Map [i] ;
378                         if (imap >= 0 && imap < nsrow)
379                         {
380                             /* Lx [Map [i] + pk] = Ax [p] ; */
381                             L_ASSIGN (Lx,(imap+(psx+(k-k1)*nsrow)), Ax,Az,p) ;
382                         }
383                     }
384                 }
385             }
386             else
387             {
388                 double fjk[2];
389                 /* copy the kth column of A*F into the supernode */
390                 pf = Fp [k] ;
391                 pfend = (Fpacked) ? (Fp [k+1]) : (p + Fnz [k]) ;
392                 for ( ; pf < pfend ; pf++)
393                 {
394                     j = Fi [pf] ;
395 
396                     /* fjk = Fx [pf] ; */
397                     L_ASSIGN (fjk,0, Fx,Fz,pf) ;
398 
399                     p = Ap [j] ;
400                     pend = (Apacked) ? (Ap [j+1]) : (p + Anz [j]) ;
401                     for ( ; p < pend ; p++)
402                     {
403                         i = Ai [p] ;
404                         if (i >= k)
405                         {
406                             /* See the discussion of imap above. */
407                             imap = Map [i] ;
408                             if (imap >= 0 && imap < nsrow)
409                             {
410                                 /* Lx [Map [i] + pk] += Ax [p] * fjk ; */
411                                 L_MULTADD (Lx,(imap+(psx+(k-k1)*nsrow)),
412                                            Ax,Az,p, fjk) ;
413                             }
414                         }
415                     }
416                 }
417             }
418         }
419 
420         /* add beta to the diagonal of the supernode, if nonzero */
421         if (beta [0] != 0.0)
422         {
423             /* note that only the real part of beta is used */
424             pk = psx ;
425             for (k = k1 ; k < k2 ; k++)
426             {
427                 /* Lx [pk] += beta [0] ; */
428                 L_ASSEMBLE (Lx,pk, beta) ;
429                 pk += nsrow + 1 ;       /* advance to the next diagonal entry */
430             }
431         }
432 
433         PRINT1 (("Supernode with just A: repeat: "ID"\n", repeat_supernode)) ;
434         DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
435                                     Common)) ;
436         PRINT1 (("\n\n")) ;
437 
438         /* ------------------------------------------------------------------ */
439         /* save/restore the list of supernodes */
440         /* ------------------------------------------------------------------ */
441 
442         if (!repeat_supernode)
443         {
444             /* Save the list of pending descendants in case s is not positive
445              * definite.  Also save Lpos for each descendant d, so that we can
446              * find which part of d is used to update s. */
447             for (d = Head [s] ; d != EMPTY ; d = Next [d])
448             {
449                 Lpos_save [d] = Lpos [d] ;
450                 Next_save [d] = Next [d] ;
451             }
452         }
453         else
454         {
455             for (d = Head [s] ; d != EMPTY ; d = Next [d])
456             {
457                 Lpos [d] = Lpos_save [d] ;
458                 Next [d] = Next_save [d] ;
459             }
460         }
461 
462         /* ------------------------------------------------------------------ */
463         /* update supernode s with each pending descendant d */
464         /* ------------------------------------------------------------------ */
465 
466 #ifndef NDEBUG
467         for (d = Head [s] ; d != EMPTY ; d = Next [d])
468         {
469             PRINT1 (("\nWill update "ID" with Child: "ID"\n", s, d)) ;
470             DEBUG (CHOLMOD(dump_super) (d, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
471                                         Common)) ;
472         }
473         PRINT1 (("\nNow factorizing supernode "ID":\n", s)) ;
474 #endif
475 
476 #ifdef GPU_BLAS
477         /* initialize the buffer counter */
478         if ( useGPU ) {
479             Common->ibuffer = 0;
480             supernodeUsedGPU = 0;
481             idescendant = 0;
482             d = Head[s];
483             dnext = d;
484             dlarge = Next[d];
485             dsmall = tail;
486             GPUavailable = 1;
487             skips = 0;
488         }
489         else
490         {
491             dnext = Head[s];
492         }
493 #else
494         /* GPU module not installed */
495         dnext = Head[s];
496 #endif
497 
498         while
499 
500 #ifdef GPU_BLAS
501             ( (!useGPU && (dnext != EMPTY))
502                || (useGPU && (idescendant < ndescendants)))
503 #else
504 
505             ( dnext != EMPTY )
506 #endif
507         {
508 
509 #ifdef GPU_BLAS
510 
511             if ( useGPU ) {
512 
513                 /* Conditionally select the next descendant supernode to
514                  *  assemble.
515                  *   + first, select the largest descendant
516                  *   + subsequently, if gpu host buffers are available, select
517                  *     the largest remaining descendant for assembly on the GPU
518                  *   + otherwise select the smallest remaining descendant for
519                  *     assembly on the CPU
520                  *
521                  * The objective is to keep the GPU busy assembling the largest
522                  * descendants, and simultaneously keep the CPU busy assembling
523                  * the smallest descendants.
524                  *
525                  * As this is called for every descendent supernode, moving
526                  * this code to t_cholmod_gpu incurs substantial overhead -
527                  * ~20 GF/s on audikw_1 - so it is being left here.
528                  */
529 
530                 iHostBuff =
531                     (Common->ibuffer) % CHOLMOD_HOST_SUPERNODE_BUFFERS;
532                 cudaError_t cuErr;
533 
534                 if ( idescendant > 0 )  {
535                     if ( GPUavailable == -1 || skips > 0) {
536                         d = dsmall;
537                         dsmall = Previous[dsmall];
538                         skips--;
539                     }
540                     else {
541                         cuErr = cudaEventQuery
542                             ( Common->updateCBuffersFree[iHostBuff] );
543                         if ( cuErr == cudaSuccess ) {
544                             /* buffers are available, so assemble a large
545                              * descendant (anticipating that this will be
546                              * assembled on the GPU) */
547                             d = dlarge;
548                             dlarge = Next[dlarge];
549                             GPUavailable = 1;
550                             skips = 0;
551                         }
552                         else {
553                             /* buffers are not available, so the GPU is busy,
554                              * so assemble a small descendant (anticipating
555                              * that it will be assembled on the host) */
556                             d = dsmall;
557                             dsmall = Previous[dsmall];
558                             GPUavailable = 0;
559 
560                             /* if the GPUs are busy, then do this many
561                              * supernodes on the CPU before querying GPUs
562                              * again. */
563                             skips = CHOLMOD_GPU_SKIP;
564                         }
565                     }
566                 }
567 
568                 idescendant++;
569 
570             }
571             else
572             {
573                 d = dnext;
574             }
575 #else
576             /* GPU module not installed at compile time */
577             d = dnext ;
578 #endif
579             /* -------------------------------------------------------------- */
580             /* get the size of supernode d */
581             /* -------------------------------------------------------------- */
582 
583             kd1 = Super [d] ;       /* d contains cols kd1 to kd2-1 of L */
584             kd2 = Super [d+1] ;
585             ndcol = kd2 - kd1 ;     /* # of columns in all of d */
586             pdi = Lpi [d] ;         /* pointer to first row of d in Ls */
587             pdx = Lpx [d] ;         /* pointer to first row of d in Lx */
588             pdend = Lpi [d+1] ;     /* pointer just past last row of d in Ls */
589             ndrow = pdend - pdi ;   /* # rows in all of d */
590 
591             PRINT1 (("Child: ")) ;
592             DEBUG (CHOLMOD(dump_super) (d, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
593                                         Common)) ;
594 
595             /* -------------------------------------------------------------- */
596             /* find the range of rows of d that affect rows k1 to k2-1 of s */
597             /* -------------------------------------------------------------- */
598 
599             p = Lpos [d] ;          /* offset of 1st row of d affecting s */
600             pdi1 = pdi + p ;        /* ptr to 1st row of d affecting s in Ls */
601             pdx1 = pdx + p ;        /* ptr to 1st row of d affecting s in Lx */
602 
603             /* there must be at least one row remaining in d to update s */
604             ASSERT (pdi1 < pdend) ;
605             PRINT1 (("Lpos[d] "ID" pdi1 "ID" Ls[pdi1] "ID"\n",
606                      Lpos[d], pdi1, Ls [pdi1])) ;
607             ASSERT (Ls [pdi1] >= k1 && Ls [pdi1] < k2) ;
608 
609             for (pdi2 = pdi1 ; pdi2 < pdend && Ls [pdi2] < k2 ; pdi2++) ;
610             ndrow1 = pdi2 - pdi1 ;      /* # rows in first part of d */
611             ndrow2 = pdend - pdi1 ;     /* # rows in remaining d */
612 
613             /* rows Ls [pdi1 ... pdi2-1] are in the range k1 to k2-1.  Since d
614              * affects s, this set cannot be empty. */
615             ASSERT (pdi1 < pdi2 && pdi2 <= pdend) ;
616             PRINT1 (("ndrow1 "ID" ndrow2 "ID"\n", ndrow1, ndrow2)) ;
617             DEBUG (for (p = pdi1 ; p < pdi2 ; p++)
618                        PRINT1 (("Ls["ID"] "ID"\n", p, Ls[p]))) ;
619 
620             /* -------------------------------------------------------------- */
621             /* construct the update matrix C for this supernode d */
622             /* -------------------------------------------------------------- */
623 
624             /* C = L (k1:n-1, kd1:kd2-1) * L (k1:k2-1, kd1:kd2-1)', except
625              * that k1:n-1 refers to all of the rows in L, but many of the
626              * rows are all zero.  Supernode d holds columns kd1 to kd2-1 of L.
627              * Nonzero rows in the range k1:k2-1 are in the list
628              * Ls [pdi1 ... pdi2-1], of size ndrow1.  Nonzero rows in the range
629              * k2:n-1 are in the list Ls [pdi2 ... pdend], of size ndrow2.  Let
630              * L1 = L (Ls [pdi1 ... pdi2-1], kd1:kd2-1), and let
631              * L2 = L (Ls [pdi2 ... pdend],  kd1:kd2-1).  C is ndrow2-by-ndrow1.
632              * Let C1 be the first ndrow1 rows of C and let C2 be the last
633              * ndrow2-ndrow1 rows of C.  Only the lower triangular part of C1
634              * needs to be computed since C1 is symmetric.
635              */
636 
637             /* maxcsize is the largest size of C for all pairs (d,s) */
638             ASSERT (ndrow2 * ndrow1 <= ((Int) L->maxcsize)) ;
639 
640             /* compute leading ndrow1-by-ndrow1 lower triangular block of C,
641              * C1 = L1*L1' */
642 
643             ndrow3 = ndrow2 - ndrow1 ;  /* number of rows of C2 */
644             ASSERT (ndrow3 >= 0) ;
645 
646 
647 #ifdef GPU_BLAS
648             if ( useGPU ) {
649                 /* set up GPU to assemble new supernode */
650                 if ( GPUavailable == 1) {
651                     if ( ndrow2 * L_ENTRY >= CHOLMOD_ND_ROW_LIMIT &&
652                          ndcol * L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) {
653                         if ( ! mapCreatedOnGpu ) {
654                             TEMPLATE2 ( CHOLMOD (gpu_initialize_supernode))
655                                 ( Common, nscol, nsrow, psi, gpu_p );
656                             mapCreatedOnGpu = 1;
657                         }
658                     }
659                     else {
660                         /* we've reached the limit of GPU-eligible descendants
661                          * flag to stop stop performing cudaEventQueries */
662                         GPUavailable = -1;
663                     }
664                 }
665             }
666 #endif
667 
668 #ifdef GPU_BLAS
669             if ( !useGPU
670                 || GPUavailable!=1
671                 || !TEMPLATE2 (CHOLMOD (gpu_updateC)) (ndrow1, ndrow2, ndrow,
672                         ndcol, nsrow, pdx1, pdi1, Lx, C, Common, gpu_p))
673 #endif
674             {
675                 /* GPU not installed, or not used */
676 #ifndef NTIMER
677 
678                 Common->CHOLMOD_CPU_SYRK_CALLS++ ;
679                 tstart = SuiteSparse_time () ;
680 #endif
681 #ifdef REAL
682                 BLAS_dsyrk ("L", "N",
683                     ndrow1, ndcol,              /* N, K: L1 is ndrow1-by-ndcol*/
684                     one,                        /* ALPHA:  1 */
685                     Lx + L_ENTRY*pdx1, ndrow,   /* A, LDA: L1, ndrow */
686                     zero,                       /* BETA:   0 */
687                     C, ndrow2) ;                /* C, LDC: C1 */
688 #else
689                 BLAS_zherk ("L", "N",
690                     ndrow1, ndcol,              /* N, K: L1 is ndrow1-by-ndcol*/
691                     one,                        /* ALPHA:  1 */
692                     Lx + L_ENTRY*pdx1, ndrow,   /* A, LDA: L1, ndrow */
693                     zero,                       /* BETA:   0 */
694                     C, ndrow2) ;                /* C, LDC: C1 */
695 #endif
696 #ifndef NTIMER
697                 Common->CHOLMOD_CPU_SYRK_TIME += SuiteSparse_time () - tstart ;
698 #endif
699                 /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C,
700                  * C2 = L2*L1' */
701                 if (ndrow3 > 0)
702                 {
703 #ifndef NTIMER
704                     Common->CHOLMOD_CPU_GEMM_CALLS++ ;
705                     tstart = SuiteSparse_time () ;
706 #endif
707 #ifdef REAL
708                     BLAS_dgemm ("N", "C",
709                         ndrow3, ndrow1, ndcol,          /* M, N, K */
710                         one,                            /* ALPHA:  1 */
711                         Lx + L_ENTRY*(pdx1 + ndrow1),   /* A, LDA: L2 */
712                         ndrow,                          /* ndrow */
713                         Lx + L_ENTRY*pdx1,              /* B, LDB: L1 */
714                         ndrow,                          /* ndrow */
715                         zero,                           /* BETA:   0 */
716                         C + L_ENTRY*ndrow1,             /* C, LDC: C2 */
717                         ndrow2) ;
718 #else
719                     BLAS_zgemm ("N", "C",
720                         ndrow3, ndrow1, ndcol,          /* M, N, K */
721                         one,                            /* ALPHA:  1 */
722                         Lx + L_ENTRY*(pdx1 + ndrow1),   /* A, LDA: L2 */
723                         ndrow,                          /* ndrow */
724                         Lx + L_ENTRY*pdx1,              /* B, LDB: L1, ndrow */
725                         ndrow,
726                         zero,                           /* BETA:   0 */
727                         C + L_ENTRY*ndrow1,             /* C, LDC: C2 */
728                         ndrow2) ;
729 #endif
730 #ifndef NTIMER
731                     Common->CHOLMOD_CPU_GEMM_TIME +=
732                         SuiteSparse_time () - tstart ;
733 #endif
734                 }
735 
736                 /* ---------------------------------------------------------- */
737                 /* construct relative map to assemble d into s */
738                 /* ---------------------------------------------------------- */
739 
740                 DEBUG (CHOLMOD(dump_real) ("C", C, ndrow2, ndrow1, TRUE,
741                                            L_ENTRY, Common)) ;
742 
743 #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
744     if ( ndrow2 > 64 )
745 
746                 for (i = 0 ; i < ndrow2 ; i++)
747                 {
748                     RelativeMap [i] = Map [Ls [pdi1 + i]] ;
749                     ASSERT (RelativeMap [i] >= 0 && RelativeMap [i] < nsrow) ;
750                 }
751 
752                 /* ---------------------------------------------------------- */
753                 /* assemble C into supernode s using the relative map */
754                 /* ---------------------------------------------------------- */
755 
756 #pragma omp parallel for private ( j, i, px, q )                \
757     num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndrow1 > 64 )
758 
759                 for (j = 0 ; j < ndrow1 ; j++)              /* cols k1:k2-1 */
760                 {
761                     ASSERT (RelativeMap [j] == Map [Ls [pdi1 + j]]) ;
762                     ASSERT (RelativeMap [j] >= 0 && RelativeMap [j] < nscol) ;
763                     px = psx + RelativeMap [j] * nsrow ;
764                     for (i = j ; i < ndrow2 ; i++)          /* rows k1:n-1 */
765                     {
766                         ASSERT (RelativeMap [i] == Map [Ls [pdi1 + i]]) ;
767                         ASSERT (RelativeMap [i] >= j && RelativeMap[i] < nsrow);
768                         /* Lx [px + RelativeMap [i]] -= C [i + pj] ; */
769                         q = px + RelativeMap [i] ;
770                         L_ASSEMBLESUB (Lx,q, C, i+ndrow2*j) ;
771                     }
772                 }
773 
774             }
775 #ifdef GPU_BLAS
776             else
777             {
778                 supernodeUsedGPU = 1;   /* GPU was used for this supernode*/
779                 Common->ibuffer++;      /* gpu_updateC is asynchronous, so use
780                                          * the next host buffer for the next
781                                          * supernode */
782                 Common->ibuffer = Common->ibuffer%
783                     (CHOLMOD_HOST_SUPERNODE_BUFFERS*CHOLMOD_DEVICE_STREAMS);
784             }
785 #endif
786 
787             /* -------------------------------------------------------------- */
788             /* prepare this supernode d for its next ancestor */
789             /* -------------------------------------------------------------- */
790 
791             dnext = Next [d] ;
792 
793             if (!repeat_supernode)
794             {
795                 /* If node s is being repeated, Head [dancestor] has already
796                  * been cleared (set to EMPTY).  It must remain EMPTY.  The
797                  * dancestor will not be factorized since the factorization
798                  * terminates at node s. */
799                 Lpos [d] = pdi2 - pdi ;
800                 if (Lpos [d] < ndrow)
801                 {
802                     dancestor = SuperMap [Ls [pdi2]] ;
803                     ASSERT (dancestor > s && dancestor < nsuper) ;
804                     /* place d in the link list of its next ancestor */
805                     Next [d] = Head [dancestor] ;
806                     Head [dancestor] = d ;
807                 }
808             }
809 
810         }  /* end of descendant supernode loop */
811 
812 #ifdef GPU_BLAS
813         if ( useGPU ) {
814             iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
815             iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;
816 
817             /* combine updates assembled on the GPU with updates
818              * assembled on the CPU */
819             TEMPLATE2 ( CHOLMOD (gpu_final_assembly ))
820                 ( Common, Lx, psx, nscol, nsrow, supernodeUsedGPU,
821                   &iHostBuff, &iDevBuff, gpu_p );
822         }
823 #endif
824 
825         PRINT1 (("\nSupernode with contributions A: repeat: "ID"\n",
826                  repeat_supernode)) ;
827         DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
828                                     Common)) ;
829         PRINT1 (("\n\n")) ;
830 
831         /* ------------------------------------------------------------------ */
832         /* factorize diagonal block of supernode s in LL' */
833         /* ------------------------------------------------------------------ */
834 
835         /* The current supernode s is ready to factorize.  It has been updated
836          * by all descendant supernodes.  Let S = the current supernode, which
837          * holds rows k1:n-1 and columns k1:k2-1 of the updated matrix.   It
838          * splits into two parts:  the square diagonal block S1, and the
839          * rectangular part S2.  Here, S1 is factorized into L1*L1' and
840          * overwritten by L1.
841          *
842          * If supernode s is being repeated, only factorize it up to but not
843          * including the column containing the problematic entry.
844          */
845 
846         nscol2 = (repeat_supernode) ? (nscol_new) : (nscol) ;
847 
848 #ifdef GPU_BLAS
849         if ( !useGPU
850             || !supernodeUsedGPU
851             || !TEMPLATE2 (CHOLMOD (gpu_lower_potrf))(nscol2, nsrow, psx, Lx,
852                                                    &info, Common, gpu_p))
853 #endif
854         {
855             /* Note that the GPU will not be used for the triangular solve */
856 #ifdef GPU_BLAS
857             supernodeUsedGPU = 0;
858 #endif
859 #ifndef NTIMER
860             Common->CHOLMOD_CPU_POTRF_CALLS++ ;
861             tstart = SuiteSparse_time () ;
862 #endif
863 #ifdef REAL
864             LAPACK_dpotrf ("L",
865                 nscol2,                     /* N: nscol2 */
866                 Lx + L_ENTRY*psx, nsrow,    /* A, LDA: S1, nsrow */
867                 info) ;                     /* INFO */
868 #else
869             LAPACK_zpotrf ("L",
870                 nscol2,                     /* N: nscol2 */
871                 Lx + L_ENTRY*psx, nsrow,    /* A, LDA: S1, nsrow */
872                 info) ;                     /* INFO */
873 #endif
874 #ifndef NTIMER
875             Common->CHOLMOD_CPU_POTRF_TIME += SuiteSparse_time ()- tstart ;
876 #endif
877         }
878 
879         /* ------------------------------------------------------------------ */
880         /* check if the matrix is not positive definite */
881         /* ------------------------------------------------------------------ */
882 
883         if (repeat_supernode)
884         {
885             /* the leading part has been refactorized; it must have succeeded */
886             info = 0 ;
887 
888             /* zero out the rest of this supernode */
889             p = psx + nsrow * nscol_new ;
890             pend = psx + nsrow * nscol ;            /* s is nsrow-by-nscol */
891             for ( ; p < pend ; p++)
892             {
893                 /* Lx [p] = 0 ; */
894                 L_CLEAR (Lx,p) ;
895             }
896         }
897 
898         /* info is set to one in LAPACK_*potrf if blas_ok is FALSE.  It is
899          * set to zero in dpotrf/zpotrf if the factorization was successful. */
900         if (CHECK_BLAS_INT && !Common->blas_ok)
901         {
902             ERROR (CHOLMOD_TOO_LARGE, "problem too large for the BLAS") ;
903         }
904 
905         if (info != 0)
906         {
907             /* Matrix is not positive definite.  dpotrf/zpotrf do NOT report an
908              * error if the diagonal of L has NaN's, only if it has a zero. */
909             if (Common->status == CHOLMOD_OK)
910             {
911                 ERROR (CHOLMOD_NOT_POSDEF, "matrix not positive definite") ;
912             }
913 
914             /* L->minor is the column of L that contains a zero or negative
915              * diagonal term. */
916             L->minor = k1 + info - 1 ;
917 
918             /* clear the link lists of all subsequent supernodes */
919             for (ss = s+1 ; ss < nsuper ; ss++)
920             {
921                 Head [ss] = EMPTY ;
922             }
923 
924             /* zero this supernode, and all remaining supernodes */
925             pend = L->xsize ;
926             for (p = psx ; p < pend ; p++)
927             {
928                 /* Lx [p] = 0. ; */
929                 L_CLEAR (Lx,p) ;
930             }
931 
932             /* If L is indefinite, it still contains useful information.
933              * Supernodes 0 to s-1 are valid, similar to MATLAB [R,p]=chol(A),
934              * where the 1-based p is identical to the 0-based L->minor.  Since
935              * L->minor is in the current supernode s, it and any columns to the
936              * left of it in supernode s are also all zero.  This differs from
937              * [R,p]=chol(A), which contains nonzero rows 1 to p-1.  Fix this
938              * by setting repeat_supernode to TRUE, and repeating supernode s.
939              *
940              * If Common->quick_return_if_not_posdef is true, then the entire
941              * supernode s is not factorized; it is left as all zero.
942              */
943 
944             if (info == 1 || Common->quick_return_if_not_posdef)
945             {
946                 /* If the first column of supernode s contains a zero or
947                  * negative diagonal entry, then it is already properly set to
948                  * zero.  Also, info will be 1 if integer overflow occured in
949                  * the BLAS. */
950                 Head [s] = EMPTY ;
951 #ifdef GPU_BLAS
952                 if ( useGPU ) {
953                     CHOLMOD (gpu_end) (Common) ;
954                 }
955 #endif
956                 return (Common->status >= CHOLMOD_OK) ;
957             }
958             else
959             {
960                 /* Repeat supernode s, but only factorize it up to but not
961                  * including the column containing the problematic diagonal
962                  * entry. */
963                 repeat_supernode = TRUE ;
964                 s-- ;
965                 nscol_new = info - 1 ;
966                 continue ;
967             }
968         }
969 
970         /* ------------------------------------------------------------------ */
971         /* compute the subdiagonal block and prepare supernode for its parent */
972         /* ------------------------------------------------------------------ */
973 
974         nsrow2 = nsrow - nscol2 ;
975         if (nsrow2 > 0)
976         {
977             /* The current supernode is columns k1 to k2-1 of L.  Let L1 be the
978              * diagonal block (factorized by dpotrf/zpotrf above; rows/cols
979              * k1:k2-1), and L2 be rows k2:n-1 and columns k1:k2-1 of L.  The
980              * triangular system to solve is L2*L1' = S2, where S2 is
981              * overwritten with L2.  More precisely, L2 = S2 / L1' in MATLAB
982              * notation.
983              */
984 
985 #ifdef GPU_BLAS
986             if ( !useGPU
987                 || !supernodeUsedGPU
988                 || !TEMPLATE2 (CHOLMOD(gpu_triangular_solve))
989                         (nsrow2, nscol2, nsrow, psx, Lx, Common, gpu_p))
990 #endif
991             {
992 #ifndef NTIMER
993                 Common->CHOLMOD_CPU_TRSM_CALLS++ ;
994                 tstart = SuiteSparse_time () ;
995 #endif
996 #ifdef REAL
997                 BLAS_dtrsm ("R", "L", "C", "N",
998                     nsrow2, nscol2,                 /* M, N */
999                     one,                            /* ALPHA: 1 */
1000                     Lx + L_ENTRY*psx, nsrow,        /* A, LDA: L1, nsrow */
1001                     Lx + L_ENTRY*(psx + nscol2),    /* B, LDB, L2, nsrow */
1002                     nsrow) ;
1003 #else
1004                 BLAS_ztrsm ("R", "L", "C", "N",
1005                     nsrow2, nscol2,                 /* M, N */
1006                     one,                            /* ALPHA: 1 */
1007                     Lx + L_ENTRY*psx, nsrow,        /* A, LDA: L1, nsrow */
1008                     Lx + L_ENTRY*(psx + nscol2),    /* B, LDB, L2, nsrow */
1009                     nsrow) ;
1010 #endif
1011 #ifndef NTIMER
1012                 Common->CHOLMOD_CPU_TRSM_TIME += SuiteSparse_time () - tstart ;
1013 #endif
1014             }
1015 
1016             if (CHECK_BLAS_INT && !Common->blas_ok)
1017             {
1018                 ERROR (CHOLMOD_TOO_LARGE, "problem too large for the BLAS") ;
1019             }
1020 
1021             if (!repeat_supernode)
1022             {
1023                 /* Lpos [s] is offset of first row of s affecting its parent */
1024                 Lpos [s] = nscol ;
1025                 sparent = SuperMap [Ls [psi + nscol]] ;
1026                 ASSERT (sparent != EMPTY) ;
1027                 ASSERT (Ls [psi + nscol] >= Super [sparent]) ;
1028                 ASSERT (Ls [psi + nscol] <  Super [sparent+1]) ;
1029                 ASSERT (SuperMap [Ls [psi + nscol]] == sparent) ;
1030                 ASSERT (sparent > s && sparent < nsuper) ;
1031                 /* place s in link list of its parent */
1032                 Next [s] = Head [sparent] ;
1033                 Head [sparent] = s ;
1034             }
1035         }
1036         else
1037         {
1038 #ifdef GPU_BLAS
1039             TEMPLATE2 ( CHOLMOD (gpu_copy_supernode) )
1040                 ( Common, Lx, psx, nscol, nscol2, nsrow,
1041                   supernodeUsedGPU, iHostBuff, gpu_p);
1042 #endif
1043         }
1044 
1045         Head [s] = EMPTY ;  /* link list for supernode s no longer needed */
1046 
1047         /* clear the Map (debugging only, to detect changes in pattern of A) */
1048         DEBUG (for (k = 0 ; k < nsrow ; k++) Map [Ls [psi + k]] = EMPTY) ;
1049         DEBUG (CHOLMOD(dump_super) (s, Super, Lpi, Ls, Lpx, Lx, L_ENTRY,
1050                                     Common)) ;
1051 
1052         if (repeat_supernode)
1053         {
1054             /* matrix is not positive definite; finished clean-up for supernode
1055              * containing negative diagonal */
1056 
1057 #ifdef GPU_BLAS
1058             if ( useGPU )
1059             {
1060                 CHOLMOD (gpu_end) (Common) ;
1061             }
1062 #endif
1063             return (Common->status >= CHOLMOD_OK) ;
1064         }
1065     }
1066 
1067     /* success; matrix is positive definite */
1068     L->minor = n ;
1069 
1070 #ifdef GPU_BLAS
1071     if ( useGPU )
1072     {
1073         CHOLMOD (gpu_end) (Common) ;
1074     }
1075 #endif
1076 
1077     return (Common->status >= CHOLMOD_OK) ;
1078 
1079 }
1080 
1081 #undef PATTERN
1082 #undef REAL
1083 #undef COMPLEX
1084 #undef ZOMPLEX
1085