1 // =============================================================================
2 // === spqr_kernel =============================================================
3 // =============================================================================
4 
5 // Factorize all the fronts in a single task
6 
7 #include "spqr.hpp"
8 
spqr_kernel(Long task,spqr_blob<Entry> * Blob)9 template <typename Entry> void spqr_kernel // _worker
10 (
11     Long task,
12     spqr_blob <Entry> *Blob
13 )
14 {
15 
16     // -------------------------------------------------------------------------
17     // get the Blob
18     // -------------------------------------------------------------------------
19 
20     spqr_symbolic *          QRsym = Blob->QRsym ;
21     spqr_numeric <Entry> *   QRnum = Blob->QRnum ;
22     double                   tol = Blob->tol ;
23     Long                     ntol = Blob->ntol ;
24     Long                     fchunk = Blob->fchunk ;
25     spqr_work <Entry> *      Work = Blob->Work ;
26     Long *                   Cm = Blob->Cm ;
27     Entry **                 Cblock = Blob->Cblock ;
28     Entry *                  Sx = Blob->Sx ;
29     cholmod_common *         cc = Blob->cc ;
30 
31     // -------------------------------------------------------------------------
32     // if we're using the GPU, reroute into the gpu-accelerated kernel code
33     // -------------------------------------------------------------------------
34 
35 #ifdef GPU_BLAS
36     if (QRsym->QRgpu != NULL)
37     {
38         ASSERT (task == 0) ;
39         spqrgpu_kernel (Blob) ;
40         return ;
41     }
42 #endif
43 
44     // -------------------------------------------------------------------------
45     // get the contents of the QR symbolic object
46     // -------------------------------------------------------------------------
47 
48     Long *  Super = QRsym->Super ;      // size nf+1, gives pivot columns in F
49     Long *  Rp = QRsym->Rp ;            // size nf+1, pointers for pattern of R
50     Long *  Rj = QRsym->Rj ;            // size QRsym->rjsize, col indices of R
51     Long *  Sleft = QRsym->Sleft ;      // size n+2, leftmost column sets
52     Long *  Sp = QRsym->Sp ;            // size m+1, row pointers for S
53     Long *  Sj = QRsym->Sj ;            // size anz, column indices for S
54     Long *  Child = QRsym->Child ;      // size nf, for lists of children
55     Long *  Childp = QRsym->Childp ;    // size nf+1, for lists of children
56     Long    maxfn  = QRsym->maxfn ;     // max # of columns in any front
57     Long    nf = QRsym->nf ;            // number of fronts
58 
59     Long *  Hip = QRsym->Hip ;          // only used if H is kept
60 
61     // these arrays are all NULL if QRsym->ntasks:
62     Long *  TaskFront = QRsym->TaskFront ;      // size nf+1
63     Long *  TaskFrontp = QRsym->TaskFrontp ;    // size ntasks+1
64     Long *  TaskStack = QRsym->TaskStack ;      // size ntasks+1
65     Long *  On_stack = QRsym->On_stack ;        // size nf+1
66 
67     // used for sequential case (when QRnum->ntasks == 1)
68     Long *  Post = QRsym->Post ;                // size nf
69 
70     // -------------------------------------------------------------------------
71     // get the contents of the QR numeric object
72     // -------------------------------------------------------------------------
73 
74     Entry ** Rblock = QRnum->Rblock ;
75     char *   Rdead = QRnum->Rdead ;
76     Long *   HStair = QRnum->HStair ;
77     Entry *  HTau = QRnum->HTau ;
78     Long *   Hii = QRnum->Hii ;          // only used if H is kept
79     Long *   Hm = QRnum->Hm ;
80     Long *   Hr = QRnum->Hr ;
81     Long     keepH = QRnum->keepH ;
82     Long     ntasks = QRnum->ntasks ;    // number of tasks
83 
84     // -------------------------------------------------------------------------
85     // get the stack for this task and the head/top pointers
86     // -------------------------------------------------------------------------
87 
88     Long stack, kfirst, klast ;
89 
90     if (ntasks == 1)
91     {
92         // sequential case
93         kfirst = 0 ;
94         klast  = nf ;
95         stack  = 0 ;
96     }
97     else
98     {
99         kfirst = TaskFrontp [task] ;
100         klast  = TaskFrontp [task+1] ;
101         stack  = TaskStack [task] ;
102     }
103 
104     Entry * Stack_head = Work [stack].Stack_head ;
105     Entry * Stack_top = Work [stack].Stack_top ;
106 
107 #ifndef NDEBUG
108     Entry **Stacks = QRnum->Stacks ;
109     Entry *Stack = Stacks [stack] ;    // stack being used
110     Long stacksize = (ntasks == 1) ?
111             QRsym->maxstack :
112             QRsym->Stack_maxstack [stack] ;
113 #endif
114 
115     // if H kept, Tau and Stair will point to permanent space in QRnum
116     Entry * Tau = keepH ? NULL : Work [stack].WTwork ;
117     Long *  Stair = keepH ? NULL : Work [stack].Stair1 ;
118     Entry * W = Work [stack].WTwork + (keepH ? 0 : maxfn) ;
119 
120     Long *  Fmap = Work [stack].Fmap ;
121     Long *  Cmap = Work [stack].Cmap ;
122 
123     Long    sumfrank = Work [stack].sumfrank ;
124     Long    maxfrank = Work [stack].maxfrank ;
125 
126     // for keeping track of norm(w) for dead column 2-norms
127     double wscale = Work [stack].wscale ;
128     double wssq   = Work [stack].wssq   ;
129 
130     // -------------------------------------------------------------------------
131     // factorize all the fronts in this task
132     // -------------------------------------------------------------------------
133 
134     for (Long kf = kfirst ; kf < klast ; kf++)
135     {
136 
137         // ---------------------------------------------------------------------
138         // factorize front F
139         // ---------------------------------------------------------------------
140 
141         Long f = (ntasks == 1) ? Post [kf] : TaskFront [kf] ;
142 
143 #ifndef NDEBUG
144         ASSERT (f >= 0 && f < QRsym->nf) ;
145         for (Long col = 0 ; col < QRsym->n ; col++) Fmap [col] = EMPTY ;
146 #endif
147 
148         if (keepH)
149         {
150             // get the permanent Stair and Tau vectors for this front
151             Stair = HStair + Rp [f] ;
152             Tau = HTau + Rp [f] ;
153         }
154 
155         // ---------------------------------------------------------------------
156         // determine the size of F, its staircase, and its Fmap
157         // ---------------------------------------------------------------------
158 
159         Long fm = spqr_fsize (f, Super, Rp, Rj, Sleft, Child, Childp, Cm,
160             Fmap, Stair) ;
161         Long fn = Rp [f+1] - Rp [f] ;        // F is fm-by-fn
162         Long col1 = Super [f] ;              // first global pivot column in F
163         Long fp = Super [f+1] - col1 ;       // with fp pivot columns
164         Long fsize = fm * fn ;
165         if (keepH)
166         {
167             Hm [f] = fm ;
168         }
169 
170 #ifndef NDEBUG
171         PR (("\n --- Front f %ld fm %ld fn %ld fp %ld\n", f, fm, fn, fp)) ;
172         ASSERT (Stack_head <= Stack_top) ;
173 #endif
174 
175         // ---------------------------------------------------------------------
176         // allocate F
177         // ---------------------------------------------------------------------
178 
179         // allocate F; this will reduce in size to hold just R or RH when done
180         Entry *F = Stack_head ;
181         Rblock [f] = F ;
182         Stack_head += fsize ;
183 
184 #ifndef NDEBUG
185         PR (("Stack head %ld top %ld total %ld stacksize %ld\n",
186             (Long) (Stack_head - Stack),
187             (Long) (Stack_top - Stack),
188             (Stack_head - Stack) +
189             stacksize - (Stack_top - Stack),
190             stacksize)) ;
191         ASSERT (Stack_head <= Stack_top) ;
192 #endif
193 
194         // ---------------------------------------------------------------------
195         // assemble the C blocks of the children of F
196         // ---------------------------------------------------------------------
197 
198         spqr_assemble (f, fm, keepH,
199             Super, Rp, Rj, Sp, Sj, Sleft, Child, Childp,
200             Sx, Fmap, Cm, Cblock,
201 #ifndef NDEBUG
202             Rdead,
203 #endif
204             Hr, Stair, Hii, Hip, F, Cmap) ;
205 
206 #ifndef NDEBUG
207 #ifndef NPRINT
208         PR (("\n --- Front assembled: f %ld\n", f)) ;
209         spqrDebug_dumpdense (F, fm, fn, fm, cc) ;
210         PR (("\n --- Freeing children from stack %ld\n", stack)) ;
211 #endif
212 #endif
213 
214         // ---------------------------------------------------------------------
215         // free the C blocks of the children of F
216         // ---------------------------------------------------------------------
217 
218         for (Long p = Childp [f] ; p < Childp [f+1] ; p++)
219         {
220             Long c = Child [p] ;
221             ASSERT (c >= 0 && c < f) ;
222             PR (("   child %ld on stack %ld\n", c,
223                 (ntasks == 1) ? 0 : On_stack [c])) ;
224             if (ntasks == 1 || On_stack [c] == stack)
225             {
226                 Long ccsize = spqr_csize (c, Rp, Cm, Super) ;
227                 Stack_top = MAX (Stack_top, Cblock [c] + ccsize) ;
228             }
229         }
230 
231         ASSERT (Stack_head >= Stack && Stack_head <= Stack_top) ;
232         ASSERT (Stack_top <= Stack + stacksize) ;
233 
234         // ---------------------------------------------------------------------
235         // factorize the front F
236         // ---------------------------------------------------------------------
237 
238         Long frank = spqr_front (fm, fn, fp, tol, ntol - col1,
239             fchunk, F, Stair, Rdead + col1, Tau, W,
240             &wscale, &wssq, cc) ;
241 
242 #ifndef NDEBUG
243 #ifndef NPRINT
244         PR (("\n F factorized: f %ld fp %ld frank %ld \n", f, fp, frank)) ;
245         for (Long jj = col1 ; jj < Super [f+1] ; jj++)
246             PR (("Rdead [%ld] = %d\n", jj, Rdead [jj])) ;
247         PR (("\n ::: Front factorized:\n")) ;
248         spqrDebug_dumpdense (F, fm, fn, fm, cc) ;
249 #endif
250 #endif
251 
252         // ---------------------------------------------------------------------
253         // keep track of the rank of fronts (just for this stack)
254         // ---------------------------------------------------------------------
255 
256         sumfrank += frank ;
257         maxfrank = MAX (maxfrank, frank) ;
258 
259         // ---------------------------------------------------------------------
260         // pack the C block of front F on stack
261         // ---------------------------------------------------------------------
262 
263         Long csize = spqr_fcsize (fm, fn, fp, frank) ;
264         Stack_top -= csize ;
265 
266 #ifndef NDEBUG
267         PR (("Front f %ld csize %ld piv rank %ld\n", f, csize, frank)) ;
268         PR (("Stack head %ld top %ld total %ld stacksize %ld\n",
269             (Long) (Stack_head - Stack),
270             (Long) (Stack_top - Stack),
271             (Stack_head - Stack) +
272             stacksize - (Stack_top - Stack),
273             stacksize)) ;
274         ASSERT (Stack_head <= Stack_top) ;
275 #endif
276 
277         Cblock [f] = Stack_top ;
278         Cm [f] = spqr_cpack (fm, fn, fp, frank, F, Cblock [f]) ;
279 
280         // ---------------------------------------------------------------------
281         // pack R or RH of front F in place
282         // ---------------------------------------------------------------------
283 
284         Long rm ;
285         Long rsize = spqr_rhpack (keepH, fm, fn, fp, Stair, F, F, &rm) ;
286         if (keepH)
287         {
288             Hr [f] = rm ;
289         }
290 
291 #ifndef NDEBUG
292         PR (("Cm of f %ld is %ld, csize: %ld\n", f, Cm [f], csize)) ;
293         if (keepH)
294         {
295             ASSERT (rsize <= fsize - csize) ;
296             PR (("Front %ld RHSIZE actual: %ld\n", f, rsize)) ;
297             PR (("Hr [ %ld ] =  %ld\n", f, Hr [f])) ;
298             ASSERT (spqrDebug_rhsize (fm, fn, fp, Stair, cc) == rsize) ;
299         }
300         else
301         {
302             ASSERT (rsize <= frank*(frank+1)/2 + frank*(fn-frank)) ;
303             ASSERT (rsize <= fsize) ;
304             ASSERT (spqrDebug_rhsize (fm, fn, fp, Stair, cc) >= rsize) ;
305         }
306 #endif
307 
308         // ---------------------------------------------------------------------
309         // free F, leaving R or RH in its place
310         // ---------------------------------------------------------------------
311 
312         Stack_head -= fsize ;             // free F
313         Stack_head += rsize ;             // allocate R or RH
314 
315 #ifndef NDEBUG
316         PR (("front %ld r/hsize %ld fp %ld fn %ld if trapezoidal %ld\n",
317             f, rsize, fp, fn, fp*(fp+1)/2 + fp*(fn-fp))) ;
318         PR (("rsize %ld frank %ld fn %ld\n", rsize, frank, fn)) ;
319         ASSERT (Stack_head <= Stack_top) ;
320 #endif
321     }
322 
323     // -------------------------------------------------------------------------
324     // save the stack top & head pointers for the next task on this stack
325     // -------------------------------------------------------------------------
326 
327     Work [stack].Stack_head = Stack_head  ;
328     Work [stack].Stack_top = Stack_top ;
329     Work [stack].sumfrank = sumfrank ;  // sum rank of fronts for this stack
330     Work [stack].maxfrank = maxfrank ;  // max rank of fronts for this stack
331 
332     // for keeping track of norm(w) for dead column 2-norms
333     Work [stack].wscale = wscale ;
334     Work [stack].wssq   = wssq   ;
335 }
336 
337 // =============================================================================
338 
339 template void spqr_kernel <double>
340 (
341     Long task,
342     spqr_blob <double> *Blob
343 ) ;
344 
345 template void spqr_kernel <Complex>
346 (
347     Long task,
348     spqr_blob <Complex> *Blob
349 ) ;
350