1 // =============================================================================
2 // === spqr_kernel =============================================================
3 // =============================================================================
4
5 // Factorize all the fronts in a single task
6
7 #include "spqr.hpp"
8
spqr_kernel(Long task,spqr_blob<Entry> * Blob)9 template <typename Entry> void spqr_kernel // _worker
10 (
11 Long task,
12 spqr_blob <Entry> *Blob
13 )
14 {
15
16 // -------------------------------------------------------------------------
17 // get the Blob
18 // -------------------------------------------------------------------------
19
20 spqr_symbolic * QRsym = Blob->QRsym ;
21 spqr_numeric <Entry> * QRnum = Blob->QRnum ;
22 double tol = Blob->tol ;
23 Long ntol = Blob->ntol ;
24 Long fchunk = Blob->fchunk ;
25 spqr_work <Entry> * Work = Blob->Work ;
26 Long * Cm = Blob->Cm ;
27 Entry ** Cblock = Blob->Cblock ;
28 Entry * Sx = Blob->Sx ;
29 cholmod_common * cc = Blob->cc ;
30
31 // -------------------------------------------------------------------------
32 // if we're using the GPU, reroute into the gpu-accelerated kernel code
33 // -------------------------------------------------------------------------
34
35 #ifdef GPU_BLAS
36 if (QRsym->QRgpu != NULL)
37 {
38 ASSERT (task == 0) ;
39 spqrgpu_kernel (Blob) ;
40 return ;
41 }
42 #endif
43
44 // -------------------------------------------------------------------------
45 // get the contents of the QR symbolic object
46 // -------------------------------------------------------------------------
47
48 Long * Super = QRsym->Super ; // size nf+1, gives pivot columns in F
49 Long * Rp = QRsym->Rp ; // size nf+1, pointers for pattern of R
50 Long * Rj = QRsym->Rj ; // size QRsym->rjsize, col indices of R
51 Long * Sleft = QRsym->Sleft ; // size n+2, leftmost column sets
52 Long * Sp = QRsym->Sp ; // size m+1, row pointers for S
53 Long * Sj = QRsym->Sj ; // size anz, column indices for S
54 Long * Child = QRsym->Child ; // size nf, for lists of children
55 Long * Childp = QRsym->Childp ; // size nf+1, for lists of children
56 Long maxfn = QRsym->maxfn ; // max # of columns in any front
57 Long nf = QRsym->nf ; // number of fronts
58
59 Long * Hip = QRsym->Hip ; // only used if H is kept
60
61 // these arrays are all NULL if QRsym->ntasks:
62 Long * TaskFront = QRsym->TaskFront ; // size nf+1
63 Long * TaskFrontp = QRsym->TaskFrontp ; // size ntasks+1
64 Long * TaskStack = QRsym->TaskStack ; // size ntasks+1
65 Long * On_stack = QRsym->On_stack ; // size nf+1
66
67 // used for sequential case (when QRnum->ntasks == 1)
68 Long * Post = QRsym->Post ; // size nf
69
70 // -------------------------------------------------------------------------
71 // get the contents of the QR numeric object
72 // -------------------------------------------------------------------------
73
74 Entry ** Rblock = QRnum->Rblock ;
75 char * Rdead = QRnum->Rdead ;
76 Long * HStair = QRnum->HStair ;
77 Entry * HTau = QRnum->HTau ;
78 Long * Hii = QRnum->Hii ; // only used if H is kept
79 Long * Hm = QRnum->Hm ;
80 Long * Hr = QRnum->Hr ;
81 Long keepH = QRnum->keepH ;
82 Long ntasks = QRnum->ntasks ; // number of tasks
83
84 // -------------------------------------------------------------------------
85 // get the stack for this task and the head/top pointers
86 // -------------------------------------------------------------------------
87
88 Long stack, kfirst, klast ;
89
90 if (ntasks == 1)
91 {
92 // sequential case
93 kfirst = 0 ;
94 klast = nf ;
95 stack = 0 ;
96 }
97 else
98 {
99 kfirst = TaskFrontp [task] ;
100 klast = TaskFrontp [task+1] ;
101 stack = TaskStack [task] ;
102 }
103
104 Entry * Stack_head = Work [stack].Stack_head ;
105 Entry * Stack_top = Work [stack].Stack_top ;
106
107 #ifndef NDEBUG
108 Entry **Stacks = QRnum->Stacks ;
109 Entry *Stack = Stacks [stack] ; // stack being used
110 Long stacksize = (ntasks == 1) ?
111 QRsym->maxstack :
112 QRsym->Stack_maxstack [stack] ;
113 #endif
114
115 // if H kept, Tau and Stair will point to permanent space in QRnum
116 Entry * Tau = keepH ? NULL : Work [stack].WTwork ;
117 Long * Stair = keepH ? NULL : Work [stack].Stair1 ;
118 Entry * W = Work [stack].WTwork + (keepH ? 0 : maxfn) ;
119
120 Long * Fmap = Work [stack].Fmap ;
121 Long * Cmap = Work [stack].Cmap ;
122
123 Long sumfrank = Work [stack].sumfrank ;
124 Long maxfrank = Work [stack].maxfrank ;
125
126 // for keeping track of norm(w) for dead column 2-norms
127 double wscale = Work [stack].wscale ;
128 double wssq = Work [stack].wssq ;
129
130 // -------------------------------------------------------------------------
131 // factorize all the fronts in this task
132 // -------------------------------------------------------------------------
133
134 for (Long kf = kfirst ; kf < klast ; kf++)
135 {
136
137 // ---------------------------------------------------------------------
138 // factorize front F
139 // ---------------------------------------------------------------------
140
141 Long f = (ntasks == 1) ? Post [kf] : TaskFront [kf] ;
142
143 #ifndef NDEBUG
144 ASSERT (f >= 0 && f < QRsym->nf) ;
145 for (Long col = 0 ; col < QRsym->n ; col++) Fmap [col] = EMPTY ;
146 #endif
147
148 if (keepH)
149 {
150 // get the permanent Stair and Tau vectors for this front
151 Stair = HStair + Rp [f] ;
152 Tau = HTau + Rp [f] ;
153 }
154
155 // ---------------------------------------------------------------------
156 // determine the size of F, its staircase, and its Fmap
157 // ---------------------------------------------------------------------
158
159 Long fm = spqr_fsize (f, Super, Rp, Rj, Sleft, Child, Childp, Cm,
160 Fmap, Stair) ;
161 Long fn = Rp [f+1] - Rp [f] ; // F is fm-by-fn
162 Long col1 = Super [f] ; // first global pivot column in F
163 Long fp = Super [f+1] - col1 ; // with fp pivot columns
164 Long fsize = fm * fn ;
165 if (keepH)
166 {
167 Hm [f] = fm ;
168 }
169
170 #ifndef NDEBUG
171 PR (("\n --- Front f %ld fm %ld fn %ld fp %ld\n", f, fm, fn, fp)) ;
172 ASSERT (Stack_head <= Stack_top) ;
173 #endif
174
175 // ---------------------------------------------------------------------
176 // allocate F
177 // ---------------------------------------------------------------------
178
179 // allocate F; this will reduce in size to hold just R or RH when done
180 Entry *F = Stack_head ;
181 Rblock [f] = F ;
182 Stack_head += fsize ;
183
184 #ifndef NDEBUG
185 PR (("Stack head %ld top %ld total %ld stacksize %ld\n",
186 (Long) (Stack_head - Stack),
187 (Long) (Stack_top - Stack),
188 (Stack_head - Stack) +
189 stacksize - (Stack_top - Stack),
190 stacksize)) ;
191 ASSERT (Stack_head <= Stack_top) ;
192 #endif
193
194 // ---------------------------------------------------------------------
195 // assemble the C blocks of the children of F
196 // ---------------------------------------------------------------------
197
198 spqr_assemble (f, fm, keepH,
199 Super, Rp, Rj, Sp, Sj, Sleft, Child, Childp,
200 Sx, Fmap, Cm, Cblock,
201 #ifndef NDEBUG
202 Rdead,
203 #endif
204 Hr, Stair, Hii, Hip, F, Cmap) ;
205
206 #ifndef NDEBUG
207 #ifndef NPRINT
208 PR (("\n --- Front assembled: f %ld\n", f)) ;
209 spqrDebug_dumpdense (F, fm, fn, fm, cc) ;
210 PR (("\n --- Freeing children from stack %ld\n", stack)) ;
211 #endif
212 #endif
213
214 // ---------------------------------------------------------------------
215 // free the C blocks of the children of F
216 // ---------------------------------------------------------------------
217
218 for (Long p = Childp [f] ; p < Childp [f+1] ; p++)
219 {
220 Long c = Child [p] ;
221 ASSERT (c >= 0 && c < f) ;
222 PR ((" child %ld on stack %ld\n", c,
223 (ntasks == 1) ? 0 : On_stack [c])) ;
224 if (ntasks == 1 || On_stack [c] == stack)
225 {
226 Long ccsize = spqr_csize (c, Rp, Cm, Super) ;
227 Stack_top = MAX (Stack_top, Cblock [c] + ccsize) ;
228 }
229 }
230
231 ASSERT (Stack_head >= Stack && Stack_head <= Stack_top) ;
232 ASSERT (Stack_top <= Stack + stacksize) ;
233
234 // ---------------------------------------------------------------------
235 // factorize the front F
236 // ---------------------------------------------------------------------
237
238 Long frank = spqr_front (fm, fn, fp, tol, ntol - col1,
239 fchunk, F, Stair, Rdead + col1, Tau, W,
240 &wscale, &wssq, cc) ;
241
242 #ifndef NDEBUG
243 #ifndef NPRINT
244 PR (("\n F factorized: f %ld fp %ld frank %ld \n", f, fp, frank)) ;
245 for (Long jj = col1 ; jj < Super [f+1] ; jj++)
246 PR (("Rdead [%ld] = %d\n", jj, Rdead [jj])) ;
247 PR (("\n ::: Front factorized:\n")) ;
248 spqrDebug_dumpdense (F, fm, fn, fm, cc) ;
249 #endif
250 #endif
251
252 // ---------------------------------------------------------------------
253 // keep track of the rank of fronts (just for this stack)
254 // ---------------------------------------------------------------------
255
256 sumfrank += frank ;
257 maxfrank = MAX (maxfrank, frank) ;
258
259 // ---------------------------------------------------------------------
260 // pack the C block of front F on stack
261 // ---------------------------------------------------------------------
262
263 Long csize = spqr_fcsize (fm, fn, fp, frank) ;
264 Stack_top -= csize ;
265
266 #ifndef NDEBUG
267 PR (("Front f %ld csize %ld piv rank %ld\n", f, csize, frank)) ;
268 PR (("Stack head %ld top %ld total %ld stacksize %ld\n",
269 (Long) (Stack_head - Stack),
270 (Long) (Stack_top - Stack),
271 (Stack_head - Stack) +
272 stacksize - (Stack_top - Stack),
273 stacksize)) ;
274 ASSERT (Stack_head <= Stack_top) ;
275 #endif
276
277 Cblock [f] = Stack_top ;
278 Cm [f] = spqr_cpack (fm, fn, fp, frank, F, Cblock [f]) ;
279
280 // ---------------------------------------------------------------------
281 // pack R or RH of front F in place
282 // ---------------------------------------------------------------------
283
284 Long rm ;
285 Long rsize = spqr_rhpack (keepH, fm, fn, fp, Stair, F, F, &rm) ;
286 if (keepH)
287 {
288 Hr [f] = rm ;
289 }
290
291 #ifndef NDEBUG
292 PR (("Cm of f %ld is %ld, csize: %ld\n", f, Cm [f], csize)) ;
293 if (keepH)
294 {
295 ASSERT (rsize <= fsize - csize) ;
296 PR (("Front %ld RHSIZE actual: %ld\n", f, rsize)) ;
297 PR (("Hr [ %ld ] = %ld\n", f, Hr [f])) ;
298 ASSERT (spqrDebug_rhsize (fm, fn, fp, Stair, cc) == rsize) ;
299 }
300 else
301 {
302 ASSERT (rsize <= frank*(frank+1)/2 + frank*(fn-frank)) ;
303 ASSERT (rsize <= fsize) ;
304 ASSERT (spqrDebug_rhsize (fm, fn, fp, Stair, cc) >= rsize) ;
305 }
306 #endif
307
308 // ---------------------------------------------------------------------
309 // free F, leaving R or RH in its place
310 // ---------------------------------------------------------------------
311
312 Stack_head -= fsize ; // free F
313 Stack_head += rsize ; // allocate R or RH
314
315 #ifndef NDEBUG
316 PR (("front %ld r/hsize %ld fp %ld fn %ld if trapezoidal %ld\n",
317 f, rsize, fp, fn, fp*(fp+1)/2 + fp*(fn-fp))) ;
318 PR (("rsize %ld frank %ld fn %ld\n", rsize, frank, fn)) ;
319 ASSERT (Stack_head <= Stack_top) ;
320 #endif
321 }
322
323 // -------------------------------------------------------------------------
324 // save the stack top & head pointers for the next task on this stack
325 // -------------------------------------------------------------------------
326
327 Work [stack].Stack_head = Stack_head ;
328 Work [stack].Stack_top = Stack_top ;
329 Work [stack].sumfrank = sumfrank ; // sum rank of fronts for this stack
330 Work [stack].maxfrank = maxfrank ; // max rank of fronts for this stack
331
332 // for keeping track of norm(w) for dead column 2-norms
333 Work [stack].wscale = wscale ;
334 Work [stack].wssq = wssq ;
335 }
336
337 // =============================================================================
338
339 template void spqr_kernel <double>
340 (
341 Long task,
342 spqr_blob <double> *Blob
343 ) ;
344
345 template void spqr_kernel <Complex>
346 (
347 Long task,
348 spqr_blob <Complex> *Blob
349 ) ;
350