1 /* ========================================================================== */
2 /* === KLU_tsolve =========================================================== */
3 /* ========================================================================== */
4
5 /* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
6 * (or KLU_analyze_given) and KLU_factor. Note that no iterative refinement is
7 * performed. Uses Numeric->Xwork as workspace (undefined on input and output),
8 * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
9 * Numeric->Iwork).
10 */
11
12 #include "klu_internal.h"
13
KLU_tsolve(KLU_symbolic * Symbolic,KLU_numeric * Numeric,Int d,Int nrhs,double B[],Int conj_solve,KLU_common * Common)14 Int KLU_tsolve
15 (
16 /* inputs, not modified */
17 KLU_symbolic *Symbolic,
18 KLU_numeric *Numeric,
19 Int d, /* leading dimension of B */
20 Int nrhs, /* number of right-hand-sides */
21
22 /* right-hand-side on input, overwritten with solution to Ax=b on output */
23 double B [ ], /* size n*nrhs, in column-oriented form, with
24 * leading dimension d. */
25 #ifdef COMPLEX
26 Int conj_solve, /* TRUE for conjugate transpose solve, FALSE for
27 * array transpose solve. Used for the complex
28 * case only. */
29 #endif
30 /* --------------- */
31 KLU_common *Common
32 )
33 {
34 Entry x [4], offik, s ;
35 double rs, *Rs ;
36 Entry *Offx, *X, *Bz, *Udiag ;
37 Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
38 Unit **LUbx ;
39 Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
40
41 /* ---------------------------------------------------------------------- */
42 /* check inputs */
43 /* ---------------------------------------------------------------------- */
44
45 if (Common == NULL)
46 {
47 return (FALSE) ;
48 }
49 if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
50 B == NULL)
51 {
52 Common->status = KLU_INVALID ;
53 return (FALSE) ;
54 }
55 Common->status = KLU_OK ;
56
57 /* ---------------------------------------------------------------------- */
58 /* get the contents of the Symbolic object */
59 /* ---------------------------------------------------------------------- */
60
61 Bz = (Entry *) B ;
62 n = Symbolic->n ;
63 nblocks = Symbolic->nblocks ;
64 Q = Symbolic->Q ;
65 R = Symbolic->R ;
66
67 /* ---------------------------------------------------------------------- */
68 /* get the contents of the Numeric object */
69 /* ---------------------------------------------------------------------- */
70
71 ASSERT (nblocks == Numeric->nblocks) ;
72 Pnum = Numeric->Pnum ;
73 Offp = Numeric->Offp ;
74 Offi = Numeric->Offi ;
75 Offx = (Entry *) Numeric->Offx ;
76
77 Lip = Numeric->Lip ;
78 Llen = Numeric->Llen ;
79 Uip = Numeric->Uip ;
80 Ulen = Numeric->Ulen ;
81 LUbx = (Unit **) Numeric->LUbx ;
82 Udiag = Numeric->Udiag ;
83
84 Rs = Numeric->Rs ;
85 X = (Entry *) Numeric->Xwork ;
86 ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
87
88 /* ---------------------------------------------------------------------- */
89 /* solve in chunks of 4 columns at a time */
90 /* ---------------------------------------------------------------------- */
91
92 for (chunk = 0 ; chunk < nrhs ; chunk += 4)
93 {
94
95 /* ------------------------------------------------------------------ */
96 /* get the size of the current chunk */
97 /* ------------------------------------------------------------------ */
98
99 nr = MIN (nrhs - chunk, 4) ;
100
101 /* ------------------------------------------------------------------ */
102 /* permute the right hand side, X = Q'*B */
103 /* ------------------------------------------------------------------ */
104
105 switch (nr)
106 {
107
108 case 1:
109
110 for (k = 0 ; k < n ; k++)
111 {
112 X [k] = Bz [Q [k]] ;
113 }
114 break ;
115
116 case 2:
117
118 for (k = 0 ; k < n ; k++)
119 {
120 i = Q [k] ;
121 X [2*k ] = Bz [i ] ;
122 X [2*k + 1] = Bz [i + d ] ;
123 }
124 break ;
125
126 case 3:
127
128 for (k = 0 ; k < n ; k++)
129 {
130 i = Q [k] ;
131 X [3*k ] = Bz [i ] ;
132 X [3*k + 1] = Bz [i + d ] ;
133 X [3*k + 2] = Bz [i + d*2] ;
134 }
135 break ;
136
137 case 4:
138
139 for (k = 0 ; k < n ; k++)
140 {
141 i = Q [k] ;
142 X [4*k ] = Bz [i ] ;
143 X [4*k + 1] = Bz [i + d ] ;
144 X [4*k + 2] = Bz [i + d*2] ;
145 X [4*k + 3] = Bz [i + d*3] ;
146 }
147 break ;
148
149 }
150
151 /* ------------------------------------------------------------------ */
152 /* solve X = (L*U + Off)'\X */
153 /* ------------------------------------------------------------------ */
154
155 for (block = 0 ; block < nblocks ; block++)
156 {
157
158 /* -------------------------------------------------------------- */
159 /* the block of size nk is from rows/columns k1 to k2-1 */
160 /* -------------------------------------------------------------- */
161
162 k1 = R [block] ;
163 k2 = R [block+1] ;
164 nk = k2 - k1 ;
165 PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
166
167 /* -------------------------------------------------------------- */
168 /* block back-substitution for the off-diagonal-block entries */
169 /* -------------------------------------------------------------- */
170
171 if (block > 0)
172 {
173 switch (nr)
174 {
175
176 case 1:
177
178 for (k = k1 ; k < k2 ; k++)
179 {
180 pend = Offp [k+1] ;
181 for (p = Offp [k] ; p < pend ; p++)
182 {
183 #ifdef COMPLEX
184 if (conj_solve)
185 {
186 MULT_SUB_CONJ (X [k], X [Offi [p]],
187 Offx [p]) ;
188 }
189 else
190 #endif
191 {
192 MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
193 }
194 }
195 }
196 break ;
197
198 case 2:
199
200 for (k = k1 ; k < k2 ; k++)
201 {
202 pend = Offp [k+1] ;
203 x [0] = X [2*k ] ;
204 x [1] = X [2*k + 1] ;
205 for (p = Offp [k] ; p < pend ; p++)
206 {
207 i = Offi [p] ;
208 #ifdef COMPLEX
209 if (conj_solve)
210 {
211 CONJ (offik, Offx [p]) ;
212 }
213 else
214 #endif
215 {
216 offik = Offx [p] ;
217 }
218 MULT_SUB (x [0], offik, X [2*i]) ;
219 MULT_SUB (x [1], offik, X [2*i + 1]) ;
220 }
221 X [2*k ] = x [0] ;
222 X [2*k + 1] = x [1] ;
223 }
224 break ;
225
226 case 3:
227
228 for (k = k1 ; k < k2 ; k++)
229 {
230 pend = Offp [k+1] ;
231 x [0] = X [3*k ] ;
232 x [1] = X [3*k + 1] ;
233 x [2] = X [3*k + 2] ;
234 for (p = Offp [k] ; p < pend ; p++)
235 {
236 i = Offi [p] ;
237 #ifdef COMPLEX
238 if (conj_solve)
239 {
240 CONJ (offik, Offx [p]) ;
241 }
242 else
243 #endif
244 {
245 offik = Offx [p] ;
246 }
247 MULT_SUB (x [0], offik, X [3*i]) ;
248 MULT_SUB (x [1], offik, X [3*i + 1]) ;
249 MULT_SUB (x [2], offik, X [3*i + 2]) ;
250 }
251 X [3*k ] = x [0] ;
252 X [3*k + 1] = x [1] ;
253 X [3*k + 2] = x [2] ;
254 }
255 break ;
256
257 case 4:
258
259 for (k = k1 ; k < k2 ; k++)
260 {
261 pend = Offp [k+1] ;
262 x [0] = X [4*k ] ;
263 x [1] = X [4*k + 1] ;
264 x [2] = X [4*k + 2] ;
265 x [3] = X [4*k + 3] ;
266 for (p = Offp [k] ; p < pend ; p++)
267 {
268 i = Offi [p] ;
269 #ifdef COMPLEX
270 if (conj_solve)
271 {
272 CONJ(offik, Offx [p]) ;
273 }
274 else
275 #endif
276 {
277 offik = Offx [p] ;
278 }
279 MULT_SUB (x [0], offik, X [4*i]) ;
280 MULT_SUB (x [1], offik, X [4*i + 1]) ;
281 MULT_SUB (x [2], offik, X [4*i + 2]) ;
282 MULT_SUB (x [3], offik, X [4*i + 3]) ;
283 }
284 X [4*k ] = x [0] ;
285 X [4*k + 1] = x [1] ;
286 X [4*k + 2] = x [2] ;
287 X [4*k + 3] = x [3] ;
288 }
289 break ;
290 }
291 }
292
293 /* -------------------------------------------------------------- */
294 /* solve the block system */
295 /* -------------------------------------------------------------- */
296
297 if (nk == 1)
298 {
299 #ifdef COMPLEX
300 if (conj_solve)
301 {
302 CONJ (s, Udiag [k1]) ;
303 }
304 else
305 #endif
306 {
307 s = Udiag [k1] ;
308 }
309 switch (nr)
310 {
311
312 case 1:
313 DIV (X [k1], X [k1], s) ;
314 break ;
315
316 case 2:
317 DIV (X [2*k1], X [2*k1], s) ;
318 DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
319 break ;
320
321 case 3:
322 DIV (X [3*k1], X [3*k1], s) ;
323 DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
324 DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
325 break ;
326
327 case 4:
328 DIV (X [4*k1], X [4*k1], s) ;
329 DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
330 DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
331 DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
332 break ;
333
334 }
335 }
336 else
337 {
338 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
339 Udiag + k1, nr,
340 #ifdef COMPLEX
341 conj_solve,
342 #endif
343 X + nr*k1) ;
344 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
345 #ifdef COMPLEX
346 conj_solve,
347 #endif
348 X + nr*k1) ;
349 }
350 }
351
352 /* ------------------------------------------------------------------ */
353 /* scale and permute the result, Bz = P'(R\X) */
354 /* ------------------------------------------------------------------ */
355
356 if (Rs == NULL)
357 {
358
359 /* no scaling */
360 switch (nr)
361 {
362
363 case 1:
364
365 for (k = 0 ; k < n ; k++)
366 {
367 Bz [Pnum [k]] = X [k] ;
368 }
369 break ;
370
371 case 2:
372
373 for (k = 0 ; k < n ; k++)
374 {
375 i = Pnum [k] ;
376 Bz [i ] = X [2*k ] ;
377 Bz [i + d ] = X [2*k + 1] ;
378 }
379 break ;
380
381 case 3:
382
383 for (k = 0 ; k < n ; k++)
384 {
385 i = Pnum [k] ;
386 Bz [i ] = X [3*k ] ;
387 Bz [i + d ] = X [3*k + 1] ;
388 Bz [i + d*2] = X [3*k + 2] ;
389 }
390 break ;
391
392 case 4:
393
394 for (k = 0 ; k < n ; k++)
395 {
396 i = Pnum [k] ;
397 Bz [i ] = X [4*k ] ;
398 Bz [i + d ] = X [4*k + 1] ;
399 Bz [i + d*2] = X [4*k + 2] ;
400 Bz [i + d*3] = X [4*k + 3] ;
401 }
402 break ;
403 }
404
405 }
406 else
407 {
408
409 switch (nr)
410 {
411
412 case 1:
413
414 for (k = 0 ; k < n ; k++)
415 {
416 SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
417 }
418 break ;
419
420 case 2:
421
422 for (k = 0 ; k < n ; k++)
423 {
424 i = Pnum [k] ;
425 rs = Rs [k] ;
426 SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
427 SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
428 }
429 break ;
430
431 case 3:
432
433 for (k = 0 ; k < n ; k++)
434 {
435 i = Pnum [k] ;
436 rs = Rs [k] ;
437 SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
438 SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
439 SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
440 }
441 break ;
442
443 case 4:
444
445 for (k = 0 ; k < n ; k++)
446 {
447 i = Pnum [k] ;
448 rs = Rs [k] ;
449 SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
450 SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
451 SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
452 SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
453 }
454 break ;
455 }
456 }
457
458 /* ------------------------------------------------------------------ */
459 /* go to the next chunk of B */
460 /* ------------------------------------------------------------------ */
461
462 Bz += d*4 ;
463 }
464 return (TRUE) ;
465 }
466