1 /* ========================================================================== */
2 /* === KLU_tsolve =========================================================== */
3 /* ========================================================================== */
4 
5 /* Solve A'x=b using the symbolic and numeric objects from KLU_analyze
6  * (or KLU_analyze_given) and KLU_factor.  Note that no iterative refinement is
7  * performed.  Uses Numeric->Xwork as workspace (undefined on input and output),
8  * of size 4n Entry's (note that columns 2 to 4 of Xwork overlap with
9  * Numeric->Iwork).
10  */
11 
12 #include "klu_internal.h"
13 
KLU_tsolve(KLU_symbolic * Symbolic,KLU_numeric * Numeric,Int d,Int nrhs,double B[],Int conj_solve,KLU_common * Common)14 Int KLU_tsolve
15 (
16     /* inputs, not modified */
17     KLU_symbolic *Symbolic,
18     KLU_numeric *Numeric,
19     Int d,                  /* leading dimension of B */
20     Int nrhs,               /* number of right-hand-sides */
21 
22     /* right-hand-side on input, overwritten with solution to Ax=b on output */
23     double B [ ],           /* size n*nrhs, in column-oriented form, with
24                              * leading dimension d. */
25 #ifdef COMPLEX
26     Int conj_solve,         /* TRUE for conjugate transpose solve, FALSE for
27                              * array transpose solve.  Used for the complex
28                              * case only. */
29 #endif
30     /* --------------- */
31     KLU_common *Common
32 )
33 {
34     Entry x [4], offik, s ;
35     double rs, *Rs ;
36     Entry *Offx, *X, *Bz, *Udiag ;
37     Int *Q, *R, *Pnum, *Offp, *Offi, *Lip, *Uip, *Llen, *Ulen ;
38     Unit **LUbx ;
39     Int k1, k2, nk, k, block, pend, n, p, nblocks, chunk, nr, i ;
40 
41     /* ---------------------------------------------------------------------- */
42     /* check inputs */
43     /* ---------------------------------------------------------------------- */
44 
45     if (Common == NULL)
46     {
47         return (FALSE) ;
48     }
49     if (Numeric == NULL || Symbolic == NULL || d < Symbolic->n || nrhs < 0 ||
50         B == NULL)
51     {
52         Common->status = KLU_INVALID ;
53         return (FALSE) ;
54     }
55     Common->status = KLU_OK ;
56 
57     /* ---------------------------------------------------------------------- */
58     /* get the contents of the Symbolic object */
59     /* ---------------------------------------------------------------------- */
60 
61     Bz = (Entry *) B ;
62     n = Symbolic->n ;
63     nblocks = Symbolic->nblocks ;
64     Q = Symbolic->Q ;
65     R = Symbolic->R ;
66 
67     /* ---------------------------------------------------------------------- */
68     /* get the contents of the Numeric object */
69     /* ---------------------------------------------------------------------- */
70 
71     ASSERT (nblocks == Numeric->nblocks) ;
72     Pnum = Numeric->Pnum ;
73     Offp = Numeric->Offp ;
74     Offi = Numeric->Offi ;
75     Offx = (Entry *) Numeric->Offx ;
76 
77     Lip  = Numeric->Lip ;
78     Llen = Numeric->Llen ;
79     Uip  = Numeric->Uip ;
80     Ulen = Numeric->Ulen ;
81     LUbx = (Unit **) Numeric->LUbx ;
82     Udiag = Numeric->Udiag ;
83 
84     Rs = Numeric->Rs ;
85     X = (Entry *) Numeric->Xwork ;
86     ASSERT (KLU_valid (n, Offp, Offi, Offx)) ;
87 
88     /* ---------------------------------------------------------------------- */
89     /* solve in chunks of 4 columns at a time */
90     /* ---------------------------------------------------------------------- */
91 
92     for (chunk = 0 ; chunk < nrhs ; chunk += 4)
93     {
94 
95         /* ------------------------------------------------------------------ */
96         /* get the size of the current chunk */
97         /* ------------------------------------------------------------------ */
98 
99         nr = MIN (nrhs - chunk, 4) ;
100 
101         /* ------------------------------------------------------------------ */
102         /* permute the right hand side, X = Q'*B */
103         /* ------------------------------------------------------------------ */
104 
105         switch (nr)
106         {
107 
108             case 1:
109 
110                 for (k = 0 ; k < n ; k++)
111                 {
112                     X [k] = Bz  [Q [k]] ;
113                 }
114                 break ;
115 
116             case 2:
117 
118                 for (k = 0 ; k < n ; k++)
119                 {
120                     i = Q [k] ;
121                     X [2*k    ] = Bz [i      ] ;
122                     X [2*k + 1] = Bz [i + d  ] ;
123                 }
124                 break ;
125 
126             case 3:
127 
128                 for (k = 0 ; k < n ; k++)
129                 {
130                     i = Q [k] ;
131                     X [3*k    ] = Bz [i      ] ;
132                     X [3*k + 1] = Bz [i + d  ] ;
133                     X [3*k + 2] = Bz [i + d*2] ;
134                 }
135                 break ;
136 
137             case 4:
138 
139                 for (k = 0 ; k < n ; k++)
140                 {
141                     i = Q [k] ;
142                     X [4*k    ] = Bz [i      ] ;
143                     X [4*k + 1] = Bz [i + d  ] ;
144                     X [4*k + 2] = Bz [i + d*2] ;
145                     X [4*k + 3] = Bz [i + d*3] ;
146                 }
147                 break ;
148 
149         }
150 
151         /* ------------------------------------------------------------------ */
152         /* solve X = (L*U + Off)'\X */
153         /* ------------------------------------------------------------------ */
154 
155         for (block = 0 ; block < nblocks ; block++)
156         {
157 
158             /* -------------------------------------------------------------- */
159             /* the block of size nk is from rows/columns k1 to k2-1 */
160             /* -------------------------------------------------------------- */
161 
162             k1 = R [block] ;
163             k2 = R [block+1] ;
164             nk = k2 - k1 ;
165             PRINTF (("tsolve %d, k1 %d k2-1 %d nk %d\n", block, k1,k2-1,nk)) ;
166 
167             /* -------------------------------------------------------------- */
168             /* block back-substitution for the off-diagonal-block entries */
169             /* -------------------------------------------------------------- */
170 
171             if (block > 0)
172             {
173                 switch (nr)
174                     {
175 
176                     case 1:
177 
178                         for (k = k1 ; k < k2 ; k++)
179                         {
180                             pend = Offp [k+1] ;
181                             for (p = Offp [k] ; p < pend ; p++)
182                             {
183 #ifdef COMPLEX
184                                 if (conj_solve)
185                                 {
186                                     MULT_SUB_CONJ (X [k], X [Offi [p]],
187                                             Offx [p]) ;
188                                 }
189                                 else
190 #endif
191                                 {
192                                     MULT_SUB (X [k], Offx [p], X [Offi [p]]) ;
193                                 }
194                             }
195                         }
196                         break ;
197 
198                     case 2:
199 
200                         for (k = k1 ; k < k2 ; k++)
201                         {
202                             pend = Offp [k+1] ;
203                             x [0] = X [2*k    ] ;
204                             x [1] = X [2*k + 1] ;
205                             for (p = Offp [k] ; p < pend ; p++)
206                             {
207                                 i = Offi [p] ;
208 #ifdef COMPLEX
209                                 if (conj_solve)
210                                 {
211                                     CONJ (offik, Offx [p]) ;
212                                 }
213                                 else
214 #endif
215                                 {
216                                     offik = Offx [p] ;
217                                 }
218                                 MULT_SUB (x [0], offik, X [2*i]) ;
219                                 MULT_SUB (x [1], offik, X [2*i + 1]) ;
220                             }
221                             X [2*k    ] = x [0] ;
222                             X [2*k + 1] = x [1] ;
223                         }
224                         break ;
225 
226                     case 3:
227 
228                         for (k = k1 ; k < k2 ; k++)
229                         {
230                             pend = Offp [k+1] ;
231                             x [0] = X [3*k    ] ;
232                             x [1] = X [3*k + 1] ;
233                             x [2] = X [3*k + 2] ;
234                             for (p = Offp [k] ; p < pend ; p++)
235                             {
236                                 i = Offi [p] ;
237 #ifdef COMPLEX
238                                 if (conj_solve)
239                                 {
240                                     CONJ (offik, Offx [p]) ;
241                                 }
242                                 else
243 #endif
244                                 {
245                                     offik = Offx [p] ;
246                                 }
247                                 MULT_SUB (x [0], offik, X [3*i]) ;
248                                 MULT_SUB (x [1], offik, X [3*i + 1]) ;
249                                 MULT_SUB (x [2], offik, X [3*i + 2]) ;
250                             }
251                             X [3*k    ] = x [0] ;
252                             X [3*k + 1] = x [1] ;
253                             X [3*k + 2] = x [2] ;
254                         }
255                         break ;
256 
257                     case 4:
258 
259                         for (k = k1 ; k < k2 ; k++)
260                         {
261                             pend = Offp [k+1] ;
262                             x [0] = X [4*k    ] ;
263                             x [1] = X [4*k + 1] ;
264                             x [2] = X [4*k + 2] ;
265                             x [3] = X [4*k + 3] ;
266                             for (p = Offp [k] ; p < pend ; p++)
267                             {
268                                 i = Offi [p] ;
269 #ifdef COMPLEX
270                                 if (conj_solve)
271                                 {
272                                     CONJ(offik, Offx [p]) ;
273                                 }
274                                 else
275 #endif
276                                 {
277                                     offik = Offx [p] ;
278                                 }
279                                 MULT_SUB (x [0], offik, X [4*i]) ;
280                                 MULT_SUB (x [1], offik, X [4*i + 1]) ;
281                                 MULT_SUB (x [2], offik, X [4*i + 2]) ;
282                                 MULT_SUB (x [3], offik, X [4*i + 3]) ;
283                             }
284                             X [4*k    ] = x [0] ;
285                             X [4*k + 1] = x [1] ;
286                             X [4*k + 2] = x [2] ;
287                             X [4*k + 3] = x [3] ;
288                         }
289                         break ;
290                     }
291             }
292 
293             /* -------------------------------------------------------------- */
294             /* solve the block system */
295             /* -------------------------------------------------------------- */
296 
297             if (nk == 1)
298             {
299 #ifdef COMPLEX
300                 if (conj_solve)
301                 {
302                     CONJ (s, Udiag [k1]) ;
303                 }
304                 else
305 #endif
306                 {
307                     s = Udiag [k1] ;
308                 }
309                 switch (nr)
310                 {
311 
312                     case 1:
313                         DIV (X [k1], X [k1], s) ;
314                         break ;
315 
316                     case 2:
317                         DIV (X [2*k1], X [2*k1], s) ;
318                         DIV (X [2*k1 + 1], X [2*k1 + 1], s) ;
319                         break ;
320 
321                     case 3:
322                         DIV (X [3*k1], X [3*k1], s) ;
323                         DIV (X [3*k1 + 1], X [3*k1 + 1], s) ;
324                         DIV (X [3*k1 + 2], X [3*k1 + 2], s) ;
325                         break ;
326 
327                     case 4:
328                         DIV (X [4*k1], X [4*k1], s) ;
329                         DIV (X [4*k1 + 1], X [4*k1 + 1], s) ;
330                         DIV (X [4*k1 + 2], X [4*k1 + 2], s) ;
331                         DIV (X [4*k1 + 3], X [4*k1 + 3], s) ;
332                         break ;
333 
334                 }
335             }
336             else
337             {
338                 KLU_utsolve (nk, Uip + k1, Ulen + k1, LUbx [block],
339                         Udiag + k1, nr,
340 #ifdef COMPLEX
341                         conj_solve,
342 #endif
343                         X + nr*k1) ;
344                 KLU_ltsolve (nk, Lip + k1, Llen + k1, LUbx [block], nr,
345 #ifdef COMPLEX
346                         conj_solve,
347 #endif
348                         X + nr*k1) ;
349             }
350         }
351 
352         /* ------------------------------------------------------------------ */
353         /* scale and permute the result, Bz  = P'(R\X) */
354         /* ------------------------------------------------------------------ */
355 
356         if (Rs == NULL)
357         {
358 
359             /* no scaling */
360             switch (nr)
361             {
362 
363                 case 1:
364 
365                     for (k = 0 ; k < n ; k++)
366                     {
367                         Bz  [Pnum [k]] = X [k] ;
368                     }
369                     break ;
370 
371                 case 2:
372 
373                     for (k = 0 ; k < n ; k++)
374                     {
375                         i = Pnum [k] ;
376                         Bz  [i      ] = X [2*k    ] ;
377                         Bz  [i + d  ] = X [2*k + 1] ;
378                     }
379                     break ;
380 
381                 case 3:
382 
383                     for (k = 0 ; k < n ; k++)
384                     {
385                         i = Pnum [k] ;
386                         Bz  [i      ] = X [3*k    ] ;
387                         Bz  [i + d  ] = X [3*k + 1] ;
388                         Bz  [i + d*2] = X [3*k + 2] ;
389                     }
390                     break ;
391 
392                 case 4:
393 
394                     for (k = 0 ; k < n ; k++)
395                     {
396                         i = Pnum [k] ;
397                         Bz  [i      ] = X [4*k    ] ;
398                         Bz  [i + d  ] = X [4*k + 1] ;
399                         Bz  [i + d*2] = X [4*k + 2] ;
400                         Bz  [i + d*3] = X [4*k + 3] ;
401                     }
402                     break ;
403             }
404 
405         }
406         else
407         {
408 
409             switch (nr)
410             {
411 
412                 case 1:
413 
414                     for (k = 0 ; k < n ; k++)
415                     {
416                         SCALE_DIV_ASSIGN (Bz [Pnum [k]], X [k], Rs [k]) ;
417                     }
418                     break ;
419 
420                 case 2:
421 
422                     for (k = 0 ; k < n ; k++)
423                     {
424                         i = Pnum [k] ;
425                         rs = Rs [k] ;
426                         SCALE_DIV_ASSIGN (Bz [i], X [2*k], rs) ;
427                         SCALE_DIV_ASSIGN (Bz [i + d], X [2*k + 1], rs) ;
428                     }
429                     break ;
430 
431                 case 3:
432 
433                     for (k = 0 ; k < n ; k++)
434                     {
435                         i = Pnum [k] ;
436                         rs = Rs [k] ;
437                         SCALE_DIV_ASSIGN (Bz [i], X [3*k], rs) ;
438                         SCALE_DIV_ASSIGN (Bz [i + d], X [3*k + 1], rs) ;
439                         SCALE_DIV_ASSIGN (Bz [i + d*2], X [3*k + 2], rs) ;
440                     }
441                     break ;
442 
443                 case 4:
444 
445                     for (k = 0 ; k < n ; k++)
446                     {
447                         i = Pnum [k] ;
448                         rs = Rs [k] ;
449                         SCALE_DIV_ASSIGN (Bz [i], X [4*k], rs) ;
450                         SCALE_DIV_ASSIGN (Bz [i + d], X [4*k + 1], rs) ;
451                         SCALE_DIV_ASSIGN (Bz [i + d*2], X [4*k + 2], rs) ;
452                         SCALE_DIV_ASSIGN (Bz [i + d*3], X [4*k + 3], rs) ;
453                     }
454                     break ;
455             }
456         }
457 
458         /* ------------------------------------------------------------------ */
459         /* go to the next chunk of B */
460         /* ------------------------------------------------------------------ */
461 
462         Bz  += d*4 ;
463     }
464     return (TRUE) ;
465 }
466