1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
FLA_Bidiag_UT_u_unb_var3(FLA_Obj A,FLA_Obj TU,FLA_Obj TV)13 FLA_Error FLA_Bidiag_UT_u_unb_var3( FLA_Obj A, FLA_Obj TU, FLA_Obj TV )
14 {
15 return FLA_Bidiag_UT_u_step_unb_var3( A, TU, TV );
16 }
17
FLA_Bidiag_UT_u_step_unb_var3(FLA_Obj A,FLA_Obj T,FLA_Obj S)18 FLA_Error FLA_Bidiag_UT_u_step_unb_var3( FLA_Obj A, FLA_Obj T, FLA_Obj S )
19 {
20 FLA_Obj ATL, ATR, A00, a01, A02,
21 ABL, ABR, a10t, alpha11, a12t,
22 A20, a21, A22;
23 FLA_Obj TTL, TTR, T00, t01, T02,
24 TBL, TBR, t10t, tau11, t12t,
25 T20, t21, T22;
26 FLA_Obj STL, STR, S00, s01, S02,
27 SBL, SBR, s10t, sigma11, s12t,
28 S20, s21, S22;
29 FLA_Obj wT, w01,
30 wB, omega11,
31 w21;
32 FLA_Obj apT, a01p,
33 apB, alpha11p,
34 a12p;
35 FLA_Obj uT, u01,
36 uB, upsilon11,
37 u21;
38 FLA_Obj uTp, u01p,
39 uBp, upsilon11p,
40 u21p;
41 FLA_Obj vT, v01,
42 vB, nu11,
43 v21;
44 FLA_Obj yT, y01,
45 yB, psi11,
46 y21;
47 FLA_Obj zT, z01,
48 zB, zeta11,
49 z21;
50 FLA_Obj w, ap, u, up, v, y, z;
51
52 FLA_Obj minus_inv_tau11;
53 FLA_Obj beta;
54 FLA_Obj alpha12;
55 FLA_Obj minus_conj_alpha12;
56 FLA_Obj psi11_minus_alpha12;
57 FLA_Obj minus_upsilon11;
58 FLA_Obj minus_conj_nu11;
59 FLA_Obj minus_conj_psi11;
60 FLA_Obj minus_zeta11;
61
62 FLA_Obj a12t_l, a12t_r;
63 FLA_Obj a12p_t,
64 a12p_b;
65 FLA_Obj A22_l, A22_r;
66 FLA_Obj v21_t,
67 v21_b;
68
69 FLA_Datatype datatype_A;
70 dim_t m_A, n_A;
71 dim_t b_alg;
72
73
74 b_alg = FLA_Obj_length( T );
75
76 datatype_A = FLA_Obj_datatype( A );
77 m_A = FLA_Obj_length( A );
78 n_A = FLA_Obj_width( A );
79
80 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 );
81 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta );
82 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &alpha12 );
83 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_conj_alpha12 );
84 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &psi11_minus_alpha12 );
85 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_upsilon11 );
86 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_conj_nu11 );
87 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_conj_psi11 );
88 FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_zeta11 );
89 FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
90 FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &ap );
91 FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u );
92 FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &up );
93 FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
94 FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
95 FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
96
97 FLA_Part_2x2( A, &ATL, &ATR,
98 &ABL, &ABR, 0, 0, FLA_TL );
99 FLA_Part_2x2( T, &TTL, &TTR,
100 &TBL, &TBR, 0, 0, FLA_TL );
101 FLA_Part_2x2( S, &STL, &STR,
102 &SBL, &SBR, 0, 0, FLA_TL );
103 FLA_Part_2x1( w, &wT,
104 &wB, 0, FLA_TOP );
105 FLA_Part_2x1( ap, &apT,
106 &apB, 0, FLA_TOP );
107 FLA_Part_2x1( u, &uT,
108 &uB, 0, FLA_TOP );
109 FLA_Part_2x1( up, &uTp,
110 &uBp, 0, FLA_TOP );
111 FLA_Part_2x1( v, &vT,
112 &vB, 0, FLA_TOP );
113 FLA_Part_2x1( y, &yT,
114 &yB, 0, FLA_TOP );
115 FLA_Part_2x1( z, &zT,
116 &zB, 0, FLA_TOP );
117
118 while ( FLA_Obj_length( ATL ) < b_alg )
119 {
120 FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02,
121 /* ************* */ /* ************************** */
122 &a10t, /**/ &alpha11, &a12t,
123 ABL, /**/ ABR, &A20, /**/ &a21, &A22,
124 1, 1, FLA_BR );
125 FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02,
126 /* ************* */ /* ************************** */
127 &t10t, /**/ &tau11, &t12t,
128 TBL, /**/ TBR, &T20, /**/ &t21, &T22,
129 1, 1, FLA_BR );
130 FLA_Repart_2x2_to_3x3( STL, /**/ STR, &S00, /**/ &s01, &S02,
131 /* ************* */ /* ************************** */
132 &s10t, /**/ &sigma11, &s12t,
133 SBL, /**/ SBR, &S20, /**/ &s21, &S22,
134 1, 1, FLA_BR );
135 FLA_Repart_2x1_to_3x1( wT, &w01,
136 /* ** */ /* ***** */
137 &omega11,
138 wB, &w21, 1, FLA_BOTTOM );
139 FLA_Repart_2x1_to_3x1( apT, &a01p,
140 /* ** */ /* ***** */
141 &alpha11p,
142 apB, &a12p, 1, FLA_BOTTOM );
143 FLA_Repart_2x1_to_3x1( uT, &u01,
144 /* ** */ /* ***** */
145 &upsilon11,
146 uB, &u21, 1, FLA_BOTTOM );
147 FLA_Repart_2x1_to_3x1( uTp, &u01p,
148 /* ** */ /* ***** */
149 &upsilon11p,
150 uBp, &u21p, 1, FLA_BOTTOM );
151 FLA_Repart_2x1_to_3x1( vT, &v01,
152 /* ** */ /* ***** */
153 &nu11,
154 vB, &v21, 1, FLA_BOTTOM );
155 FLA_Repart_2x1_to_3x1( yT, &y01,
156 /* ** */ /* ***** */
157 &psi11,
158 yB, &y21, 1, FLA_BOTTOM );
159 FLA_Repart_2x1_to_3x1( zT, &z01,
160 /* ** */ /* ***** */
161 &zeta11,
162 zB, &z21, 1, FLA_BOTTOM );
163
164 /*------------------------------------------------------------*/
165
166 if ( FLA_Obj_length( ATL ) > 0 )
167 {
168 FLA_Copy( upsilon11, minus_upsilon11 );
169 FLA_Scal( FLA_MINUS_ONE, minus_upsilon11 );
170
171 FLA_Copy( zeta11, minus_zeta11 );
172 FLA_Scal( FLA_MINUS_ONE, minus_zeta11 );
173
174 FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, psi11, minus_conj_psi11 );
175 FLA_Scal( FLA_MINUS_ONE, minus_conj_psi11 );
176
177 FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, nu11, minus_conj_nu11 );
178 FLA_Scal( FLA_MINUS_ONE, minus_conj_nu11 );
179
180 // alpha11 = alpha11 - upsilon11 * conj(psi11) - zeta11 * conj(nu1);
181 FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi11, upsilon11, alpha11 );
182 FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_nu11, zeta11, alpha11 );
183
184 // a21 = a21 - u21 * conj(psi11) - z21 * conj(nu11);
185 FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi11, u21, a21 );
186 FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_nu11, z21, a21 );
187
188 // a12t = a12t - upsilon11 * y21' - zeta11 * v21';
189 FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon11, y21, a12t );
190 FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta11, v21, a12t );
191 }
192
193 // [ alpha11, u21p, tau11 ] = House2( alpha11, a21 );
194 FLA_Househ2_UT( FLA_LEFT,
195 alpha11,
196 a21, tau11 );
197 FLA_Copy( a21, u21p );
198
199 if ( FLA_Obj_width( A22 ) > 0 )
200 {
201 // minus_inv_tau11 = - 1 / tau11;
202 FLA_Copy( FLA_MINUS_ONE, minus_inv_tau11 );
203 FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, minus_inv_tau11 );
204
205 // a12p = ( tau11 - 1 ) * a12t^T / tau11;
206 // = a12t^T - ( 1 / tau11 ) * a12t^T;
207 FLA_Copyt( FLA_TRANSPOSE, a12t, a12p );
208 FLA_Axpyt( FLA_TRANSPOSE, minus_inv_tau11, a12t, a12p );
209 }
210
211 if ( FLA_Obj_length( ATL ) > 0 )
212 {
213 // A22 = A22 - u21 * y21' - z21 * v21';
214 FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u21, y21, A22 );
215 FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
216 }
217
218 if ( FLA_Obj_width( A22 ) > 0 )
219 {
220 // y21 = A22' * u21p;
221 FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, u21p, FLA_ZERO, y21 );
222
223 // a12p = a12p - conj(y21) / tau11;
224 FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_inv_tau11, y21, a12p );
225
226 // w21 = A22 * conj(a12p);
227 FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A22, a12p, FLA_ZERO, w21 );
228
229 // y21 = y21 + conj(a12t)^T;
230 FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_ONE, a12t, y21 );
231
232 FLA_Part_1x2( a12t, &a12t_l, &a12t_r, 1, FLA_LEFT );
233 FLA_Part_2x1( v21, &v21_t,
234 &v21_b, 1, FLA_TOP );
235 FLA_Part_2x1( a12p, &a12p_t,
236 &a12p_b, 1, FLA_TOP );
237
238 // [ alpha12, psi11_minus_alpha12, sigma11 ] = House2s( a12p_t, a12p_b );
239 FLA_Househ2s_UT( FLA_RIGHT,
240 a12p_t,
241 a12p_b,
242 alpha12, psi11_minus_alpha12, sigma11 );
243
244 // v21 = conj( ( a12p - alpha12 * e0 ) / ( psi11 - alpha12 ) );
245 FLA_Copy( a12p, v21 );
246 FLA_Mult_add( FLA_MINUS_ONE, alpha12, v21_t );
247 FLA_Inv_scalc( FLA_NO_CONJUGATE, psi11_minus_alpha12, v21 );
248 FLA_Conjugate( v21_b );
249
250 // a12t_l = alpha12;
251 // a12t_r = v21_b^T;
252 FLA_Copyt( FLA_NO_TRANSPOSE, alpha12, a12t_l );
253 FLA_Copyt( FLA_TRANSPOSE, v21_b, a12t_r );
254 }
255
256 // u21 = u21p;
257 FLA_Copy( u21p, u21 );
258
259 if ( FLA_Obj_width( A22 ) > 0 )
260 {
261 // beta = - y21' * v21 / tau11;
262 FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
263 FLA_Scal( FLA_MINUS_ONE, beta );
264 FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, beta );
265
266 FLA_Part_1x2( A22, &A22_l, &A22_r, 1, FLA_LEFT );
267
268 // minus_conj_alpha12 = - conj(alpha12);
269 FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, alpha12, minus_conj_alpha12 );
270 FLA_Scal( FLA_MINUS_ONE, minus_conj_alpha12 );
271
272 // z21 = ( w21 - conj(alpha12) * A22 * e0 ) / conj(psi11 - alpha12) + beta * u21;
273 FLA_Copy( w21, z21 );
274 FLA_Axpy( minus_conj_alpha12, A22_l, z21 );
275 FLA_Inv_scalc( FLA_CONJUGATE, psi11_minus_alpha12, z21 );
276 FLA_Axpy( beta, u21, z21 );
277
278 // y21 = y21 / tau11;
279 // z21 = z21 / sigma11;
280 FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
281 FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
282
283 // s01 = conj(V02) * v21;
284 FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
285 }
286
287 // t01 = a10t' + U20' * u21;
288 FLA_Copyt( FLA_CONJ_TRANSPOSE, a10t, t01 );
289 FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, u21, FLA_ONE, t01 );
290
291 // Update A22 if this is the last iteration; this is needed when we're
292 // being called from the blocked routine so A22 is left in a valid state.
293 if ( FLA_Obj_length( ATL ) + 1 == b_alg &&
294 FLA_Obj_width( A22 ) > 0 )
295 {
296 // A22 = A22 - u21 * y21' - z21 * v21';
297 FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u21, y21, A22 );
298 FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
299 }
300
301 /*------------------------------------------------------------*/
302
303 FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02,
304 a10t, alpha11, /**/ a12t,
305 /* ************** */ /* ************************ */
306 &ABL, /**/ &ABR, A20, a21, /**/ A22,
307 FLA_TL );
308 FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02,
309 t10t, tau11, /**/ t12t,
310 /* ************** */ /* ************************ */
311 &TBL, /**/ &TBR, T20, t21, /**/ T22,
312 FLA_TL );
313 FLA_Cont_with_3x3_to_2x2( &STL, /**/ &STR, S00, s01, /**/ S02,
314 s10t, sigma11, /**/ s12t,
315 /* ************** */ /* ************************ */
316 &SBL, /**/ &SBR, S20, s21, /**/ S22,
317 FLA_TL );
318 FLA_Cont_with_3x1_to_2x1( &wT, w01,
319 omega11,
320 /* ** */ /* ***** */
321 &wB, w21, FLA_TOP );
322 FLA_Cont_with_3x1_to_2x1( &apT, a01p,
323 alpha11p,
324 /* ** */ /* ***** */
325 &apB, a12p, FLA_TOP );
326 FLA_Cont_with_3x1_to_2x1( &uT, u01,
327 upsilon11,
328 /* ** */ /* ***** */
329 &uB, u21, FLA_TOP );
330 FLA_Cont_with_3x1_to_2x1( &uTp, u01p,
331 upsilon11p,
332 /* ** */ /* ***** */
333 &uBp, u21p, FLA_TOP );
334 FLA_Cont_with_3x1_to_2x1( &vT, v01,
335 nu11,
336 /* ** */ /* ***** */
337 &vB, v21, FLA_TOP );
338 FLA_Cont_with_3x1_to_2x1( &yT, y01,
339 psi11,
340 /* ** */ /* ***** */
341 &yB, y21, FLA_TOP );
342 FLA_Cont_with_3x1_to_2x1( &zT, z01,
343 zeta11,
344 /* ** */ /* ***** */
345 &zB, z21, FLA_TOP );
346 }
347
348 FLA_Obj_free( &minus_inv_tau11 );
349 FLA_Obj_free( &beta );
350 FLA_Obj_free( &alpha12 );
351 FLA_Obj_free( &minus_conj_alpha12 );
352 FLA_Obj_free( &psi11_minus_alpha12 );
353 FLA_Obj_free( &minus_upsilon11 );
354 FLA_Obj_free( &minus_conj_nu11 );
355 FLA_Obj_free( &minus_conj_psi11 );
356 FLA_Obj_free( &minus_zeta11 );
357 FLA_Obj_free( &w );
358 FLA_Obj_free( &ap );
359 FLA_Obj_free( &u );
360 FLA_Obj_free( &up );
361 FLA_Obj_free( &v );
362 FLA_Obj_free( &y );
363 FLA_Obj_free( &z );
364
365 return FLA_SUCCESS;
366 }
367
368