1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
FLA_Bidiag_UT_u_ofu_var2(FLA_Obj A,FLA_Obj TU,FLA_Obj TV)13 FLA_Error FLA_Bidiag_UT_u_ofu_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV )
14 {
15 return FLA_Bidiag_UT_u_step_ofu_var2( A, TU, TV );
16 }
17
FLA_Bidiag_UT_u_step_ofu_var2(FLA_Obj A,FLA_Obj T,FLA_Obj S)18 FLA_Error FLA_Bidiag_UT_u_step_ofu_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S )
19 {
20 FLA_Datatype datatype;
21 int m_A, n_A, m_TS;
22 int rs_A, cs_A;
23 int rs_T, cs_T;
24 int rs_S, cs_S;
25
26 datatype = FLA_Obj_datatype( A );
27
28 m_A = FLA_Obj_length( A );
29 n_A = FLA_Obj_width( A );
30 m_TS = FLA_Obj_length( T );
31
32 rs_A = FLA_Obj_row_stride( A );
33 cs_A = FLA_Obj_col_stride( A );
34
35 rs_T = FLA_Obj_row_stride( T );
36 cs_T = FLA_Obj_col_stride( T );
37
38 rs_S = FLA_Obj_row_stride( S );
39 cs_S = FLA_Obj_col_stride( S );
40
41
42 switch ( datatype )
43 {
44 case FLA_FLOAT:
45 {
46 float* buff_A = FLA_FLOAT_PTR( A );
47 float* buff_T = FLA_FLOAT_PTR( T );
48 float* buff_S = FLA_FLOAT_PTR( S );
49
50 FLA_Bidiag_UT_u_step_ofs_var2( m_A,
51 n_A,
52 m_TS,
53 buff_A, rs_A, cs_A,
54 buff_T, rs_T, cs_T,
55 buff_S, rs_S, cs_S );
56
57 break;
58 }
59
60 case FLA_DOUBLE:
61 {
62 double* buff_A = FLA_DOUBLE_PTR( A );
63 double* buff_T = FLA_DOUBLE_PTR( T );
64 double* buff_S = FLA_DOUBLE_PTR( S );
65
66 FLA_Bidiag_UT_u_step_ofd_var2( m_A,
67 n_A,
68 m_TS,
69 buff_A, rs_A, cs_A,
70 buff_T, rs_T, cs_T,
71 buff_S, rs_S, cs_S );
72
73 break;
74 }
75
76 case FLA_COMPLEX:
77 {
78 scomplex* buff_A = FLA_COMPLEX_PTR( A );
79 scomplex* buff_T = FLA_COMPLEX_PTR( T );
80 scomplex* buff_S = FLA_COMPLEX_PTR( S );
81
82 FLA_Bidiag_UT_u_step_ofc_var2( m_A,
83 n_A,
84 m_TS,
85 buff_A, rs_A, cs_A,
86 buff_T, rs_T, cs_T,
87 buff_S, rs_S, cs_S );
88
89 break;
90 }
91
92 case FLA_DOUBLE_COMPLEX:
93 {
94 dcomplex* buff_A = FLA_DOUBLE_COMPLEX_PTR( A );
95 dcomplex* buff_T = FLA_DOUBLE_COMPLEX_PTR( T );
96 dcomplex* buff_S = FLA_DOUBLE_COMPLEX_PTR( S );
97
98 FLA_Bidiag_UT_u_step_ofz_var2( m_A,
99 n_A,
100 m_TS,
101 buff_A, rs_A, cs_A,
102 buff_T, rs_T, cs_T,
103 buff_S, rs_S, cs_S );
104
105 break;
106 }
107 }
108
109 return FLA_SUCCESS;
110 }
111
112
113
FLA_Bidiag_UT_u_step_ofs_var2(int m_A,int n_A,int m_TS,float * buff_A,int rs_A,int cs_A,float * buff_T,int rs_T,int cs_T,float * buff_S,int rs_S,int cs_S)114 FLA_Error FLA_Bidiag_UT_u_step_ofs_var2( int m_A,
115 int n_A,
116 int m_TS,
117 float* buff_A, int rs_A, int cs_A,
118 float* buff_T, int rs_T, int cs_T,
119 float* buff_S, int rs_S, int cs_S )
120 {
121 float* buff_1 = FLA_FLOAT_PTR( FLA_ONE );
122 float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO );
123 float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );
124
125 float beta;
126 int i;
127
128 // b_alg = FLA_Obj_length( T );
129 int b_alg = m_TS;
130
131 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
132 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
133 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
134 float* buff_v = ( float* ) FLA_malloc( n_A * sizeof( *buff_A ) );
135 float* buff_y = ( float* ) FLA_malloc( n_A * sizeof( *buff_A ) );
136 float* buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
137 int inc_v = 1;
138 int inc_y = 1;
139 int inc_z = 1;
140
141 for ( i = 0; i < b_alg; ++i )
142 {
143 float* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
144 float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
145 float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
146 float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
147 float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
148 float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
149 float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
150
151 float* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
152 float* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
153
154 float* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
155 float* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
156
157 float* v21 = buff_v + (i+1)*inc_v;
158
159 float* y21 = buff_y + (i+1)*inc_y;
160
161 float* z21 = buff_z + (i+1)*inc_z;
162
163 float* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
164 float* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
165
166 float* v21_t = v21 + (0 )*inc_v;
167 float* v21_b = v21 + (1 )*inc_v;
168
169 int m_ahead = m_A - i - 1;
170 int n_ahead = n_A - i - 1;
171 int m_behind = i;
172 int n_behind = i;
173
174 /*------------------------------------------------------------*/
175
176 // FLA_Househ2_UT( FLA_LEFT,
177 // alpha11,
178 // a21, tau11 );
179 FLA_Househ2_UT_l_ops( m_ahead,
180 alpha11,
181 a21, rs_A,
182 tau11 );
183
184 if ( n_ahead > 0 )
185 {
186 // FLA_Copyt( FLA_TRANSPOSE, a12t, y21 );
187 // FLA_Gemvc( FLA_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
188 bl1_scopyv( BLIS1_NO_CONJUGATE,
189 n_ahead,
190 a12t, cs_A,
191 y21, inc_y );
192 bl1_sgemv( BLIS1_TRANSPOSE,
193 BLIS1_CONJUGATE,
194 m_ahead,
195 n_ahead,
196 buff_1,
197 A22, rs_A, cs_A,
198 a21, rs_A,
199 buff_1,
200 y21, inc_y );
201
202 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
203 bl1_sinvscalv( BLIS1_NO_CONJUGATE,
204 n_ahead,
205 tau11,
206 y21, inc_y );
207
208 // FLA_Axpyt( FLA_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
209 bl1_saxpyv( BLIS1_NO_CONJUGATE,
210 n_ahead,
211 buff_m1,
212 y21, inc_y,
213 a12t, cs_A );
214
215 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
216 FLA_Househ2_UT_r_ops( n_ahead - 1,
217 a12t_l,
218 a12t_r, cs_A,
219 sigma11 );
220
221 // FLA_Set( FLA_ONE, v21_t );
222 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
223 *v21_t = *buff_1;
224 bl1_scopyv( BLIS1_NO_CONJUGATE,
225 n_ahead - 1,
226 a12t_r, cs_A,
227 v21_b, inc_y );
228
229 // FLA_Dotc( FLA_CONJUGATE, v21, y21, beta );
230 // FLA_Scal( FLA_MINUS_ONE, beta );
231 bl1_sdot( BLIS1_CONJUGATE,
232 n_ahead,
233 v21, inc_v,
234 y21, inc_y,
235 &beta );
236 bl1_sneg1( &beta );
237
238 // FLA_Copy( a21, z21 );
239 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
240 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
241 bl1_scopyv( BLIS1_NO_CONJUGATE,
242 m_ahead,
243 a21, rs_A,
244 z21, inc_z );
245 bl1_sgemv( BLIS1_NO_TRANSPOSE,
246 BLIS1_CONJUGATE,
247 m_ahead,
248 n_ahead,
249 buff_1,
250 A22, rs_A, cs_A,
251 v21, inc_v,
252 &beta,
253 z21, inc_z );
254 bl1_sinvscalv( BLIS1_NO_CONJUGATE,
255 m_ahead,
256 sigma11,
257 z21, inc_z );
258
259 // FLA_Ger( FLA_MINUS_ONE, a21, y21, A22 );
260 // FLA_Ger( FLA_MINUS_ONE, z21, v21, A22 );
261 FLA_Fused_Gerc2_ops_var1( m_ahead,
262 n_ahead,
263 buff_m1,
264 a21, rs_A,
265 y21, inc_y,
266 z21, inc_z,
267 v21, inc_v,
268 A22, rs_A, cs_A );
269
270 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
271 bl1_sgemv( BLIS1_NO_TRANSPOSE,
272 BLIS1_CONJUGATE,
273 m_behind,
274 n_ahead,
275 buff_1,
276 A02, rs_A, cs_A,
277 v21, inc_v,
278 buff_0,
279 s01, rs_S );
280 }
281
282 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
283 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
284 bl1_scopyv( BLIS1_CONJUGATE,
285 n_behind,
286 a10t, cs_A,
287 t01, rs_T );
288 bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
289 BLIS1_NO_CONJUGATE,
290 m_ahead,
291 n_behind,
292 buff_1,
293 A20, rs_A, cs_A,
294 a21, rs_A,
295 buff_1,
296 t01, rs_T );
297
298 /*------------------------------------------------------------*/
299
300 }
301
302 // FLA_Obj_free( &v );
303 // FLA_Obj_free( &y );
304 // FLA_Obj_free( &z );
305 FLA_free( buff_v );
306 FLA_free( buff_y );
307 FLA_free( buff_z );
308
309 return FLA_SUCCESS;
310 }
311
312
313
FLA_Bidiag_UT_u_step_ofd_var2(int m_A,int n_A,int m_TS,double * buff_A,int rs_A,int cs_A,double * buff_T,int rs_T,int cs_T,double * buff_S,int rs_S,int cs_S)314 FLA_Error FLA_Bidiag_UT_u_step_ofd_var2( int m_A,
315 int n_A,
316 int m_TS,
317 double* buff_A, int rs_A, int cs_A,
318 double* buff_T, int rs_T, int cs_T,
319 double* buff_S, int rs_S, int cs_S )
320 {
321 double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE );
322 double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO );
323 double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );
324
325 double beta;
326 int i;
327
328 // b_alg = FLA_Obj_length( T );
329 int b_alg = m_TS;
330
331 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
332 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
333 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
334 double* buff_v = ( double* ) FLA_malloc( n_A * sizeof( *buff_A ) );
335 double* buff_y = ( double* ) FLA_malloc( n_A * sizeof( *buff_A ) );
336 double* buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
337 int inc_v = 1;
338 int inc_y = 1;
339 int inc_z = 1;
340
341 for ( i = 0; i < b_alg; ++i )
342 {
343 double* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
344 double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
345 double* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
346 double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
347 double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
348 double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
349 double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
350
351 double* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
352 double* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
353
354 double* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
355 double* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
356
357 double* v21 = buff_v + (i+1)*inc_v;
358
359 double* y21 = buff_y + (i+1)*inc_y;
360
361 double* z21 = buff_z + (i+1)*inc_z;
362
363 double* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
364 double* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
365
366 double* v21_t = v21 + (0 )*inc_v;
367 double* v21_b = v21 + (1 )*inc_v;
368
369 int m_ahead = m_A - i - 1;
370 int n_ahead = n_A - i - 1;
371 int m_behind = i;
372 int n_behind = i;
373
374 /*------------------------------------------------------------*/
375
376 // FLA_Househ2_UT( FLA_LEFT,
377 // alpha11,
378 // a21, tau11 );
379 FLA_Househ2_UT_l_opd( m_ahead,
380 alpha11,
381 a21, rs_A,
382 tau11 );
383
384 if ( n_ahead > 0 )
385 {
386 // FLA_Copyt( FLA_TRANSPOSE, a12t, y21 );
387 // FLA_Gemvc( FLA_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
388 bl1_dcopyv( BLIS1_NO_CONJUGATE,
389 n_ahead,
390 a12t, cs_A,
391 y21, inc_y );
392 bl1_dgemv( BLIS1_TRANSPOSE,
393 BLIS1_CONJUGATE,
394 m_ahead,
395 n_ahead,
396 buff_1,
397 A22, rs_A, cs_A,
398 a21, rs_A,
399 buff_1,
400 y21, inc_y );
401
402 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
403 bl1_dinvscalv( BLIS1_NO_CONJUGATE,
404 n_ahead,
405 tau11,
406 y21, inc_y );
407
408 // FLA_Axpyt( FLA_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
409 bl1_daxpyv( BLIS1_NO_CONJUGATE,
410 n_ahead,
411 buff_m1,
412 y21, inc_y,
413 a12t, cs_A );
414
415 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
416 FLA_Househ2_UT_r_opd( n_ahead - 1,
417 a12t_l,
418 a12t_r, cs_A,
419 sigma11 );
420
421 // FLA_Set( FLA_ONE, v21_t );
422 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
423 *v21_t = *buff_1;
424 bl1_dcopyv( BLIS1_NO_CONJUGATE,
425 n_ahead - 1,
426 a12t_r, cs_A,
427 v21_b, inc_y );
428
429 // FLA_Dotc( FLA_CONJUGATE, v21, y21, beta );
430 // FLA_Scal( FLA_MINUS_ONE, beta );
431 bl1_ddot( BLIS1_CONJUGATE,
432 n_ahead,
433 v21, inc_v,
434 y21, inc_y,
435 &beta );
436 bl1_dneg1( &beta );
437
438 // FLA_Copy( a21, z21 );
439 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
440 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
441 bl1_dcopyv( BLIS1_NO_CONJUGATE,
442 m_ahead,
443 a21, rs_A,
444 z21, inc_z );
445 bl1_dgemv( BLIS1_NO_TRANSPOSE,
446 BLIS1_CONJUGATE,
447 m_ahead,
448 n_ahead,
449 buff_1,
450 A22, rs_A, cs_A,
451 v21, inc_v,
452 &beta,
453 z21, inc_z );
454 bl1_dinvscalv( BLIS1_NO_CONJUGATE,
455 m_ahead,
456 sigma11,
457 z21, inc_z );
458
459 // FLA_Ger( FLA_MINUS_ONE, a21, y21, A22 );
460 // FLA_Ger( FLA_MINUS_ONE, z21, v21, A22 );
461 FLA_Fused_Gerc2_opd_var1( m_ahead,
462 n_ahead,
463 buff_m1,
464 a21, rs_A,
465 y21, inc_y,
466 z21, inc_z,
467 v21, inc_v,
468 A22, rs_A, cs_A );
469
470 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
471 bl1_dgemv( BLIS1_NO_TRANSPOSE,
472 BLIS1_CONJUGATE,
473 m_behind,
474 n_ahead,
475 buff_1,
476 A02, rs_A, cs_A,
477 v21, inc_v,
478 buff_0,
479 s01, rs_S );
480 }
481
482 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
483 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
484 bl1_dcopyv( BLIS1_CONJUGATE,
485 n_behind,
486 a10t, cs_A,
487 t01, rs_T );
488 bl1_dgemv( BLIS1_CONJ_TRANSPOSE,
489 BLIS1_NO_CONJUGATE,
490 m_ahead,
491 n_behind,
492 buff_1,
493 A20, rs_A, cs_A,
494 a21, rs_A,
495 buff_1,
496 t01, rs_T );
497
498 /*------------------------------------------------------------*/
499
500 }
501
502 // FLA_Obj_free( &v );
503 // FLA_Obj_free( &y );
504 // FLA_Obj_free( &z );
505 FLA_free( buff_v );
506 FLA_free( buff_y );
507 FLA_free( buff_z );
508
509 return FLA_SUCCESS;
510 }
511
512
513
FLA_Bidiag_UT_u_step_ofc_var2(int m_A,int n_A,int m_TS,scomplex * buff_A,int rs_A,int cs_A,scomplex * buff_T,int rs_T,int cs_T,scomplex * buff_S,int rs_S,int cs_S)514 FLA_Error FLA_Bidiag_UT_u_step_ofc_var2( int m_A,
515 int n_A,
516 int m_TS,
517 scomplex* buff_A, int rs_A, int cs_A,
518 scomplex* buff_T, int rs_T, int cs_T,
519 scomplex* buff_S, int rs_S, int cs_S )
520 {
521 scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE );
522 scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO );
523 scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );
524
525 scomplex beta;
526 int i;
527
528 // b_alg = FLA_Obj_length( T );
529 int b_alg = m_TS;
530
531 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
532 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
533 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
534 scomplex* buff_v = ( scomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
535 scomplex* buff_y = ( scomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
536 scomplex* buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
537 int inc_v = 1;
538 int inc_y = 1;
539 int inc_z = 1;
540
541 for ( i = 0; i < b_alg; ++i )
542 {
543 scomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
544 scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
545 scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
546 scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
547 scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
548 scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
549 scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
550
551 scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
552 scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
553
554 scomplex* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
555 scomplex* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
556
557 scomplex* v21 = buff_v + (i+1)*inc_v;
558
559 scomplex* y21 = buff_y + (i+1)*inc_y;
560
561 scomplex* z21 = buff_z + (i+1)*inc_z;
562
563 scomplex* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
564 scomplex* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
565
566 scomplex* v21_t = v21 + (0 )*inc_v;
567 scomplex* v21_b = v21 + (1 )*inc_v;
568
569 int m_ahead = m_A - i - 1;
570 int n_ahead = n_A - i - 1;
571 int m_behind = i;
572 int n_behind = i;
573
574 /*------------------------------------------------------------*/
575
576 // FLA_Househ2_UT( FLA_LEFT,
577 // alpha11,
578 // a21, tau11 );
579 FLA_Househ2_UT_l_opc( m_ahead,
580 alpha11,
581 a21, rs_A,
582 tau11 );
583
584 if ( n_ahead > 0 )
585 {
586 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
587 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
588 bl1_ccopyv( BLIS1_CONJUGATE,
589 n_ahead,
590 a12t, cs_A,
591 y21, inc_y );
592 bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
593 BLIS1_NO_CONJUGATE,
594 m_ahead,
595 n_ahead,
596 buff_1,
597 A22, rs_A, cs_A,
598 a21, rs_A,
599 buff_1,
600 y21, inc_y );
601
602 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
603 bl1_cinvscalv( BLIS1_NO_CONJUGATE,
604 n_ahead,
605 tau11,
606 y21, inc_y );
607
608 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
609 bl1_caxpyv( BLIS1_CONJUGATE,
610 n_ahead,
611 buff_m1,
612 y21, inc_y,
613 a12t, cs_A );
614
615 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
616 FLA_Househ2_UT_r_opc( n_ahead - 1,
617 a12t_l,
618 a12t_r, cs_A,
619 sigma11 );
620
621 // FLA_Set( FLA_ONE, v21_t );
622 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
623 *v21_t = *buff_1;
624 bl1_ccopyv( BLIS1_NO_CONJUGATE,
625 n_ahead - 1,
626 a12t_r, cs_A,
627 v21_b, inc_y );
628
629 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
630 // FLA_Scal( FLA_MINUS_ONE, beta );
631 bl1_cdot( BLIS1_CONJUGATE,
632 n_ahead,
633 y21, inc_y,
634 v21, inc_v,
635 &beta );
636 bl1_cneg1( &beta );
637
638 // FLA_Copy( a21, z21 );
639 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
640 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
641 bl1_ccopyv( BLIS1_NO_CONJUGATE,
642 m_ahead,
643 a21, rs_A,
644 z21, inc_z );
645 bl1_cgemv( BLIS1_NO_TRANSPOSE,
646 BLIS1_NO_CONJUGATE,
647 m_ahead,
648 n_ahead,
649 buff_1,
650 A22, rs_A, cs_A,
651 v21, inc_v,
652 &beta,
653 z21, inc_z );
654 bl1_cinvscalv( BLIS1_NO_CONJUGATE,
655 m_ahead,
656 sigma11,
657 z21, inc_z );
658
659 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
660 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
661 FLA_Fused_Gerc2_opc_var1( m_ahead,
662 n_ahead,
663 buff_m1,
664 a21, rs_A,
665 y21, inc_y,
666 z21, inc_z,
667 v21, inc_v,
668 A22, rs_A, cs_A );
669
670 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
671 bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
672 BLIS1_NO_CONJUGATE,
673 m_behind,
674 n_ahead,
675 buff_1,
676 A02, rs_A, cs_A,
677 v21, inc_v,
678 buff_0,
679 s01, rs_S );
680 }
681
682 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
683 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
684 bl1_ccopyv( BLIS1_CONJUGATE,
685 n_behind,
686 a10t, cs_A,
687 t01, rs_T );
688 bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
689 BLIS1_NO_CONJUGATE,
690 m_ahead,
691 n_behind,
692 buff_1,
693 A20, rs_A, cs_A,
694 a21, rs_A,
695 buff_1,
696 t01, rs_T );
697
698 /*------------------------------------------------------------*/
699
700 }
701
702 // FLA_Obj_free( &v );
703 // FLA_Obj_free( &y );
704 // FLA_Obj_free( &z );
705 FLA_free( buff_v );
706 FLA_free( buff_y );
707 FLA_free( buff_z );
708
709 return FLA_SUCCESS;
710 }
711
712
713
FLA_Bidiag_UT_u_step_ofz_var2(int m_A,int n_A,int m_TS,dcomplex * buff_A,int rs_A,int cs_A,dcomplex * buff_T,int rs_T,int cs_T,dcomplex * buff_S,int rs_S,int cs_S)714 FLA_Error FLA_Bidiag_UT_u_step_ofz_var2( int m_A,
715 int n_A,
716 int m_TS,
717 dcomplex* buff_A, int rs_A, int cs_A,
718 dcomplex* buff_T, int rs_T, int cs_T,
719 dcomplex* buff_S, int rs_S, int cs_S )
720 {
721 dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE );
722 dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO );
723 dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
724
725 dcomplex beta;
726 int i;
727
728 // b_alg = FLA_Obj_length( T );
729 int b_alg = m_TS;
730
731 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
732 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
733 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
734 dcomplex* buff_v = ( dcomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
735 dcomplex* buff_y = ( dcomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
736 dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
737 int inc_v = 1;
738 int inc_y = 1;
739 int inc_z = 1;
740
741 for ( i = 0; i < b_alg; ++i )
742 {
743 dcomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
744 dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
745 dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
746 dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
747 dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
748 dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
749 dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
750
751 dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
752 dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
753
754 dcomplex* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
755 dcomplex* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
756
757 dcomplex* v21 = buff_v + (i+1)*inc_v;
758
759 dcomplex* y21 = buff_y + (i+1)*inc_y;
760
761 dcomplex* z21 = buff_z + (i+1)*inc_z;
762
763 dcomplex* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
764 dcomplex* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
765
766 dcomplex* v21_t = v21 + (0 )*inc_v;
767 dcomplex* v21_b = v21 + (1 )*inc_v;
768
769 int m_ahead = m_A - i - 1;
770 int n_ahead = n_A - i - 1;
771 int m_behind = i;
772 int n_behind = i;
773
774 /*------------------------------------------------------------*/
775
776 // FLA_Househ2_UT( FLA_LEFT,
777 // alpha11,
778 // a21, tau11 );
779 FLA_Househ2_UT_l_opz( m_ahead,
780 alpha11,
781 a21, rs_A,
782 tau11 );
783
784 if ( n_ahead > 0 )
785 {
786 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
787 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
788 bl1_zcopyv( BLIS1_CONJUGATE,
789 n_ahead,
790 a12t, cs_A,
791 y21, inc_y );
792 bl1_zgemv( BLIS1_CONJ_TRANSPOSE,
793 BLIS1_NO_CONJUGATE,
794 m_ahead,
795 n_ahead,
796 buff_1,
797 A22, rs_A, cs_A,
798 a21, rs_A,
799 buff_1,
800 y21, inc_y );
801
802 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
803 bl1_zinvscalv( BLIS1_NO_CONJUGATE,
804 n_ahead,
805 tau11,
806 y21, inc_y );
807
808 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
809 bl1_zaxpyv( BLIS1_CONJUGATE,
810 n_ahead,
811 buff_m1,
812 y21, inc_y,
813 a12t, cs_A );
814
815 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
816 FLA_Househ2_UT_r_opz( n_ahead - 1,
817 a12t_l,
818 a12t_r, cs_A,
819 sigma11 );
820
821 // FLA_Set( FLA_ONE, v21_t );
822 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
823 *v21_t = *buff_1;
824 bl1_zcopyv( BLIS1_NO_CONJUGATE,
825 n_ahead - 1,
826 a12t_r, cs_A,
827 v21_b, inc_y );
828
829 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
830 // FLA_Scal( FLA_MINUS_ONE, beta );
831 bl1_zdot( BLIS1_CONJUGATE,
832 n_ahead,
833 y21, inc_y,
834 v21, inc_v,
835 &beta );
836 bl1_zneg1( &beta );
837
838 // FLA_Copy( a21, z21 );
839 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
840 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
841 bl1_zcopyv( BLIS1_NO_CONJUGATE,
842 m_ahead,
843 a21, rs_A,
844 z21, inc_z );
845 bl1_zgemv( BLIS1_NO_TRANSPOSE,
846 BLIS1_NO_CONJUGATE,
847 m_ahead,
848 n_ahead,
849 buff_1,
850 A22, rs_A, cs_A,
851 v21, inc_v,
852 &beta,
853 z21, inc_z );
854 bl1_zinvscalv( BLIS1_NO_CONJUGATE,
855 m_ahead,
856 sigma11,
857 z21, inc_z );
858
859 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
860 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
861 FLA_Fused_Gerc2_opz_var1( m_ahead,
862 n_ahead,
863 buff_m1,
864 a21, rs_A,
865 y21, inc_y,
866 z21, inc_z,
867 v21, inc_v,
868 A22, rs_A, cs_A );
869
870 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
871 bl1_zgemv( BLIS1_CONJ_NO_TRANSPOSE,
872 BLIS1_NO_CONJUGATE,
873 m_behind,
874 n_ahead,
875 buff_1,
876 A02, rs_A, cs_A,
877 v21, inc_v,
878 buff_0,
879 s01, rs_S );
880 }
881
882 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
883 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
884 bl1_zcopyv( BLIS1_CONJUGATE,
885 n_behind,
886 a10t, cs_A,
887 t01, rs_T );
888 bl1_zgemv( BLIS1_CONJ_TRANSPOSE,
889 BLIS1_NO_CONJUGATE,
890 m_ahead,
891 n_behind,
892 buff_1,
893 A20, rs_A, cs_A,
894 a21, rs_A,
895 buff_1,
896 t01, rs_T );
897
898 /*------------------------------------------------------------*/
899
900 }
901
902 // FLA_Obj_free( &v );
903 // FLA_Obj_free( &y );
904 // FLA_Obj_free( &z );
905 FLA_free( buff_v );
906 FLA_free( buff_y );
907 FLA_free( buff_z );
908
909 return FLA_SUCCESS;
910 }
911
912