1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
FLA_Bidiag_UT_u_opt_var2(FLA_Obj A,FLA_Obj TU,FLA_Obj TV)13 FLA_Error FLA_Bidiag_UT_u_opt_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV )
14 {
15 return FLA_Bidiag_UT_u_step_opt_var2( A, TU, TV );
16 }
17
FLA_Bidiag_UT_u_step_opt_var2(FLA_Obj A,FLA_Obj T,FLA_Obj S)18 FLA_Error FLA_Bidiag_UT_u_step_opt_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S )
19 {
20 FLA_Datatype datatype;
21 int m_A, n_A, m_TS;
22 int rs_A, cs_A;
23 int rs_T, cs_T;
24 int rs_S, cs_S;
25
26 datatype = FLA_Obj_datatype( A );
27
28 m_A = FLA_Obj_length( A );
29 n_A = FLA_Obj_width( A );
30 m_TS = FLA_Obj_length( T );
31
32 rs_A = FLA_Obj_row_stride( A );
33 cs_A = FLA_Obj_col_stride( A );
34
35 rs_T = FLA_Obj_row_stride( T );
36 cs_T = FLA_Obj_col_stride( T );
37
38 rs_S = FLA_Obj_row_stride( S );
39 cs_S = FLA_Obj_col_stride( S );
40
41
42 switch ( datatype )
43 {
44 case FLA_FLOAT:
45 {
46 float* buff_A = FLA_FLOAT_PTR( A );
47 float* buff_T = FLA_FLOAT_PTR( T );
48 float* buff_S = FLA_FLOAT_PTR( S );
49
50 FLA_Bidiag_UT_u_step_ops_var2( m_A,
51 n_A,
52 m_TS,
53 buff_A, rs_A, cs_A,
54 buff_T, rs_T, cs_T,
55 buff_S, rs_S, cs_S );
56
57 break;
58 }
59
60 case FLA_DOUBLE:
61 {
62 double* buff_A = FLA_DOUBLE_PTR( A );
63 double* buff_T = FLA_DOUBLE_PTR( T );
64 double* buff_S = FLA_DOUBLE_PTR( S );
65
66 FLA_Bidiag_UT_u_step_opd_var2( m_A,
67 n_A,
68 m_TS,
69 buff_A, rs_A, cs_A,
70 buff_T, rs_T, cs_T,
71 buff_S, rs_S, cs_S );
72
73 break;
74 }
75
76 case FLA_COMPLEX:
77 {
78 scomplex* buff_A = FLA_COMPLEX_PTR( A );
79 scomplex* buff_T = FLA_COMPLEX_PTR( T );
80 scomplex* buff_S = FLA_COMPLEX_PTR( S );
81
82 FLA_Bidiag_UT_u_step_opc_var2( m_A,
83 n_A,
84 m_TS,
85 buff_A, rs_A, cs_A,
86 buff_T, rs_T, cs_T,
87 buff_S, rs_S, cs_S );
88
89 break;
90 }
91
92 case FLA_DOUBLE_COMPLEX:
93 {
94 dcomplex* buff_A = FLA_DOUBLE_COMPLEX_PTR( A );
95 dcomplex* buff_T = FLA_DOUBLE_COMPLEX_PTR( T );
96 dcomplex* buff_S = FLA_DOUBLE_COMPLEX_PTR( S );
97
98 FLA_Bidiag_UT_u_step_opz_var2( m_A,
99 n_A,
100 m_TS,
101 buff_A, rs_A, cs_A,
102 buff_T, rs_T, cs_T,
103 buff_S, rs_S, cs_S );
104
105 break;
106 }
107 }
108
109 return FLA_SUCCESS;
110 }
111
112
113
FLA_Bidiag_UT_u_step_ops_var2(int m_A,int n_A,int m_TS,float * buff_A,int rs_A,int cs_A,float * buff_T,int rs_T,int cs_T,float * buff_S,int rs_S,int cs_S)114 FLA_Error FLA_Bidiag_UT_u_step_ops_var2( int m_A,
115 int n_A,
116 int m_TS,
117 float* buff_A, int rs_A, int cs_A,
118 float* buff_T, int rs_T, int cs_T,
119 float* buff_S, int rs_S, int cs_S )
120 {
121 float* buff_1 = FLA_FLOAT_PTR( FLA_ONE );
122 float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO );
123 float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );
124
125 float beta;
126 int i;
127
128 // b_alg = FLA_Obj_length( T );
129 int b_alg = m_TS;
130
131 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
132 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
133 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
134 float* buff_v = ( float* ) FLA_malloc( n_A * sizeof( *buff_A ) );
135 float* buff_y = ( float* ) FLA_malloc( n_A * sizeof( *buff_A ) );
136 float* buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
137 int inc_v = 1;
138 int inc_y = 1;
139 int inc_z = 1;
140
141 for ( i = 0; i < b_alg; ++i )
142 {
143 float* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
144 float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
145 float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
146 float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
147 float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
148 float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
149 float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
150
151 float* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
152 float* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
153
154 float* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
155 float* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
156
157 float* v21 = buff_v + (i+1)*inc_v;
158
159 float* y21 = buff_y + (i+1)*inc_y;
160
161 float* z21 = buff_z + (i+1)*inc_z;
162
163 float* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
164 float* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
165
166 float* v21_t = v21 + (0 )*inc_v;
167 float* v21_b = v21 + (1 )*inc_v;
168
169 int m_ahead = m_A - i - 1;
170 int n_ahead = n_A - i - 1;
171 int m_behind = i;
172 int n_behind = i;
173
174 /*------------------------------------------------------------*/
175
176 // FLA_Househ2_UT( FLA_LEFT,
177 // alpha11,
178 // a21, tau11 );
179 FLA_Househ2_UT_l_ops( m_ahead,
180 alpha11,
181 a21, rs_A,
182 tau11 );
183
184 if ( n_ahead > 0 )
185 {
186 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
187 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
188 bl1_scopyv( BLIS1_CONJUGATE,
189 n_ahead,
190 a12t, cs_A,
191 y21, inc_y );
192 bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
193 BLIS1_NO_CONJUGATE,
194 m_ahead,
195 n_ahead,
196 buff_1,
197 A22, rs_A, cs_A,
198 a21, rs_A,
199 buff_1,
200 y21, inc_y );
201
202 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
203 bl1_sinvscalv( BLIS1_NO_CONJUGATE,
204 n_ahead,
205 tau11,
206 y21, inc_y );
207
208 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
209 bl1_saxpyv( BLIS1_CONJUGATE,
210 n_ahead,
211 buff_m1,
212 y21, inc_y,
213 a12t, cs_A );
214
215 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
216 FLA_Househ2_UT_r_ops( n_ahead - 1,
217 a12t_l,
218 a12t_r, cs_A,
219 sigma11 );
220
221 // FLA_Set( FLA_ONE, v21_t );
222 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
223 *v21_t = *buff_1;
224 bl1_scopyv( BLIS1_NO_CONJUGATE,
225 n_ahead - 1,
226 a12t_r, cs_A,
227 v21_b, inc_y );
228
229 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
230 // FLA_Scal( FLA_MINUS_ONE, beta );
231 bl1_sdot( BLIS1_CONJUGATE,
232 n_ahead,
233 y21, inc_y,
234 v21, inc_v,
235 &beta );
236 bl1_sneg1( &beta );
237
238 // FLA_Copy( a21, z21 );
239 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
240 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
241 bl1_scopyv( BLIS1_NO_CONJUGATE,
242 m_ahead,
243 a21, rs_A,
244 z21, inc_z );
245 bl1_sgemv( BLIS1_NO_TRANSPOSE,
246 BLIS1_NO_CONJUGATE,
247 m_ahead,
248 n_ahead,
249 buff_1,
250 A22, rs_A, cs_A,
251 v21, inc_v,
252 &beta,
253 z21, inc_z );
254 bl1_sinvscalv( BLIS1_NO_CONJUGATE,
255 m_ahead,
256 sigma11,
257 z21, inc_z );
258
259 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
260 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
261 bl1_sger( BLIS1_NO_CONJUGATE,
262 BLIS1_CONJUGATE,
263 m_ahead,
264 n_ahead,
265 buff_m1,
266 a21, rs_A,
267 y21, inc_y,
268 A22, rs_A, cs_A );
269 bl1_sger( BLIS1_NO_CONJUGATE,
270 BLIS1_CONJUGATE,
271 m_ahead,
272 n_ahead,
273 buff_m1,
274 z21, inc_z,
275 v21, inc_v,
276 A22, rs_A, cs_A );
277
278 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
279 bl1_sgemv( BLIS1_CONJ_NO_TRANSPOSE,
280 BLIS1_NO_CONJUGATE,
281 m_behind,
282 n_ahead,
283 buff_1,
284 A02, rs_A, cs_A,
285 v21, inc_v,
286 buff_0,
287 s01, rs_S );
288 }
289
290 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
291 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
292 bl1_scopyv( BLIS1_CONJUGATE,
293 n_behind,
294 a10t, cs_A,
295 t01, rs_T );
296 bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
297 BLIS1_NO_CONJUGATE,
298 m_ahead,
299 n_behind,
300 buff_1,
301 A20, rs_A, cs_A,
302 a21, rs_A,
303 buff_1,
304 t01, rs_T );
305
306 /*------------------------------------------------------------*/
307
308 }
309
310 // FLA_Obj_free( &v );
311 // FLA_Obj_free( &y );
312 // FLA_Obj_free( &z );
313 FLA_free( buff_v );
314 FLA_free( buff_y );
315 FLA_free( buff_z );
316
317 return FLA_SUCCESS;
318 }
319
320
321
FLA_Bidiag_UT_u_step_opd_var2(int m_A,int n_A,int m_TS,double * buff_A,int rs_A,int cs_A,double * buff_T,int rs_T,int cs_T,double * buff_S,int rs_S,int cs_S)322 FLA_Error FLA_Bidiag_UT_u_step_opd_var2( int m_A,
323 int n_A,
324 int m_TS,
325 double* buff_A, int rs_A, int cs_A,
326 double* buff_T, int rs_T, int cs_T,
327 double* buff_S, int rs_S, int cs_S )
328 {
329 double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE );
330 double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO );
331 double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );
332
333 double beta;
334 int i;
335
336 // b_alg = FLA_Obj_length( T );
337 int b_alg = m_TS;
338
339 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
340 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
341 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
342 double* buff_v = ( double* ) FLA_malloc( n_A * sizeof( *buff_A ) );
343 double* buff_y = ( double* ) FLA_malloc( n_A * sizeof( *buff_A ) );
344 double* buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
345 int inc_v = 1;
346 int inc_y = 1;
347 int inc_z = 1;
348
349 for ( i = 0; i < b_alg; ++i )
350 {
351 double* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
352 double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
353 double* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
354 double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
355 double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
356 double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
357 double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
358
359 double* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
360 double* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
361
362 double* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
363 double* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
364
365 double* v21 = buff_v + (i+1)*inc_v;
366
367 double* y21 = buff_y + (i+1)*inc_y;
368
369 double* z21 = buff_z + (i+1)*inc_z;
370
371 double* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
372 double* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
373
374 double* v21_t = v21 + (0 )*inc_v;
375 double* v21_b = v21 + (1 )*inc_v;
376
377 int m_ahead = m_A - i - 1;
378 int n_ahead = n_A - i - 1;
379 int m_behind = i;
380 int n_behind = i;
381
382 /*------------------------------------------------------------*/
383
384 // FLA_Househ2_UT( FLA_LEFT,
385 // alpha11,
386 // a21, tau11 );
387 FLA_Househ2_UT_l_opd( m_ahead,
388 alpha11,
389 a21, rs_A,
390 tau11 );
391
392 if ( n_ahead > 0 )
393 {
394 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
395 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
396 bl1_dcopyv( BLIS1_CONJUGATE,
397 n_ahead,
398 a12t, cs_A,
399 y21, inc_y );
400 bl1_dgemv( BLIS1_CONJ_TRANSPOSE,
401 BLIS1_NO_CONJUGATE,
402 m_ahead,
403 n_ahead,
404 buff_1,
405 A22, rs_A, cs_A,
406 a21, rs_A,
407 buff_1,
408 y21, inc_y );
409
410 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
411 bl1_dinvscalv( BLIS1_NO_CONJUGATE,
412 n_ahead,
413 tau11,
414 y21, inc_y );
415
416 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
417 bl1_daxpyv( BLIS1_CONJUGATE,
418 n_ahead,
419 buff_m1,
420 y21, inc_y,
421 a12t, cs_A );
422
423 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
424 FLA_Househ2_UT_r_opd( n_ahead - 1,
425 a12t_l,
426 a12t_r, cs_A,
427 sigma11 );
428
429 // FLA_Set( FLA_ONE, v21_t );
430 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
431 *v21_t = *buff_1;
432 bl1_dcopyv( BLIS1_NO_CONJUGATE,
433 n_ahead - 1,
434 a12t_r, cs_A,
435 v21_b, inc_y );
436
437 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
438 // FLA_Scal( FLA_MINUS_ONE, beta );
439 bl1_ddot( BLIS1_CONJUGATE,
440 n_ahead,
441 y21, inc_y,
442 v21, inc_v,
443 &beta );
444 bl1_dneg1( &beta );
445
446 // FLA_Copy( a21, z21 );
447 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
448 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
449 bl1_dcopyv( BLIS1_NO_CONJUGATE,
450 m_ahead,
451 a21, rs_A,
452 z21, inc_z );
453 bl1_dgemv( BLIS1_NO_TRANSPOSE,
454 BLIS1_NO_CONJUGATE,
455 m_ahead,
456 n_ahead,
457 buff_1,
458 A22, rs_A, cs_A,
459 v21, inc_v,
460 &beta,
461 z21, inc_z );
462 bl1_dinvscalv( BLIS1_NO_CONJUGATE,
463 m_ahead,
464 sigma11,
465 z21, inc_z );
466
467 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
468 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
469 bl1_dger( BLIS1_NO_CONJUGATE,
470 BLIS1_CONJUGATE,
471 m_ahead,
472 n_ahead,
473 buff_m1,
474 a21, rs_A,
475 y21, inc_y,
476 A22, rs_A, cs_A );
477 bl1_dger( BLIS1_NO_CONJUGATE,
478 BLIS1_CONJUGATE,
479 m_ahead,
480 n_ahead,
481 buff_m1,
482 z21, inc_z,
483 v21, inc_v,
484 A22, rs_A, cs_A );
485
486 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
487 bl1_dgemv( BLIS1_CONJ_NO_TRANSPOSE,
488 BLIS1_NO_CONJUGATE,
489 m_behind,
490 n_ahead,
491 buff_1,
492 A02, rs_A, cs_A,
493 v21, inc_v,
494 buff_0,
495 s01, rs_S );
496 }
497
498 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
499 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
500 bl1_dcopyv( BLIS1_CONJUGATE,
501 n_behind,
502 a10t, cs_A,
503 t01, rs_T );
504 bl1_dgemv( BLIS1_CONJ_TRANSPOSE,
505 BLIS1_NO_CONJUGATE,
506 m_ahead,
507 n_behind,
508 buff_1,
509 A20, rs_A, cs_A,
510 a21, rs_A,
511 buff_1,
512 t01, rs_T );
513
514 /*------------------------------------------------------------*/
515
516 }
517
518 // FLA_Obj_free( &v );
519 // FLA_Obj_free( &y );
520 // FLA_Obj_free( &z );
521 FLA_free( buff_v );
522 FLA_free( buff_y );
523 FLA_free( buff_z );
524
525 return FLA_SUCCESS;
526 }
527
528
529
FLA_Bidiag_UT_u_step_opc_var2(int m_A,int n_A,int m_TS,scomplex * buff_A,int rs_A,int cs_A,scomplex * buff_T,int rs_T,int cs_T,scomplex * buff_S,int rs_S,int cs_S)530 FLA_Error FLA_Bidiag_UT_u_step_opc_var2( int m_A,
531 int n_A,
532 int m_TS,
533 scomplex* buff_A, int rs_A, int cs_A,
534 scomplex* buff_T, int rs_T, int cs_T,
535 scomplex* buff_S, int rs_S, int cs_S )
536 {
537 scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE );
538 scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO );
539 scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );
540
541 scomplex beta;
542 int i;
543
544 // b_alg = FLA_Obj_length( T );
545 int b_alg = m_TS;
546
547 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
548 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
549 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
550 scomplex* buff_v = ( scomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
551 scomplex* buff_y = ( scomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
552 scomplex* buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
553 int inc_v = 1;
554 int inc_y = 1;
555 int inc_z = 1;
556
557 for ( i = 0; i < b_alg; ++i )
558 {
559 scomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
560 scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
561 scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
562 scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
563 scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
564 scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
565 scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
566
567 scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
568 scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
569
570 scomplex* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
571 scomplex* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
572
573 scomplex* v21 = buff_v + (i+1)*inc_v;
574
575 scomplex* y21 = buff_y + (i+1)*inc_y;
576
577 scomplex* z21 = buff_z + (i+1)*inc_z;
578
579 scomplex* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
580 scomplex* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
581
582 scomplex* v21_t = v21 + (0 )*inc_v;
583 scomplex* v21_b = v21 + (1 )*inc_v;
584
585 int m_ahead = m_A - i - 1;
586 int n_ahead = n_A - i - 1;
587 int m_behind = i;
588 int n_behind = i;
589
590 /*------------------------------------------------------------*/
591
592 // FLA_Househ2_UT( FLA_LEFT,
593 // alpha11,
594 // a21, tau11 );
595 FLA_Househ2_UT_l_opc( m_ahead,
596 alpha11,
597 a21, rs_A,
598 tau11 );
599
600 if ( n_ahead > 0 )
601 {
602 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
603 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
604 bl1_ccopyv( BLIS1_CONJUGATE,
605 n_ahead,
606 a12t, cs_A,
607 y21, inc_y );
608 bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
609 BLIS1_NO_CONJUGATE,
610 m_ahead,
611 n_ahead,
612 buff_1,
613 A22, rs_A, cs_A,
614 a21, rs_A,
615 buff_1,
616 y21, inc_y );
617
618 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
619 bl1_cinvscalv( BLIS1_NO_CONJUGATE,
620 n_ahead,
621 tau11,
622 y21, inc_y );
623
624 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
625 bl1_caxpyv( BLIS1_CONJUGATE,
626 n_ahead,
627 buff_m1,
628 y21, inc_y,
629 a12t, cs_A );
630
631 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
632 FLA_Househ2_UT_r_opc( n_ahead - 1,
633 a12t_l,
634 a12t_r, cs_A,
635 sigma11 );
636
637 // FLA_Set( FLA_ONE, v21_t );
638 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
639 *v21_t = *buff_1;
640 bl1_ccopyv( BLIS1_NO_CONJUGATE,
641 n_ahead - 1,
642 a12t_r, cs_A,
643 v21_b, inc_y );
644
645 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
646 // FLA_Scal( FLA_MINUS_ONE, beta );
647 bl1_cdot( BLIS1_CONJUGATE,
648 n_ahead,
649 y21, inc_y,
650 v21, inc_v,
651 &beta );
652 bl1_cneg1( &beta );
653
654 // FLA_Copy( a21, z21 );
655 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
656 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
657 bl1_ccopyv( BLIS1_NO_CONJUGATE,
658 m_ahead,
659 a21, rs_A,
660 z21, inc_z );
661 bl1_cgemv( BLIS1_NO_TRANSPOSE,
662 BLIS1_NO_CONJUGATE,
663 m_ahead,
664 n_ahead,
665 buff_1,
666 A22, rs_A, cs_A,
667 v21, inc_v,
668 &beta,
669 z21, inc_z );
670 bl1_cinvscalv( BLIS1_NO_CONJUGATE,
671 m_ahead,
672 sigma11,
673 z21, inc_z );
674
675 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
676 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
677 bl1_cger( BLIS1_NO_CONJUGATE,
678 BLIS1_CONJUGATE,
679 m_ahead,
680 n_ahead,
681 buff_m1,
682 a21, rs_A,
683 y21, inc_y,
684 A22, rs_A, cs_A );
685 bl1_cger( BLIS1_NO_CONJUGATE,
686 BLIS1_CONJUGATE,
687 m_ahead,
688 n_ahead,
689 buff_m1,
690 z21, inc_z,
691 v21, inc_v,
692 A22, rs_A, cs_A );
693
694 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
695 bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
696 BLIS1_NO_CONJUGATE,
697 m_behind,
698 n_ahead,
699 buff_1,
700 A02, rs_A, cs_A,
701 v21, inc_v,
702 buff_0,
703 s01, rs_S );
704 }
705
706 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
707 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
708 bl1_ccopyv( BLIS1_CONJUGATE,
709 n_behind,
710 a10t, cs_A,
711 t01, rs_T );
712 bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
713 BLIS1_NO_CONJUGATE,
714 m_ahead,
715 n_behind,
716 buff_1,
717 A20, rs_A, cs_A,
718 a21, rs_A,
719 buff_1,
720 t01, rs_T );
721
722 /*------------------------------------------------------------*/
723
724 }
725
726 // FLA_Obj_free( &v );
727 // FLA_Obj_free( &y );
728 // FLA_Obj_free( &z );
729 FLA_free( buff_v );
730 FLA_free( buff_y );
731 FLA_free( buff_z );
732
733 return FLA_SUCCESS;
734 }
735
736
737
FLA_Bidiag_UT_u_step_opz_var2(int m_A,int n_A,int m_TS,dcomplex * buff_A,int rs_A,int cs_A,dcomplex * buff_T,int rs_T,int cs_T,dcomplex * buff_S,int rs_S,int cs_S)738 FLA_Error FLA_Bidiag_UT_u_step_opz_var2( int m_A,
739 int n_A,
740 int m_TS,
741 dcomplex* buff_A, int rs_A, int cs_A,
742 dcomplex* buff_T, int rs_T, int cs_T,
743 dcomplex* buff_S, int rs_S, int cs_S )
744 {
745 dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE );
746 dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO );
747 dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
748
749 dcomplex beta;
750 int i;
751
752 // b_alg = FLA_Obj_length( T );
753 int b_alg = m_TS;
754
755 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &v );
756 // FLA_Obj_create( datatype_A, n_A, 1, 0, 0, &y );
757 // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
758 dcomplex* buff_v = ( dcomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
759 dcomplex* buff_y = ( dcomplex* ) FLA_malloc( n_A * sizeof( *buff_A ) );
760 dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
761 int inc_v = 1;
762 int inc_y = 1;
763 int inc_z = 1;
764
765 for ( i = 0; i < b_alg; ++i )
766 {
767 dcomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A;
768 dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
769 dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
770 dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
771 dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
772 dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
773 dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
774
775 dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
776 dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
777
778 dcomplex* s01 = buff_S + (i )*cs_S + (0 )*rs_S;
779 dcomplex* sigma11 = buff_S + (i )*cs_S + (i )*rs_S;
780
781 dcomplex* v21 = buff_v + (i+1)*inc_v;
782
783 dcomplex* y21 = buff_y + (i+1)*inc_y;
784
785 dcomplex* z21 = buff_z + (i+1)*inc_z;
786
787 dcomplex* a12t_l = a12t + (0 )*cs_A + (0 )*rs_A;
788 dcomplex* a12t_r = a12t + (1 )*cs_A + (0 )*rs_A;
789
790 dcomplex* v21_t = v21 + (0 )*inc_v;
791 dcomplex* v21_b = v21 + (1 )*inc_v;
792
793 int m_ahead = m_A - i - 1;
794 int n_ahead = n_A - i - 1;
795 int m_behind = i;
796 int n_behind = i;
797
798 /*------------------------------------------------------------*/
799
800 // FLA_Househ2_UT( FLA_LEFT,
801 // alpha11,
802 // a21, tau11 );
803 FLA_Househ2_UT_l_opz( m_ahead,
804 alpha11,
805 a21, rs_A,
806 tau11 );
807
808 if ( n_ahead > 0 )
809 {
810 // FLA_Copyt( FLA_CONJ_TRANSPOSE, a12t, y21 );
811 // FLA_Gemvc( FLA_CONJ_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, a21, FLA_ONE, y21 );
812 bl1_zcopyv( BLIS1_CONJUGATE,
813 n_ahead,
814 a12t, cs_A,
815 y21, inc_y );
816 bl1_zgemv( BLIS1_CONJ_TRANSPOSE,
817 BLIS1_NO_CONJUGATE,
818 m_ahead,
819 n_ahead,
820 buff_1,
821 A22, rs_A, cs_A,
822 a21, rs_A,
823 buff_1,
824 y21, inc_y );
825
826 // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, y21 );
827 bl1_zinvscalv( BLIS1_NO_CONJUGATE,
828 n_ahead,
829 tau11,
830 y21, inc_y );
831
832 // FLA_Axpyt( FLA_CONJ_TRANSPOSE, FLA_MINUS_ONE, y21, a12t );
833 bl1_zaxpyv( BLIS1_CONJUGATE,
834 n_ahead,
835 buff_m1,
836 y21, inc_y,
837 a12t, cs_A );
838
839 // FLA_Househ2_UT( FLA_RIGHT, a12t_l, a12t_r, sigma11 );
840 FLA_Househ2_UT_r_opz( n_ahead - 1,
841 a12t_l,
842 a12t_r, cs_A,
843 sigma11 );
844
845 // FLA_Set( FLA_ONE, v21_t );
846 // FLA_Copyt( FLA_TRANSPOSE, a12t_r, v21_b );
847 *v21_t = *buff_1;
848 bl1_zcopyv( BLIS1_NO_CONJUGATE,
849 n_ahead - 1,
850 a12t_r, cs_A,
851 v21_b, inc_y );
852
853 // FLA_Dotc( FLA_CONJUGATE, y21, v21, beta );
854 // FLA_Scal( FLA_MINUS_ONE, beta );
855 bl1_zdot( BLIS1_CONJUGATE,
856 n_ahead,
857 y21, inc_y,
858 v21, inc_v,
859 &beta );
860 bl1_zneg1( &beta );
861
862 // FLA_Copy( a21, z21 );
863 // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_NO_CONJUGATE, FLA_ONE, A22, v21, beta, z21 );
864 // FLA_Inv_scalc( FLA_NO_CONJUGATE, sigma11, z21 );
865 bl1_zcopyv( BLIS1_NO_CONJUGATE,
866 m_ahead,
867 a21, rs_A,
868 z21, inc_z );
869 bl1_zgemv( BLIS1_NO_TRANSPOSE,
870 BLIS1_NO_CONJUGATE,
871 m_ahead,
872 n_ahead,
873 buff_1,
874 A22, rs_A, cs_A,
875 v21, inc_v,
876 &beta,
877 z21, inc_z );
878 bl1_zinvscalv( BLIS1_NO_CONJUGATE,
879 m_ahead,
880 sigma11,
881 z21, inc_z );
882
883 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y21, A22 );
884 // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z21, v21, A22 );
885 bl1_zger( BLIS1_NO_CONJUGATE,
886 BLIS1_CONJUGATE,
887 m_ahead,
888 n_ahead,
889 buff_m1,
890 a21, rs_A,
891 y21, inc_y,
892 A22, rs_A, cs_A );
893 bl1_zger( BLIS1_NO_CONJUGATE,
894 BLIS1_CONJUGATE,
895 m_ahead,
896 n_ahead,
897 buff_m1,
898 z21, inc_z,
899 v21, inc_v,
900 A22, rs_A, cs_A );
901
902 // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_ONE, A02, v21, FLA_ZERO, s01 );
903 bl1_zgemv( BLIS1_CONJ_NO_TRANSPOSE,
904 BLIS1_NO_CONJUGATE,
905 m_behind,
906 n_ahead,
907 buff_1,
908 A02, rs_A, cs_A,
909 v21, inc_v,
910 buff_0,
911 s01, rs_S );
912 }
913
914 // FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, t01 );
915 // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ONE, t01 );
916 bl1_zcopyv( BLIS1_CONJUGATE,
917 n_behind,
918 a10t, cs_A,
919 t01, rs_T );
920 bl1_zgemv( BLIS1_CONJ_TRANSPOSE,
921 BLIS1_NO_CONJUGATE,
922 m_ahead,
923 n_behind,
924 buff_1,
925 A20, rs_A, cs_A,
926 a21, rs_A,
927 buff_1,
928 t01, rs_T );
929
930 /*------------------------------------------------------------*/
931
932 }
933
934 // FLA_Obj_free( &v );
935 // FLA_Obj_free( &y );
936 // FLA_Obj_free( &z );
937 FLA_free( buff_v );
938 FLA_free( buff_y );
939 FLA_free( buff_z );
940
941 return FLA_SUCCESS;
942 }
943
944