1 #include <math.h>
2 #include "blas_extended.h"
3 #include "blas_extended_private.h"
BLAS_ztbsv_c(enum blas_order_type order,enum blas_uplo_type uplo,enum blas_trans_type trans,enum blas_diag_type diag,int n,int k,const void * alpha,const void * t,int ldt,void * x,int incx)4 void BLAS_ztbsv_c(enum blas_order_type order, enum blas_uplo_type uplo,
5 enum blas_trans_type trans, enum blas_diag_type diag,
6 int n, int k, const void *alpha, const void *t, int ldt,
7 void *x, int incx)
8
9 /*
10 * Purpose
11 * =======
12 *
13 * This routine solves :
14 *
15 * x <- alpha * inverse(t) * x
16 *
17 * Arguments
18 * =========
19 *
20 * order (input) enum blas_order_type
21 * column major, row major (blas_rowmajor, blas_colmajor)
22 *
23 * uplo (input) enum blas_uplo_type
24 * upper, lower (blas_upper, blas_lower)
25 *
26 * trans (input) enum blas_trans_type
27 * no trans, trans, conj trans
28 *
29 * diag (input) enum blas_diag_type
30 * unit, non unit (blas_unit_diag, blas_non_unit_diag)
31 *
32 * n (input) int
33 * the dimension of t
34 *
35 * k (input) int
36 * the number of subdiagonals/superdiagonals of t
37 *
38 * alpha (input) const void*
39 *
40 * t (input) void*
41 * Triangular Banded matrix
42 *
43 * x (input) const void*
44 * Array of length n.
45 *
46 * incx (input) int
47 * The stride used to access components x[i].
48 *
49 */
50 {
51 /* Routine name */
52 static const char routine_name[] = "BLAS_ztbsv_c";
53
54 int i, j; /* used to keep track of loop counts */
55 int xi; /* used to index vector x */
56 int start_xi; /* used as the starting idx to vector x */
57 int incxi;
58 int Tij; /* index inside of Banded structure */
59 int dot_start, dot_start_inc1, dot_start_inc2, dot_inc;
60
61 const float *t_i = (float *) t; /* internal matrix t */
62 double *x_i = (double *) x; /* internal x */
63 double *alpha_i = (double *) alpha; /* internal alpha */
64
65 if (order != blas_rowmajor && order != blas_colmajor) {
66 BLAS_error(routine_name, -1, order, 0);
67 }
68 if (uplo != blas_upper && uplo != blas_lower) {
69 BLAS_error(routine_name, -2, uplo, 0);
70 }
71 if ((trans != blas_trans) && (trans != blas_no_trans) &&
72 (trans != blas_conj) && (trans != blas_conj_trans)) {
73 BLAS_error(routine_name, -2, uplo, 0);
74 }
75 if (diag != blas_non_unit_diag && diag != blas_unit_diag) {
76 BLAS_error(routine_name, -4, diag, 0);
77 }
78 if (n < 0) {
79 BLAS_error(routine_name, -5, n, 0);
80 }
81 if (k >= n) {
82 BLAS_error(routine_name, -6, k, 0);
83 }
84 if ((ldt < 1) || (ldt <= k)) {
85 BLAS_error(routine_name, -9, ldt, 0);
86 }
87 if (incx == 0) {
88 BLAS_error(routine_name, -11, incx, 0);
89 }
90
91 if (n <= 0)
92 return;
93
94 incxi = incx;
95 incxi *= 2;
96
97 /* configuring the vector starting idx */
98 if (incxi < 0) {
99 start_xi = (1 - n) * incxi;
100 } else {
101 start_xi = 0;
102 }
103
104 /* if alpha is zero, then return x as a zero vector */
105 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
106 xi = start_xi;
107 for (i = 0; i < n; i++) {
108 x_i[xi] = 0.0;
109 x_i[xi + 1] = 0.0;
110 xi += incxi;
111 }
112 return;
113 }
114 /* check to see if k=0. if so, we can optimize somewhat */
115 if (k == 0) {
116 if (((alpha_i[0] == 1.0 && alpha_i[1] == 0.0))
117 && (diag == blas_unit_diag)) {
118 /* nothing to do */
119 return;
120 } else {
121 /* just run the loops as is. */
122
123 }
124 }
125
126 /* get index variables prepared */
127 if (((trans == blas_trans) || (trans == blas_conj_trans)) ^
128 (order == blas_rowmajor)) {
129 dot_start = k;
130 } else {
131 dot_start = 0;
132 }
133
134 if (((trans == blas_trans) || (trans == blas_conj_trans)) ^
135 (order == blas_rowmajor)) {
136 dot_inc = 1;
137 dot_start_inc1 = ldt - 1;
138 dot_start_inc2 = ldt;
139 } else {
140 dot_inc = ldt - 1;
141 dot_start_inc1 = 1;
142 dot_start_inc2 = ldt;
143 }
144
145 if (((trans == blas_trans) || (trans == blas_conj_trans)) ^
146 (uplo == blas_lower)) {
147 /*start at the first element of x */
148 /* substitution will proceed forwards (forwardsubstitution) */
149 } else {
150 /*start at the last element of x */
151 /* substitution will proceed backwards (backsubstitution) */
152 dot_inc = -dot_inc;
153 dot_start_inc1 = -dot_start_inc1;
154 dot_start_inc2 = -dot_start_inc2;
155 dot_start = ldt * (n - 1) + k - dot_start;
156 /*order of the following 2 statements matters! */
157 start_xi = start_xi + (n - 1) * incxi;
158 incxi = -incxi;
159 }
160
161 dot_inc *= 2;
162 dot_start *= 2;
163 dot_start_inc1 *= 2;
164 dot_start_inc2 *= 2;
165
166
167 {
168
169 {
170 double temp1[2]; /* temporary variable for calculations */
171 double temp2[2]; /* temporary variable for calculations */
172 double x_elem[2];
173 float T_element[2];
174
175
176
177
178 if ((trans == blas_conj) || (trans == blas_conj_trans)) {
179 /* conjugated */
180
181
182 /*loop 1 */
183 xi = start_xi;
184 for (j = 0; j < k; j++) {
185
186 /* each time through loop, xi lands on next x to compute. */
187 x_elem[0] = x_i[xi];
188 x_elem[1] = x_i[xi + 1];
189 /* preform the multiplication -
190 in this implementation we do not seperate the alpha = 1 case */
191 {
192 temp1[0] =
193 (double) x_elem[0] * alpha_i[0] -
194 (double) x_elem[1] * alpha_i[1];
195 temp1[1] =
196 (double) x_elem[0] * alpha_i[1] +
197 (double) x_elem[1] * alpha_i[0];
198 }
199
200 xi = start_xi;
201
202 Tij = dot_start;
203 dot_start += dot_start_inc1;
204
205 for (i = j; i > 0; i--) {
206 T_element[0] = t_i[Tij];
207 T_element[1] = t_i[Tij + 1];
208 T_element[1] = -T_element[1];
209 x_elem[0] = x_i[xi];
210 x_elem[1] = x_i[xi + 1];
211 {
212 temp2[0] =
213 (double) x_elem[0] * T_element[0] -
214 (double) x_elem[1] * T_element[1];
215 temp2[1] =
216 (double) x_elem[0] * T_element[1] +
217 (double) x_elem[1] * T_element[0];
218 }
219 temp1[0] = temp1[0] - temp2[0];
220 temp1[1] = temp1[1] - temp2[1];
221 xi += incxi;
222 Tij += dot_inc;
223 } /* for across row */
224
225
226 /* if the diagonal entry is not equal to one, then divide Xj by
227 the entry */
228 if (diag == blas_non_unit_diag) {
229 T_element[0] = t_i[Tij];
230 T_element[1] = t_i[Tij + 1];
231 T_element[1] = -T_element[1];
232
233 {
234 double S = 1.0, eps, ov, un, eps1, ov1, un1;
235 double abs_a, abs_b, abs_c, abs_d, ab, cd;
236 double r;
237 double t;
238 double q[2];
239
240 eps = pow(2.0, -24.0);
241 un = pow(2.0, -126.0);
242 ov = pow(2.0, 128.0) * (1 - eps);
243 eps1 = pow(2.0, -53.0);
244 un1 = pow(2.0, -1022.0);
245 ov1 = 1.79769313486231571e+308;
246 /* = (pow(2.0, 1023.0) * (1 - eps1)) * 2.0; */
247 abs_a = fabs(temp1[0]);
248 abs_b = fabs(temp1[1]);
249 abs_c = fabs((double) T_element[0]);
250 abs_d = fabs((double) T_element[1]);
251 ab = MAX(abs_a, abs_b);
252 cd = MAX(abs_c, abs_d);
253
254 /* Scaling */
255 if (ab > ov1 / 16) { /* scale down a, b */
256 temp1[0] /= 16;
257 temp1[1] /= 16;
258 S = S * 16;
259 }
260 if (cd > ov / 16) { /* scale down c, d */
261 T_element[0] /= 16;
262 T_element[1] /= 16;
263 S = S / 16;
264 }
265 if (ab < un1 / eps1 * 2) { /* scale up a, b */
266 t = 2.0 / (eps1 * eps1);
267 temp1[0] *= t;
268 temp1[1] *= t;
269 S = S / t;
270 }
271 if (cd < un / eps * 2) { /* scale up c, d */
272 t = 2.0 / (eps * eps);
273 T_element[0] *= t;
274 T_element[1] *= t;
275 S = S * t;
276 }
277
278 /* Now un/eps*2 <= (a, b, c, d) >= ov/16 */
279 if (abs_c > abs_d) {
280 r = T_element[1] / T_element[0];
281 t = 1 / (T_element[0] + T_element[1] * r);
282 q[0] = (temp1[0] + temp1[1] * r) * t;
283 q[1] = (temp1[1] - temp1[0] * r) * t;
284 } else {
285 r = T_element[0] / T_element[1];
286 t = 1 / (T_element[1] + T_element[0] * r);
287 q[0] = (temp1[1] + temp1[0] * r) * t;
288 q[1] = (-temp1[0] + temp1[1] * r) * t;
289 }
290 /* Scale back */
291 temp1[0] = q[0] * S;
292 temp1[1] = q[1] * S;
293 }
294
295 }
296 /* if (diag == blas_non_unit_diag) */
297 x_i[xi] = temp1[0];
298 x_i[xi + 1] = temp1[1];
299 xi += incxi;
300 } /* for j<k */
301 /*end loop 1 */
302
303 /*loop 2 continue without changing j to start */
304 for (; j < n; j++) {
305
306 /* each time through loop, xi lands on next x to compute. */
307 x_elem[0] = x_i[xi];
308 x_elem[1] = x_i[xi + 1];
309 {
310 temp1[0] =
311 (double) x_elem[0] * alpha_i[0] -
312 (double) x_elem[1] * alpha_i[1];
313 temp1[1] =
314 (double) x_elem[0] * alpha_i[1] +
315 (double) x_elem[1] * alpha_i[0];
316 }
317
318 xi = start_xi;
319 start_xi += incxi;
320
321 Tij = dot_start;
322 dot_start += dot_start_inc2;
323
324 for (i = k; i > 0; i--) {
325 T_element[0] = t_i[Tij];
326 T_element[1] = t_i[Tij + 1];
327 T_element[1] = -T_element[1];
328 x_elem[0] = x_i[xi];
329 x_elem[1] = x_i[xi + 1];
330 {
331 temp2[0] =
332 (double) x_elem[0] * T_element[0] -
333 (double) x_elem[1] * T_element[1];
334 temp2[1] =
335 (double) x_elem[0] * T_element[1] +
336 (double) x_elem[1] * T_element[0];
337 }
338 temp1[0] = temp1[0] - temp2[0];
339 temp1[1] = temp1[1] - temp2[1];
340 xi += incxi;
341 Tij += dot_inc;
342 } /* for across row */
343
344
345 /* if the diagonal entry is not equal to one, then divide by
346 the entry */
347 if (diag == blas_non_unit_diag) {
348 T_element[0] = t_i[Tij];
349 T_element[1] = t_i[Tij + 1];
350 T_element[1] = -T_element[1];
351
352 {
353 double S = 1.0, eps, ov, un, eps1, ov1, un1;
354 double abs_a, abs_b, abs_c, abs_d, ab, cd;
355 double r;
356 double t;
357 double q[2];
358
359 eps = pow(2.0, -24.0);
360 un = pow(2.0, -126.0);
361 ov = pow(2.0, 128.0) * (1 - eps);
362 eps1 = pow(2.0, -53.0);
363 un1 = pow(2.0, -1022.0);
364 ov1 = 1.79769313486231571e+308;
365 /* = (pow(2.0, 1023.0) * (1 - eps1)) * 2.0; */
366 abs_a = fabs(temp1[0]);
367 abs_b = fabs(temp1[1]);
368 abs_c = fabs((double) T_element[0]);
369 abs_d = fabs((double) T_element[1]);
370 ab = MAX(abs_a, abs_b);
371 cd = MAX(abs_c, abs_d);
372
373 /* Scaling */
374 if (ab > ov1 / 16) { /* scale down a, b */
375 temp1[0] /= 16;
376 temp1[1] /= 16;
377 S = S * 16;
378 }
379 if (cd > ov / 16) { /* scale down c, d */
380 T_element[0] /= 16;
381 T_element[1] /= 16;
382 S = S / 16;
383 }
384 if (ab < un1 / eps1 * 2) { /* scale up a, b */
385 t = 2.0 / (eps1 * eps1);
386 temp1[0] *= t;
387 temp1[1] *= t;
388 S = S / t;
389 }
390 if (cd < un / eps * 2) { /* scale up c, d */
391 t = 2.0 / (eps * eps);
392 T_element[0] *= t;
393 T_element[1] *= t;
394 S = S * t;
395 }
396
397 /* Now un/eps*2 <= (a, b, c, d) >= ov/16 */
398 if (abs_c > abs_d) {
399 r = T_element[1] / T_element[0];
400 t = 1 / (T_element[0] + T_element[1] * r);
401 q[0] = (temp1[0] + temp1[1] * r) * t;
402 q[1] = (temp1[1] - temp1[0] * r) * t;
403 } else {
404 r = T_element[0] / T_element[1];
405 t = 1 / (T_element[1] + T_element[0] * r);
406 q[0] = (temp1[1] + temp1[0] * r) * t;
407 q[1] = (-temp1[0] + temp1[1] * r) * t;
408 }
409 /* Scale back */
410 temp1[0] = q[0] * S;
411 temp1[1] = q[1] * S;
412 }
413
414 }
415 /* if (diag == blas_non_unit_diag) */
416 x_i[xi] = temp1[0];
417 x_i[xi + 1] = temp1[1];
418 xi += incxi;
419 } /* for j<n */
420
421 } else {
422 /* not conjugated */
423
424
425 /*loop 1 */
426 xi = start_xi;
427 for (j = 0; j < k; j++) {
428
429 /* each time through loop, xi lands on next x to compute. */
430 x_elem[0] = x_i[xi];
431 x_elem[1] = x_i[xi + 1];
432 /* preform the multiplication -
433 in this implementation we do not seperate the alpha = 1 case */
434 {
435 temp1[0] =
436 (double) x_elem[0] * alpha_i[0] -
437 (double) x_elem[1] * alpha_i[1];
438 temp1[1] =
439 (double) x_elem[0] * alpha_i[1] +
440 (double) x_elem[1] * alpha_i[0];
441 }
442
443 xi = start_xi;
444
445 Tij = dot_start;
446 dot_start += dot_start_inc1;
447
448 for (i = j; i > 0; i--) {
449 T_element[0] = t_i[Tij];
450 T_element[1] = t_i[Tij + 1];
451
452 x_elem[0] = x_i[xi];
453 x_elem[1] = x_i[xi + 1];
454 {
455 temp2[0] =
456 (double) x_elem[0] * T_element[0] -
457 (double) x_elem[1] * T_element[1];
458 temp2[1] =
459 (double) x_elem[0] * T_element[1] +
460 (double) x_elem[1] * T_element[0];
461 }
462 temp1[0] = temp1[0] - temp2[0];
463 temp1[1] = temp1[1] - temp2[1];
464 xi += incxi;
465 Tij += dot_inc;
466 } /* for across row */
467
468
469 /* if the diagonal entry is not equal to one, then divide Xj by
470 the entry */
471 if (diag == blas_non_unit_diag) {
472 T_element[0] = t_i[Tij];
473 T_element[1] = t_i[Tij + 1];
474
475
476 {
477 double S = 1.0, eps, ov, un, eps1, ov1, un1;
478 double abs_a, abs_b, abs_c, abs_d, ab, cd;
479 double r;
480 double t;
481 double q[2];
482
483 eps = pow(2.0, -24.0);
484 un = pow(2.0, -126.0);
485 ov = pow(2.0, 128.0) * (1 - eps);
486 eps1 = pow(2.0, -53.0);
487 un1 = pow(2.0, -1022.0);
488 ov1 = 1.79769313486231571e+308;
489 /* = (pow(2.0, 1023.0) * (1 - eps1)) * 2.0; */
490 abs_a = fabs(temp1[0]);
491 abs_b = fabs(temp1[1]);
492 abs_c = fabs((double) T_element[0]);
493 abs_d = fabs((double) T_element[1]);
494 ab = MAX(abs_a, abs_b);
495 cd = MAX(abs_c, abs_d);
496
497 /* Scaling */
498 if (ab > ov1 / 16) { /* scale down a, b */
499 temp1[0] /= 16;
500 temp1[1] /= 16;
501 S = S * 16;
502 }
503 if (cd > ov / 16) { /* scale down c, d */
504 T_element[0] /= 16;
505 T_element[1] /= 16;
506 S = S / 16;
507 }
508 if (ab < un1 / eps1 * 2) { /* scale up a, b */
509 t = 2.0 / (eps1 * eps1);
510 temp1[0] *= t;
511 temp1[1] *= t;
512 S = S / t;
513 }
514 if (cd < un / eps * 2) { /* scale up c, d */
515 t = 2.0 / (eps * eps);
516 T_element[0] *= t;
517 T_element[1] *= t;
518 S = S * t;
519 }
520
521 /* Now un/eps*2 <= (a, b, c, d) >= ov/16 */
522 if (abs_c > abs_d) {
523 r = T_element[1] / T_element[0];
524 t = 1 / (T_element[0] + T_element[1] * r);
525 q[0] = (temp1[0] + temp1[1] * r) * t;
526 q[1] = (temp1[1] - temp1[0] * r) * t;
527 } else {
528 r = T_element[0] / T_element[1];
529 t = 1 / (T_element[1] + T_element[0] * r);
530 q[0] = (temp1[1] + temp1[0] * r) * t;
531 q[1] = (-temp1[0] + temp1[1] * r) * t;
532 }
533 /* Scale back */
534 temp1[0] = q[0] * S;
535 temp1[1] = q[1] * S;
536 }
537
538 }
539 /* if (diag == blas_non_unit_diag) */
540 x_i[xi] = temp1[0];
541 x_i[xi + 1] = temp1[1];
542 xi += incxi;
543 } /* for j<k */
544 /*end loop 1 */
545
546 /*loop 2 continue without changing j to start */
547 for (; j < n; j++) {
548
549 /* each time through loop, xi lands on next x to compute. */
550 x_elem[0] = x_i[xi];
551 x_elem[1] = x_i[xi + 1];
552 {
553 temp1[0] =
554 (double) x_elem[0] * alpha_i[0] -
555 (double) x_elem[1] * alpha_i[1];
556 temp1[1] =
557 (double) x_elem[0] * alpha_i[1] +
558 (double) x_elem[1] * alpha_i[0];
559 }
560
561 xi = start_xi;
562 start_xi += incxi;
563
564 Tij = dot_start;
565 dot_start += dot_start_inc2;
566
567 for (i = k; i > 0; i--) {
568 T_element[0] = t_i[Tij];
569 T_element[1] = t_i[Tij + 1];
570
571 x_elem[0] = x_i[xi];
572 x_elem[1] = x_i[xi + 1];
573 {
574 temp2[0] =
575 (double) x_elem[0] * T_element[0] -
576 (double) x_elem[1] * T_element[1];
577 temp2[1] =
578 (double) x_elem[0] * T_element[1] +
579 (double) x_elem[1] * T_element[0];
580 }
581 temp1[0] = temp1[0] - temp2[0];
582 temp1[1] = temp1[1] - temp2[1];
583 xi += incxi;
584 Tij += dot_inc;
585 } /* for across row */
586
587
588 /* if the diagonal entry is not equal to one, then divide by
589 the entry */
590 if (diag == blas_non_unit_diag) {
591 T_element[0] = t_i[Tij];
592 T_element[1] = t_i[Tij + 1];
593
594
595 {
596 double S = 1.0, eps, ov, un, eps1, ov1, un1;
597 double abs_a, abs_b, abs_c, abs_d, ab, cd;
598 double r;
599 double t;
600 double q[2];
601
602 eps = pow(2.0, -24.0);
603 un = pow(2.0, -126.0);
604 ov = pow(2.0, 128.0) * (1 - eps);
605 eps1 = pow(2.0, -53.0);
606 un1 = pow(2.0, -1022.0);
607 ov1 = 1.79769313486231571e+308;
608 /* = (pow(2.0, 1023.0) * (1 - eps1)) * 2.0; */
609 abs_a = fabs(temp1[0]);
610 abs_b = fabs(temp1[1]);
611 abs_c = fabs((double) T_element[0]);
612 abs_d = fabs((double) T_element[1]);
613 ab = MAX(abs_a, abs_b);
614 cd = MAX(abs_c, abs_d);
615
616 /* Scaling */
617 if (ab > ov1 / 16) { /* scale down a, b */
618 temp1[0] /= 16;
619 temp1[1] /= 16;
620 S = S * 16;
621 }
622 if (cd > ov / 16) { /* scale down c, d */
623 T_element[0] /= 16;
624 T_element[1] /= 16;
625 S = S / 16;
626 }
627 if (ab < un1 / eps1 * 2) { /* scale up a, b */
628 t = 2.0 / (eps1 * eps1);
629 temp1[0] *= t;
630 temp1[1] *= t;
631 S = S / t;
632 }
633 if (cd < un / eps * 2) { /* scale up c, d */
634 t = 2.0 / (eps * eps);
635 T_element[0] *= t;
636 T_element[1] *= t;
637 S = S * t;
638 }
639
640 /* Now un/eps*2 <= (a, b, c, d) >= ov/16 */
641 if (abs_c > abs_d) {
642 r = T_element[1] / T_element[0];
643 t = 1 / (T_element[0] + T_element[1] * r);
644 q[0] = (temp1[0] + temp1[1] * r) * t;
645 q[1] = (temp1[1] - temp1[0] * r) * t;
646 } else {
647 r = T_element[0] / T_element[1];
648 t = 1 / (T_element[1] + T_element[0] * r);
649 q[0] = (temp1[1] + temp1[0] * r) * t;
650 q[1] = (-temp1[0] + temp1[1] * r) * t;
651 }
652 /* Scale back */
653 temp1[0] = q[0] * S;
654 temp1[1] = q[1] * S;
655 }
656
657 }
658 /* if (diag == blas_non_unit_diag) */
659 x_i[xi] = temp1[0];
660 x_i[xi + 1] = temp1[1];
661 xi += incxi;
662 } /* for j<n */
663
664 }
665
666 }
667 }
668 } /* end BLAS_ztbsv_c */
669