1 #include <blas_extended.h>
2 #include <blas_extended_private.h>
3 #include <blas_fpu.h>
BLAS_dsymv2_d_s_x(enum blas_order_type order,enum blas_uplo_type uplo,int n,double alpha,const double * a,int lda,const float * x_head,const float * x_tail,int incx,double beta,double * y,int incy,enum blas_prec_type prec)4 void BLAS_dsymv2_d_s_x(enum blas_order_type order, enum blas_uplo_type uplo,
5 int n, double alpha, const double *a, int lda,
6 const float *x_head, const float *x_tail, int incx,
7 double beta, double *y, int incy,
8 enum blas_prec_type prec)
9
10 /*
11 * Purpose
12 * =======
13 *
14 * This routines computes the matrix product:
15 *
16 * y <- alpha * A * (x_head + x_tail) + beta * y
17 *
18 * where A is a symmetric matrix.
19 *
20 * Arguments
21 * =========
22 *
23 * order (input) enum blas_order_type
24 * Storage format of input symmetric matrix A.
25 *
26 * uplo (input) enum blas_uplo_type
27 * Determines which half of matrix A (upper or lower triangle)
28 * is accessed.
29 *
30 * n (input) int
31 * Dimension of A and size of vectors x, y.
32 *
33 * alpha (input) double
34 *
35 * a (input) double*
36 * Matrix A.
37 *
38 * lda (input) int
39 * Leading dimension of matrix A.
40 *
41 * x_head (input) float*
42 * Vector x_head
43 *
44 * x_tail (input) float*
45 * Vector x_tail
46 *
47 * incx (input) int
48 * Stride for vector x.
49 *
50 * beta (input) double
51 *
52 * y (input) float*
53 * Vector y.
54 *
55 * incy (input) int
56 * Stride for vector y.
57 *
58 * prec (input) enum blas_prec_type
59 * Specifies the internal precision to be used.
60 * = blas_prec_single: single precision.
61 * = blas_prec_double: double precision.
62 * = blas_prec_extra : anything at least 1.5 times as accurate
63 * than double, and wider than 80-bits.
64 * We use double-double in our implementation.
65 *
66 */
67 {
68 /* Routine name */
69 const char routine_name[] = "BLAS_dsymv2_d_s_x";
70 switch (prec) {
71
72 case blas_prec_single:{
73
74 int i, j;
75 int xi, yi, xi0, yi0;
76 int aij, ai;
77 int incai;
78 int incaij, incaij2;
79
80 const double *a_i = a;
81 const float *x_head_i = x_head;
82 const float *x_tail_i = x_tail;
83 double *y_i = y;
84 double alpha_i = alpha;
85 double beta_i = beta;
86 double a_elem;
87 float x_elem;
88 double y_elem;
89 double prod1;
90 double prod2;
91 double sum1;
92 double sum2;
93 double tmp1;
94 double tmp2;
95 double tmp3;
96
97
98
99 /* Test for no-op */
100 if (n <= 0) {
101 return;
102 }
103 if (alpha_i == 0.0 && beta_i == 1.0) {
104 return;
105 }
106
107 /* Check for error conditions. */
108 if (n < 0) {
109 BLAS_error(routine_name, -3, n, NULL);
110 }
111 if (lda < n) {
112 BLAS_error(routine_name, -6, n, NULL);
113 }
114 if (incx == 0) {
115 BLAS_error(routine_name, -9, incx, NULL);
116 }
117 if (incy == 0) {
118 BLAS_error(routine_name, -12, incy, NULL);
119 }
120
121 if ((order == blas_colmajor && uplo == blas_upper) ||
122 (order == blas_rowmajor && uplo == blas_lower)) {
123 incai = lda;
124 incaij = 1;
125 incaij2 = lda;
126 } else {
127 incai = 1;
128 incaij = lda;
129 incaij2 = 1;
130 }
131
132
133
134
135
136
137 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
138 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
139
140
141
142 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
143 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
144 sum1 = 0.0;
145 sum2 = 0.0;
146
147 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
148 a_elem = a_i[aij];
149 x_elem = x_head_i[xi];
150 prod1 = a_elem * x_elem;
151 sum1 = sum1 + prod1;
152 x_elem = x_tail_i[xi];
153 prod2 = a_elem * x_elem;
154 sum2 = sum2 + prod2;
155 }
156 for (; j < n; j++, aij += incaij2, xi += incx) {
157 a_elem = a_i[aij];
158 x_elem = x_head_i[xi];
159 prod1 = a_elem * x_elem;
160 sum1 = sum1 + prod1;
161 x_elem = x_tail_i[xi];
162 prod2 = a_elem * x_elem;
163 sum2 = sum2 + prod2;
164 }
165 sum1 = sum1 + sum2;
166 tmp1 = sum1 * alpha_i;
167 y_elem = y_i[yi];
168 tmp2 = y_elem * beta_i;
169 tmp3 = tmp1 + tmp2;
170 y_i[yi] = tmp3;
171 }
172
173
174
175 break;
176 }
177
178 case blas_prec_double:
179 case blas_prec_indigenous:{
180
181 int i, j;
182 int xi, yi, xi0, yi0;
183 int aij, ai;
184 int incai;
185 int incaij, incaij2;
186
187 const double *a_i = a;
188 const float *x_head_i = x_head;
189 const float *x_tail_i = x_tail;
190 double *y_i = y;
191 double alpha_i = alpha;
192 double beta_i = beta;
193 double a_elem;
194 float x_elem;
195 double y_elem;
196 double prod1;
197 double prod2;
198 double sum1;
199 double sum2;
200 double tmp1;
201 double tmp2;
202 double tmp3;
203
204
205
206 /* Test for no-op */
207 if (n <= 0) {
208 return;
209 }
210 if (alpha_i == 0.0 && beta_i == 1.0) {
211 return;
212 }
213
214 /* Check for error conditions. */
215 if (n < 0) {
216 BLAS_error(routine_name, -3, n, NULL);
217 }
218 if (lda < n) {
219 BLAS_error(routine_name, -6, n, NULL);
220 }
221 if (incx == 0) {
222 BLAS_error(routine_name, -9, incx, NULL);
223 }
224 if (incy == 0) {
225 BLAS_error(routine_name, -12, incy, NULL);
226 }
227
228 if ((order == blas_colmajor && uplo == blas_upper) ||
229 (order == blas_rowmajor && uplo == blas_lower)) {
230 incai = lda;
231 incaij = 1;
232 incaij2 = lda;
233 } else {
234 incai = 1;
235 incaij = lda;
236 incaij2 = 1;
237 }
238
239
240
241
242
243
244 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
245 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
246
247
248
249 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
250 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
251 sum1 = 0.0;
252 sum2 = 0.0;
253
254 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
255 a_elem = a_i[aij];
256 x_elem = x_head_i[xi];
257 prod1 = a_elem * x_elem;
258 sum1 = sum1 + prod1;
259 x_elem = x_tail_i[xi];
260 prod2 = a_elem * x_elem;
261 sum2 = sum2 + prod2;
262 }
263 for (; j < n; j++, aij += incaij2, xi += incx) {
264 a_elem = a_i[aij];
265 x_elem = x_head_i[xi];
266 prod1 = a_elem * x_elem;
267 sum1 = sum1 + prod1;
268 x_elem = x_tail_i[xi];
269 prod2 = a_elem * x_elem;
270 sum2 = sum2 + prod2;
271 }
272 sum1 = sum1 + sum2;
273 tmp1 = sum1 * alpha_i;
274 y_elem = y_i[yi];
275 tmp2 = y_elem * beta_i;
276 tmp3 = tmp1 + tmp2;
277 y_i[yi] = tmp3;
278 }
279
280
281
282 break;
283 }
284
285 case blas_prec_extra:{
286
287 int i, j;
288 int xi, yi, xi0, yi0;
289 int aij, ai;
290 int incai;
291 int incaij, incaij2;
292
293 const double *a_i = a;
294 const float *x_head_i = x_head;
295 const float *x_tail_i = x_tail;
296 double *y_i = y;
297 double alpha_i = alpha;
298 double beta_i = beta;
299 double a_elem;
300 float x_elem;
301 double y_elem;
302 double head_prod1, tail_prod1;
303 double head_prod2, tail_prod2;
304 double head_sum1, tail_sum1;
305 double head_sum2, tail_sum2;
306 double head_tmp1, tail_tmp1;
307 double head_tmp2, tail_tmp2;
308 double head_tmp3, tail_tmp3;
309
310 FPU_FIX_DECL;
311
312 /* Test for no-op */
313 if (n <= 0) {
314 return;
315 }
316 if (alpha_i == 0.0 && beta_i == 1.0) {
317 return;
318 }
319
320 /* Check for error conditions. */
321 if (n < 0) {
322 BLAS_error(routine_name, -3, n, NULL);
323 }
324 if (lda < n) {
325 BLAS_error(routine_name, -6, n, NULL);
326 }
327 if (incx == 0) {
328 BLAS_error(routine_name, -9, incx, NULL);
329 }
330 if (incy == 0) {
331 BLAS_error(routine_name, -12, incy, NULL);
332 }
333
334 if ((order == blas_colmajor && uplo == blas_upper) ||
335 (order == blas_rowmajor && uplo == blas_lower)) {
336 incai = lda;
337 incaij = 1;
338 incaij2 = lda;
339 } else {
340 incai = 1;
341 incaij = lda;
342 incaij2 = 1;
343 }
344
345
346
347
348
349
350 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
351 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
352
353 FPU_FIX_START;
354
355 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
356 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
357 head_sum1 = tail_sum1 = 0.0;
358 head_sum2 = tail_sum2 = 0.0;
359
360 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
361 a_elem = a_i[aij];
362 x_elem = x_head_i[xi];
363 {
364 double dt = (double) x_elem;
365 {
366 /* Compute double_double = double * double. */
367 double a1, a2, b1, b2, con;
368
369 con = a_elem * split;
370 a1 = con - a_elem;
371 a1 = con - a1;
372 a2 = a_elem - a1;
373 con = dt * split;
374 b1 = con - dt;
375 b1 = con - b1;
376 b2 = dt - b1;
377
378 head_prod1 = a_elem * dt;
379 tail_prod1 =
380 (((a1 * b1 - head_prod1) + a1 * b2) + a2 * b1) + a2 * b2;
381 }
382 }
383 {
384 /* Compute double-double = double-double + double-double. */
385 double bv;
386 double s1, s2, t1, t2;
387
388 /* Add two hi words. */
389 s1 = head_sum1 + head_prod1;
390 bv = s1 - head_sum1;
391 s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
392
393 /* Add two lo words. */
394 t1 = tail_sum1 + tail_prod1;
395 bv = t1 - tail_sum1;
396 t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
397
398 s2 += t1;
399
400 /* Renormalize (s1, s2) to (t1, s2) */
401 t1 = s1 + s2;
402 s2 = s2 - (t1 - s1);
403
404 t2 += s2;
405
406 /* Renormalize (t1, t2) */
407 head_sum1 = t1 + t2;
408 tail_sum1 = t2 - (head_sum1 - t1);
409 }
410 x_elem = x_tail_i[xi];
411 {
412 double dt = (double) x_elem;
413 {
414 /* Compute double_double = double * double. */
415 double a1, a2, b1, b2, con;
416
417 con = a_elem * split;
418 a1 = con - a_elem;
419 a1 = con - a1;
420 a2 = a_elem - a1;
421 con = dt * split;
422 b1 = con - dt;
423 b1 = con - b1;
424 b2 = dt - b1;
425
426 head_prod2 = a_elem * dt;
427 tail_prod2 =
428 (((a1 * b1 - head_prod2) + a1 * b2) + a2 * b1) + a2 * b2;
429 }
430 }
431 {
432 /* Compute double-double = double-double + double-double. */
433 double bv;
434 double s1, s2, t1, t2;
435
436 /* Add two hi words. */
437 s1 = head_sum2 + head_prod2;
438 bv = s1 - head_sum2;
439 s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
440
441 /* Add two lo words. */
442 t1 = tail_sum2 + tail_prod2;
443 bv = t1 - tail_sum2;
444 t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
445
446 s2 += t1;
447
448 /* Renormalize (s1, s2) to (t1, s2) */
449 t1 = s1 + s2;
450 s2 = s2 - (t1 - s1);
451
452 t2 += s2;
453
454 /* Renormalize (t1, t2) */
455 head_sum2 = t1 + t2;
456 tail_sum2 = t2 - (head_sum2 - t1);
457 }
458 }
459 for (; j < n; j++, aij += incaij2, xi += incx) {
460 a_elem = a_i[aij];
461 x_elem = x_head_i[xi];
462 {
463 double dt = (double) x_elem;
464 {
465 /* Compute double_double = double * double. */
466 double a1, a2, b1, b2, con;
467
468 con = a_elem * split;
469 a1 = con - a_elem;
470 a1 = con - a1;
471 a2 = a_elem - a1;
472 con = dt * split;
473 b1 = con - dt;
474 b1 = con - b1;
475 b2 = dt - b1;
476
477 head_prod1 = a_elem * dt;
478 tail_prod1 =
479 (((a1 * b1 - head_prod1) + a1 * b2) + a2 * b1) + a2 * b2;
480 }
481 }
482 {
483 /* Compute double-double = double-double + double-double. */
484 double bv;
485 double s1, s2, t1, t2;
486
487 /* Add two hi words. */
488 s1 = head_sum1 + head_prod1;
489 bv = s1 - head_sum1;
490 s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
491
492 /* Add two lo words. */
493 t1 = tail_sum1 + tail_prod1;
494 bv = t1 - tail_sum1;
495 t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
496
497 s2 += t1;
498
499 /* Renormalize (s1, s2) to (t1, s2) */
500 t1 = s1 + s2;
501 s2 = s2 - (t1 - s1);
502
503 t2 += s2;
504
505 /* Renormalize (t1, t2) */
506 head_sum1 = t1 + t2;
507 tail_sum1 = t2 - (head_sum1 - t1);
508 }
509 x_elem = x_tail_i[xi];
510 {
511 double dt = (double) x_elem;
512 {
513 /* Compute double_double = double * double. */
514 double a1, a2, b1, b2, con;
515
516 con = a_elem * split;
517 a1 = con - a_elem;
518 a1 = con - a1;
519 a2 = a_elem - a1;
520 con = dt * split;
521 b1 = con - dt;
522 b1 = con - b1;
523 b2 = dt - b1;
524
525 head_prod2 = a_elem * dt;
526 tail_prod2 =
527 (((a1 * b1 - head_prod2) + a1 * b2) + a2 * b1) + a2 * b2;
528 }
529 }
530 {
531 /* Compute double-double = double-double + double-double. */
532 double bv;
533 double s1, s2, t1, t2;
534
535 /* Add two hi words. */
536 s1 = head_sum2 + head_prod2;
537 bv = s1 - head_sum2;
538 s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
539
540 /* Add two lo words. */
541 t1 = tail_sum2 + tail_prod2;
542 bv = t1 - tail_sum2;
543 t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
544
545 s2 += t1;
546
547 /* Renormalize (s1, s2) to (t1, s2) */
548 t1 = s1 + s2;
549 s2 = s2 - (t1 - s1);
550
551 t2 += s2;
552
553 /* Renormalize (t1, t2) */
554 head_sum2 = t1 + t2;
555 tail_sum2 = t2 - (head_sum2 - t1);
556 }
557 }
558 {
559 /* Compute double-double = double-double + double-double. */
560 double bv;
561 double s1, s2, t1, t2;
562
563 /* Add two hi words. */
564 s1 = head_sum1 + head_sum2;
565 bv = s1 - head_sum1;
566 s2 = ((head_sum2 - bv) + (head_sum1 - (s1 - bv)));
567
568 /* Add two lo words. */
569 t1 = tail_sum1 + tail_sum2;
570 bv = t1 - tail_sum1;
571 t2 = ((tail_sum2 - bv) + (tail_sum1 - (t1 - bv)));
572
573 s2 += t1;
574
575 /* Renormalize (s1, s2) to (t1, s2) */
576 t1 = s1 + s2;
577 s2 = s2 - (t1 - s1);
578
579 t2 += s2;
580
581 /* Renormalize (t1, t2) */
582 head_sum1 = t1 + t2;
583 tail_sum1 = t2 - (head_sum1 - t1);
584 }
585 {
586 /* Compute double-double = double-double * double. */
587 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
588
589 con = head_sum1 * split;
590 a11 = con - head_sum1;
591 a11 = con - a11;
592 a21 = head_sum1 - a11;
593 con = alpha_i * split;
594 b1 = con - alpha_i;
595 b1 = con - b1;
596 b2 = alpha_i - b1;
597
598 c11 = head_sum1 * alpha_i;
599 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
600
601 c2 = tail_sum1 * alpha_i;
602 t1 = c11 + c2;
603 t2 = (c2 - (t1 - c11)) + c21;
604
605 head_tmp1 = t1 + t2;
606 tail_tmp1 = t2 - (head_tmp1 - t1);
607 }
608 y_elem = y_i[yi];
609 {
610 /* Compute double_double = double * double. */
611 double a1, a2, b1, b2, con;
612
613 con = y_elem * split;
614 a1 = con - y_elem;
615 a1 = con - a1;
616 a2 = y_elem - a1;
617 con = beta_i * split;
618 b1 = con - beta_i;
619 b1 = con - b1;
620 b2 = beta_i - b1;
621
622 head_tmp2 = y_elem * beta_i;
623 tail_tmp2 = (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
624 }
625 {
626 /* Compute double-double = double-double + double-double. */
627 double bv;
628 double s1, s2, t1, t2;
629
630 /* Add two hi words. */
631 s1 = head_tmp1 + head_tmp2;
632 bv = s1 - head_tmp1;
633 s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
634
635 /* Add two lo words. */
636 t1 = tail_tmp1 + tail_tmp2;
637 bv = t1 - tail_tmp1;
638 t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
639
640 s2 += t1;
641
642 /* Renormalize (s1, s2) to (t1, s2) */
643 t1 = s1 + s2;
644 s2 = s2 - (t1 - s1);
645
646 t2 += s2;
647
648 /* Renormalize (t1, t2) */
649 head_tmp3 = t1 + t2;
650 tail_tmp3 = t2 - (head_tmp3 - t1);
651 }
652 y_i[yi] = head_tmp3;
653 }
654
655 FPU_FIX_STOP;
656
657 break;
658 }
659 }
660 } /* end BLAS_dsymv2_d_s_x */
661