1 #include <blas_extended.h>
2 #include <blas_extended_private.h>
3 #include <blas_fpu.h>
BLAS_dsymv2_s_s_x(enum blas_order_type order,enum blas_uplo_type uplo,int n,double alpha,const float * a,int lda,const float * x_head,const float * x_tail,int incx,double beta,double * y,int incy,enum blas_prec_type prec)4 void BLAS_dsymv2_s_s_x(enum blas_order_type order, enum blas_uplo_type uplo,
5 int n, double alpha, const float *a, int lda,
6 const float *x_head, const float *x_tail, int incx,
7 double beta, double *y, int incy,
8 enum blas_prec_type prec)
9
10 /*
11 * Purpose
12 * =======
13 *
14 * This routines computes the matrix product:
15 *
16 * y <- alpha * A * (x_head + x_tail) + beta * y
17 *
18 * where A is a symmetric matrix.
19 *
20 * Arguments
21 * =========
22 *
23 * order (input) enum blas_order_type
24 * Storage format of input symmetric matrix A.
25 *
26 * uplo (input) enum blas_uplo_type
27 * Determines which half of matrix A (upper or lower triangle)
28 * is accessed.
29 *
30 * n (input) int
31 * Dimension of A and size of vectors x, y.
32 *
33 * alpha (input) double
34 *
35 * a (input) float*
36 * Matrix A.
37 *
38 * lda (input) int
39 * Leading dimension of matrix A.
40 *
41 * x_head (input) float*
42 * Vector x_head
43 *
44 * x_tail (input) float*
45 * Vector x_tail
46 *
47 * incx (input) int
48 * Stride for vector x.
49 *
50 * beta (input) double
51 *
52 * y (input) float*
53 * Vector y.
54 *
55 * incy (input) int
56 * Stride for vector y.
57 *
58 * prec (input) enum blas_prec_type
59 * Specifies the internal precision to be used.
60 * = blas_prec_single: single precision.
61 * = blas_prec_double: double precision.
62 * = blas_prec_extra : anything at least 1.5 times as accurate
63 * than double, and wider than 80-bits.
64 * We use double-double in our implementation.
65 *
66 */
67 {
68 /* Routine name */
69 const char routine_name[] = "BLAS_dsymv2_s_s_x";
70 switch (prec) {
71
72 case blas_prec_single:{
73
74 int i, j;
75 int xi, yi, xi0, yi0;
76 int aij, ai;
77 int incai;
78 int incaij, incaij2;
79
80 const float *a_i = a;
81 const float *x_head_i = x_head;
82 const float *x_tail_i = x_tail;
83 double *y_i = y;
84 double alpha_i = alpha;
85 double beta_i = beta;
86 float a_elem;
87 float x_elem;
88 double y_elem;
89 double prod1;
90 double prod2;
91 double sum1;
92 double sum2;
93 double tmp1;
94 double tmp2;
95 double tmp3;
96
97
98
99 /* Test for no-op */
100 if (n <= 0) {
101 return;
102 }
103 if (alpha_i == 0.0 && beta_i == 1.0) {
104 return;
105 }
106
107 /* Check for error conditions. */
108 if (n < 0) {
109 BLAS_error(routine_name, -3, n, NULL);
110 }
111 if (lda < n) {
112 BLAS_error(routine_name, -6, n, NULL);
113 }
114 if (incx == 0) {
115 BLAS_error(routine_name, -9, incx, NULL);
116 }
117 if (incy == 0) {
118 BLAS_error(routine_name, -12, incy, NULL);
119 }
120
121 if ((order == blas_colmajor && uplo == blas_upper) ||
122 (order == blas_rowmajor && uplo == blas_lower)) {
123 incai = lda;
124 incaij = 1;
125 incaij2 = lda;
126 } else {
127 incai = 1;
128 incaij = lda;
129 incaij2 = 1;
130 }
131
132
133
134
135
136
137 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
138 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
139
140
141
142 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
143 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
144 sum1 = 0.0;
145 sum2 = 0.0;
146
147 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
148 a_elem = a_i[aij];
149 x_elem = x_head_i[xi];
150 prod1 = (double) a_elem *x_elem;
151 sum1 = sum1 + prod1;
152 x_elem = x_tail_i[xi];
153 prod2 = (double) a_elem *x_elem;
154 sum2 = sum2 + prod2;
155 }
156 for (; j < n; j++, aij += incaij2, xi += incx) {
157 a_elem = a_i[aij];
158 x_elem = x_head_i[xi];
159 prod1 = (double) a_elem *x_elem;
160 sum1 = sum1 + prod1;
161 x_elem = x_tail_i[xi];
162 prod2 = (double) a_elem *x_elem;
163 sum2 = sum2 + prod2;
164 }
165 sum1 = sum1 + sum2;
166 tmp1 = sum1 * alpha_i;
167 y_elem = y_i[yi];
168 tmp2 = y_elem * beta_i;
169 tmp3 = tmp1 + tmp2;
170 y_i[yi] = tmp3;
171 }
172
173
174
175 break;
176 }
177
178 case blas_prec_double:
179 case blas_prec_indigenous:{
180
181 int i, j;
182 int xi, yi, xi0, yi0;
183 int aij, ai;
184 int incai;
185 int incaij, incaij2;
186
187 const float *a_i = a;
188 const float *x_head_i = x_head;
189 const float *x_tail_i = x_tail;
190 double *y_i = y;
191 double alpha_i = alpha;
192 double beta_i = beta;
193 float a_elem;
194 float x_elem;
195 double y_elem;
196 double prod1;
197 double prod2;
198 double sum1;
199 double sum2;
200 double tmp1;
201 double tmp2;
202 double tmp3;
203
204
205
206 /* Test for no-op */
207 if (n <= 0) {
208 return;
209 }
210 if (alpha_i == 0.0 && beta_i == 1.0) {
211 return;
212 }
213
214 /* Check for error conditions. */
215 if (n < 0) {
216 BLAS_error(routine_name, -3, n, NULL);
217 }
218 if (lda < n) {
219 BLAS_error(routine_name, -6, n, NULL);
220 }
221 if (incx == 0) {
222 BLAS_error(routine_name, -9, incx, NULL);
223 }
224 if (incy == 0) {
225 BLAS_error(routine_name, -12, incy, NULL);
226 }
227
228 if ((order == blas_colmajor && uplo == blas_upper) ||
229 (order == blas_rowmajor && uplo == blas_lower)) {
230 incai = lda;
231 incaij = 1;
232 incaij2 = lda;
233 } else {
234 incai = 1;
235 incaij = lda;
236 incaij2 = 1;
237 }
238
239
240
241
242
243
244 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
245 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
246
247
248
249 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
250 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
251 sum1 = 0.0;
252 sum2 = 0.0;
253
254 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
255 a_elem = a_i[aij];
256 x_elem = x_head_i[xi];
257 prod1 = (double) a_elem *x_elem;
258 sum1 = sum1 + prod1;
259 x_elem = x_tail_i[xi];
260 prod2 = (double) a_elem *x_elem;
261 sum2 = sum2 + prod2;
262 }
263 for (; j < n; j++, aij += incaij2, xi += incx) {
264 a_elem = a_i[aij];
265 x_elem = x_head_i[xi];
266 prod1 = (double) a_elem *x_elem;
267 sum1 = sum1 + prod1;
268 x_elem = x_tail_i[xi];
269 prod2 = (double) a_elem *x_elem;
270 sum2 = sum2 + prod2;
271 }
272 sum1 = sum1 + sum2;
273 tmp1 = sum1 * alpha_i;
274 y_elem = y_i[yi];
275 tmp2 = y_elem * beta_i;
276 tmp3 = tmp1 + tmp2;
277 y_i[yi] = tmp3;
278 }
279
280
281
282 break;
283 }
284
285 case blas_prec_extra:{
286
287 int i, j;
288 int xi, yi, xi0, yi0;
289 int aij, ai;
290 int incai;
291 int incaij, incaij2;
292
293 const float *a_i = a;
294 const float *x_head_i = x_head;
295 const float *x_tail_i = x_tail;
296 double *y_i = y;
297 double alpha_i = alpha;
298 double beta_i = beta;
299 float a_elem;
300 float x_elem;
301 double y_elem;
302 double head_prod1, tail_prod1;
303 double head_prod2, tail_prod2;
304 double head_sum1, tail_sum1;
305 double head_sum2, tail_sum2;
306 double head_tmp1, tail_tmp1;
307 double head_tmp2, tail_tmp2;
308 double head_tmp3, tail_tmp3;
309
310 FPU_FIX_DECL;
311
312 /* Test for no-op */
313 if (n <= 0) {
314 return;
315 }
316 if (alpha_i == 0.0 && beta_i == 1.0) {
317 return;
318 }
319
320 /* Check for error conditions. */
321 if (n < 0) {
322 BLAS_error(routine_name, -3, n, NULL);
323 }
324 if (lda < n) {
325 BLAS_error(routine_name, -6, n, NULL);
326 }
327 if (incx == 0) {
328 BLAS_error(routine_name, -9, incx, NULL);
329 }
330 if (incy == 0) {
331 BLAS_error(routine_name, -12, incy, NULL);
332 }
333
334 if ((order == blas_colmajor && uplo == blas_upper) ||
335 (order == blas_rowmajor && uplo == blas_lower)) {
336 incai = lda;
337 incaij = 1;
338 incaij2 = lda;
339 } else {
340 incai = 1;
341 incaij = lda;
342 incaij2 = 1;
343 }
344
345
346
347
348
349
350 xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
351 yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
352
353 FPU_FIX_START;
354
355 /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
356 for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
357 head_sum1 = tail_sum1 = 0.0;
358 head_sum2 = tail_sum2 = 0.0;
359
360 for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
361 a_elem = a_i[aij];
362 x_elem = x_head_i[xi];
363 head_prod1 = (double) a_elem *x_elem;
364 tail_prod1 = 0.0;
365 {
366 /* Compute double-double = double-double + double-double. */
367 double bv;
368 double s1, s2, t1, t2;
369
370 /* Add two hi words. */
371 s1 = head_sum1 + head_prod1;
372 bv = s1 - head_sum1;
373 s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
374
375 /* Add two lo words. */
376 t1 = tail_sum1 + tail_prod1;
377 bv = t1 - tail_sum1;
378 t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
379
380 s2 += t1;
381
382 /* Renormalize (s1, s2) to (t1, s2) */
383 t1 = s1 + s2;
384 s2 = s2 - (t1 - s1);
385
386 t2 += s2;
387
388 /* Renormalize (t1, t2) */
389 head_sum1 = t1 + t2;
390 tail_sum1 = t2 - (head_sum1 - t1);
391 }
392 x_elem = x_tail_i[xi];
393 head_prod2 = (double) a_elem *x_elem;
394 tail_prod2 = 0.0;
395 {
396 /* Compute double-double = double-double + double-double. */
397 double bv;
398 double s1, s2, t1, t2;
399
400 /* Add two hi words. */
401 s1 = head_sum2 + head_prod2;
402 bv = s1 - head_sum2;
403 s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
404
405 /* Add two lo words. */
406 t1 = tail_sum2 + tail_prod2;
407 bv = t1 - tail_sum2;
408 t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
409
410 s2 += t1;
411
412 /* Renormalize (s1, s2) to (t1, s2) */
413 t1 = s1 + s2;
414 s2 = s2 - (t1 - s1);
415
416 t2 += s2;
417
418 /* Renormalize (t1, t2) */
419 head_sum2 = t1 + t2;
420 tail_sum2 = t2 - (head_sum2 - t1);
421 }
422 }
423 for (; j < n; j++, aij += incaij2, xi += incx) {
424 a_elem = a_i[aij];
425 x_elem = x_head_i[xi];
426 head_prod1 = (double) a_elem *x_elem;
427 tail_prod1 = 0.0;
428 {
429 /* Compute double-double = double-double + double-double. */
430 double bv;
431 double s1, s2, t1, t2;
432
433 /* Add two hi words. */
434 s1 = head_sum1 + head_prod1;
435 bv = s1 - head_sum1;
436 s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
437
438 /* Add two lo words. */
439 t1 = tail_sum1 + tail_prod1;
440 bv = t1 - tail_sum1;
441 t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
442
443 s2 += t1;
444
445 /* Renormalize (s1, s2) to (t1, s2) */
446 t1 = s1 + s2;
447 s2 = s2 - (t1 - s1);
448
449 t2 += s2;
450
451 /* Renormalize (t1, t2) */
452 head_sum1 = t1 + t2;
453 tail_sum1 = t2 - (head_sum1 - t1);
454 }
455 x_elem = x_tail_i[xi];
456 head_prod2 = (double) a_elem *x_elem;
457 tail_prod2 = 0.0;
458 {
459 /* Compute double-double = double-double + double-double. */
460 double bv;
461 double s1, s2, t1, t2;
462
463 /* Add two hi words. */
464 s1 = head_sum2 + head_prod2;
465 bv = s1 - head_sum2;
466 s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
467
468 /* Add two lo words. */
469 t1 = tail_sum2 + tail_prod2;
470 bv = t1 - tail_sum2;
471 t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
472
473 s2 += t1;
474
475 /* Renormalize (s1, s2) to (t1, s2) */
476 t1 = s1 + s2;
477 s2 = s2 - (t1 - s1);
478
479 t2 += s2;
480
481 /* Renormalize (t1, t2) */
482 head_sum2 = t1 + t2;
483 tail_sum2 = t2 - (head_sum2 - t1);
484 }
485 }
486 {
487 /* Compute double-double = double-double + double-double. */
488 double bv;
489 double s1, s2, t1, t2;
490
491 /* Add two hi words. */
492 s1 = head_sum1 + head_sum2;
493 bv = s1 - head_sum1;
494 s2 = ((head_sum2 - bv) + (head_sum1 - (s1 - bv)));
495
496 /* Add two lo words. */
497 t1 = tail_sum1 + tail_sum2;
498 bv = t1 - tail_sum1;
499 t2 = ((tail_sum2 - bv) + (tail_sum1 - (t1 - bv)));
500
501 s2 += t1;
502
503 /* Renormalize (s1, s2) to (t1, s2) */
504 t1 = s1 + s2;
505 s2 = s2 - (t1 - s1);
506
507 t2 += s2;
508
509 /* Renormalize (t1, t2) */
510 head_sum1 = t1 + t2;
511 tail_sum1 = t2 - (head_sum1 - t1);
512 }
513 {
514 /* Compute double-double = double-double * double. */
515 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
516
517 con = head_sum1 * split;
518 a11 = con - head_sum1;
519 a11 = con - a11;
520 a21 = head_sum1 - a11;
521 con = alpha_i * split;
522 b1 = con - alpha_i;
523 b1 = con - b1;
524 b2 = alpha_i - b1;
525
526 c11 = head_sum1 * alpha_i;
527 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
528
529 c2 = tail_sum1 * alpha_i;
530 t1 = c11 + c2;
531 t2 = (c2 - (t1 - c11)) + c21;
532
533 head_tmp1 = t1 + t2;
534 tail_tmp1 = t2 - (head_tmp1 - t1);
535 }
536 y_elem = y_i[yi];
537 {
538 /* Compute double_double = double * double. */
539 double a1, a2, b1, b2, con;
540
541 con = y_elem * split;
542 a1 = con - y_elem;
543 a1 = con - a1;
544 a2 = y_elem - a1;
545 con = beta_i * split;
546 b1 = con - beta_i;
547 b1 = con - b1;
548 b2 = beta_i - b1;
549
550 head_tmp2 = y_elem * beta_i;
551 tail_tmp2 = (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
552 }
553 {
554 /* Compute double-double = double-double + double-double. */
555 double bv;
556 double s1, s2, t1, t2;
557
558 /* Add two hi words. */
559 s1 = head_tmp1 + head_tmp2;
560 bv = s1 - head_tmp1;
561 s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
562
563 /* Add two lo words. */
564 t1 = tail_tmp1 + tail_tmp2;
565 bv = t1 - tail_tmp1;
566 t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
567
568 s2 += t1;
569
570 /* Renormalize (s1, s2) to (t1, s2) */
571 t1 = s1 + s2;
572 s2 = s2 - (t1 - s1);
573
574 t2 += s2;
575
576 /* Renormalize (t1, t2) */
577 head_tmp3 = t1 + t2;
578 tail_tmp3 = t2 - (head_tmp3 - t1);
579 }
580 y_i[yi] = head_tmp3;
581 }
582
583 FPU_FIX_STOP;
584
585 break;
586 }
587 }
588 } /* end BLAS_dsymv2_s_s_x */
589