1 #include <blas_extended.h>
2 #include <blas_extended_private.h>
3 #include <blas_fpu.h>
BLAS_dsymv2_s_s_x(enum blas_order_type order,enum blas_uplo_type uplo,int n,double alpha,const float * a,int lda,const float * x_head,const float * x_tail,int incx,double beta,double * y,int incy,enum blas_prec_type prec)4 void BLAS_dsymv2_s_s_x(enum blas_order_type order, enum blas_uplo_type uplo,
5 		       int n, double alpha, const float *a, int lda,
6 		       const float *x_head, const float *x_tail, int incx,
7 		       double beta, double *y, int incy,
8 		       enum blas_prec_type prec)
9 
10 /*
11  * Purpose
12  * =======
13  *
14  * This routines computes the matrix product:
15  *
16  *     y  <-  alpha * A * (x_head + x_tail) + beta * y
17  *
18  * where A is a symmetric matrix.
19  *
20  * Arguments
21  * =========
22  *
23  * order   (input) enum blas_order_type
24  *         Storage format of input symmetric matrix A.
25  *
26  * uplo    (input) enum blas_uplo_type
27  *         Determines which half of matrix A (upper or lower triangle)
28  *         is accessed.
29  *
30  * n       (input) int
31  *         Dimension of A and size of vectors x, y.
32  *
33  * alpha   (input) double
34  *
35  * a       (input) float*
36  *         Matrix A.
37  *
38  * lda     (input) int
39  *         Leading dimension of matrix A.
40  *
41  * x_head  (input) float*
42  *         Vector x_head
43  *
44  * x_tail  (input) float*
45  *         Vector x_tail
46  *
47  * incx    (input) int
48  *         Stride for vector x.
49  *
50  * beta    (input) double
51  *
52  * y       (input) float*
53  *         Vector y.
54  *
55  * incy    (input) int
56  *         Stride for vector y.
57  *
58  * prec   (input) enum blas_prec_type
59  *        Specifies the internal precision to be used.
60  *        = blas_prec_single: single precision.
61  *        = blas_prec_double: double precision.
62  *        = blas_prec_extra : anything at least 1.5 times as accurate
63  *                            than double, and wider than 80-bits.
64  *                            We use double-double in our implementation.
65  *
66  */
67 {
68   /* Routine name */
69   const char routine_name[] = "BLAS_dsymv2_s_s_x";
70   switch (prec) {
71 
72   case blas_prec_single:{
73 
74       int i, j;
75       int xi, yi, xi0, yi0;
76       int aij, ai;
77       int incai;
78       int incaij, incaij2;
79 
80       const float *a_i = a;
81       const float *x_head_i = x_head;
82       const float *x_tail_i = x_tail;
83       double *y_i = y;
84       double alpha_i = alpha;
85       double beta_i = beta;
86       float a_elem;
87       float x_elem;
88       double y_elem;
89       double prod1;
90       double prod2;
91       double sum1;
92       double sum2;
93       double tmp1;
94       double tmp2;
95       double tmp3;
96 
97 
98 
99       /* Test for no-op */
100       if (n <= 0) {
101 	return;
102       }
103       if (alpha_i == 0.0 && beta_i == 1.0) {
104 	return;
105       }
106 
107       /* Check for error conditions. */
108       if (n < 0) {
109 	BLAS_error(routine_name, -3, n, NULL);
110       }
111       if (lda < n) {
112 	BLAS_error(routine_name, -6, n, NULL);
113       }
114       if (incx == 0) {
115 	BLAS_error(routine_name, -9, incx, NULL);
116       }
117       if (incy == 0) {
118 	BLAS_error(routine_name, -12, incy, NULL);
119       }
120 
121       if ((order == blas_colmajor && uplo == blas_upper) ||
122 	  (order == blas_rowmajor && uplo == blas_lower)) {
123 	incai = lda;
124 	incaij = 1;
125 	incaij2 = lda;
126       } else {
127 	incai = 1;
128 	incaij = lda;
129 	incaij2 = 1;
130       }
131 
132 
133 
134 
135 
136 
137       xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
138       yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
139 
140 
141 
142       /* The most general form,   y <--- alpha * A * (x_head + x_tail) + beta * y   */
143       for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
144 	sum1 = 0.0;
145 	sum2 = 0.0;
146 
147 	for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
148 	  a_elem = a_i[aij];
149 	  x_elem = x_head_i[xi];
150 	  prod1 = (double) a_elem *x_elem;
151 	  sum1 = sum1 + prod1;
152 	  x_elem = x_tail_i[xi];
153 	  prod2 = (double) a_elem *x_elem;
154 	  sum2 = sum2 + prod2;
155 	}
156 	for (; j < n; j++, aij += incaij2, xi += incx) {
157 	  a_elem = a_i[aij];
158 	  x_elem = x_head_i[xi];
159 	  prod1 = (double) a_elem *x_elem;
160 	  sum1 = sum1 + prod1;
161 	  x_elem = x_tail_i[xi];
162 	  prod2 = (double) a_elem *x_elem;
163 	  sum2 = sum2 + prod2;
164 	}
165 	sum1 = sum1 + sum2;
166 	tmp1 = sum1 * alpha_i;
167 	y_elem = y_i[yi];
168 	tmp2 = y_elem * beta_i;
169 	tmp3 = tmp1 + tmp2;
170 	y_i[yi] = tmp3;
171       }
172 
173 
174 
175       break;
176     }
177 
178   case blas_prec_double:
179   case blas_prec_indigenous:{
180 
181       int i, j;
182       int xi, yi, xi0, yi0;
183       int aij, ai;
184       int incai;
185       int incaij, incaij2;
186 
187       const float *a_i = a;
188       const float *x_head_i = x_head;
189       const float *x_tail_i = x_tail;
190       double *y_i = y;
191       double alpha_i = alpha;
192       double beta_i = beta;
193       float a_elem;
194       float x_elem;
195       double y_elem;
196       double prod1;
197       double prod2;
198       double sum1;
199       double sum2;
200       double tmp1;
201       double tmp2;
202       double tmp3;
203 
204 
205 
206       /* Test for no-op */
207       if (n <= 0) {
208 	return;
209       }
210       if (alpha_i == 0.0 && beta_i == 1.0) {
211 	return;
212       }
213 
214       /* Check for error conditions. */
215       if (n < 0) {
216 	BLAS_error(routine_name, -3, n, NULL);
217       }
218       if (lda < n) {
219 	BLAS_error(routine_name, -6, n, NULL);
220       }
221       if (incx == 0) {
222 	BLAS_error(routine_name, -9, incx, NULL);
223       }
224       if (incy == 0) {
225 	BLAS_error(routine_name, -12, incy, NULL);
226       }
227 
228       if ((order == blas_colmajor && uplo == blas_upper) ||
229 	  (order == blas_rowmajor && uplo == blas_lower)) {
230 	incai = lda;
231 	incaij = 1;
232 	incaij2 = lda;
233       } else {
234 	incai = 1;
235 	incaij = lda;
236 	incaij2 = 1;
237       }
238 
239 
240 
241 
242 
243 
244       xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
245       yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
246 
247 
248 
249       /* The most general form,   y <--- alpha * A * (x_head + x_tail) + beta * y   */
250       for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
251 	sum1 = 0.0;
252 	sum2 = 0.0;
253 
254 	for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
255 	  a_elem = a_i[aij];
256 	  x_elem = x_head_i[xi];
257 	  prod1 = (double) a_elem *x_elem;
258 	  sum1 = sum1 + prod1;
259 	  x_elem = x_tail_i[xi];
260 	  prod2 = (double) a_elem *x_elem;
261 	  sum2 = sum2 + prod2;
262 	}
263 	for (; j < n; j++, aij += incaij2, xi += incx) {
264 	  a_elem = a_i[aij];
265 	  x_elem = x_head_i[xi];
266 	  prod1 = (double) a_elem *x_elem;
267 	  sum1 = sum1 + prod1;
268 	  x_elem = x_tail_i[xi];
269 	  prod2 = (double) a_elem *x_elem;
270 	  sum2 = sum2 + prod2;
271 	}
272 	sum1 = sum1 + sum2;
273 	tmp1 = sum1 * alpha_i;
274 	y_elem = y_i[yi];
275 	tmp2 = y_elem * beta_i;
276 	tmp3 = tmp1 + tmp2;
277 	y_i[yi] = tmp3;
278       }
279 
280 
281 
282       break;
283     }
284 
285   case blas_prec_extra:{
286 
287       int i, j;
288       int xi, yi, xi0, yi0;
289       int aij, ai;
290       int incai;
291       int incaij, incaij2;
292 
293       const float *a_i = a;
294       const float *x_head_i = x_head;
295       const float *x_tail_i = x_tail;
296       double *y_i = y;
297       double alpha_i = alpha;
298       double beta_i = beta;
299       float a_elem;
300       float x_elem;
301       double y_elem;
302       double head_prod1, tail_prod1;
303       double head_prod2, tail_prod2;
304       double head_sum1, tail_sum1;
305       double head_sum2, tail_sum2;
306       double head_tmp1, tail_tmp1;
307       double head_tmp2, tail_tmp2;
308       double head_tmp3, tail_tmp3;
309 
310       FPU_FIX_DECL;
311 
312       /* Test for no-op */
313       if (n <= 0) {
314 	return;
315       }
316       if (alpha_i == 0.0 && beta_i == 1.0) {
317 	return;
318       }
319 
320       /* Check for error conditions. */
321       if (n < 0) {
322 	BLAS_error(routine_name, -3, n, NULL);
323       }
324       if (lda < n) {
325 	BLAS_error(routine_name, -6, n, NULL);
326       }
327       if (incx == 0) {
328 	BLAS_error(routine_name, -9, incx, NULL);
329       }
330       if (incy == 0) {
331 	BLAS_error(routine_name, -12, incy, NULL);
332       }
333 
334       if ((order == blas_colmajor && uplo == blas_upper) ||
335 	  (order == blas_rowmajor && uplo == blas_lower)) {
336 	incai = lda;
337 	incaij = 1;
338 	incaij2 = lda;
339       } else {
340 	incai = 1;
341 	incaij = lda;
342 	incaij2 = 1;
343       }
344 
345 
346 
347 
348 
349 
350       xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
351       yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
352 
353       FPU_FIX_START;
354 
355       /* The most general form,   y <--- alpha * A * (x_head + x_tail) + beta * y   */
356       for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
357 	head_sum1 = tail_sum1 = 0.0;
358 	head_sum2 = tail_sum2 = 0.0;
359 
360 	for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
361 	  a_elem = a_i[aij];
362 	  x_elem = x_head_i[xi];
363 	  head_prod1 = (double) a_elem *x_elem;
364 	  tail_prod1 = 0.0;
365 	  {
366 	    /* Compute double-double = double-double + double-double. */
367 	    double bv;
368 	    double s1, s2, t1, t2;
369 
370 	    /* Add two hi words. */
371 	    s1 = head_sum1 + head_prod1;
372 	    bv = s1 - head_sum1;
373 	    s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
374 
375 	    /* Add two lo words. */
376 	    t1 = tail_sum1 + tail_prod1;
377 	    bv = t1 - tail_sum1;
378 	    t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
379 
380 	    s2 += t1;
381 
382 	    /* Renormalize (s1, s2)  to  (t1, s2) */
383 	    t1 = s1 + s2;
384 	    s2 = s2 - (t1 - s1);
385 
386 	    t2 += s2;
387 
388 	    /* Renormalize (t1, t2)  */
389 	    head_sum1 = t1 + t2;
390 	    tail_sum1 = t2 - (head_sum1 - t1);
391 	  }
392 	  x_elem = x_tail_i[xi];
393 	  head_prod2 = (double) a_elem *x_elem;
394 	  tail_prod2 = 0.0;
395 	  {
396 	    /* Compute double-double = double-double + double-double. */
397 	    double bv;
398 	    double s1, s2, t1, t2;
399 
400 	    /* Add two hi words. */
401 	    s1 = head_sum2 + head_prod2;
402 	    bv = s1 - head_sum2;
403 	    s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
404 
405 	    /* Add two lo words. */
406 	    t1 = tail_sum2 + tail_prod2;
407 	    bv = t1 - tail_sum2;
408 	    t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
409 
410 	    s2 += t1;
411 
412 	    /* Renormalize (s1, s2)  to  (t1, s2) */
413 	    t1 = s1 + s2;
414 	    s2 = s2 - (t1 - s1);
415 
416 	    t2 += s2;
417 
418 	    /* Renormalize (t1, t2)  */
419 	    head_sum2 = t1 + t2;
420 	    tail_sum2 = t2 - (head_sum2 - t1);
421 	  }
422 	}
423 	for (; j < n; j++, aij += incaij2, xi += incx) {
424 	  a_elem = a_i[aij];
425 	  x_elem = x_head_i[xi];
426 	  head_prod1 = (double) a_elem *x_elem;
427 	  tail_prod1 = 0.0;
428 	  {
429 	    /* Compute double-double = double-double + double-double. */
430 	    double bv;
431 	    double s1, s2, t1, t2;
432 
433 	    /* Add two hi words. */
434 	    s1 = head_sum1 + head_prod1;
435 	    bv = s1 - head_sum1;
436 	    s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
437 
438 	    /* Add two lo words. */
439 	    t1 = tail_sum1 + tail_prod1;
440 	    bv = t1 - tail_sum1;
441 	    t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
442 
443 	    s2 += t1;
444 
445 	    /* Renormalize (s1, s2)  to  (t1, s2) */
446 	    t1 = s1 + s2;
447 	    s2 = s2 - (t1 - s1);
448 
449 	    t2 += s2;
450 
451 	    /* Renormalize (t1, t2)  */
452 	    head_sum1 = t1 + t2;
453 	    tail_sum1 = t2 - (head_sum1 - t1);
454 	  }
455 	  x_elem = x_tail_i[xi];
456 	  head_prod2 = (double) a_elem *x_elem;
457 	  tail_prod2 = 0.0;
458 	  {
459 	    /* Compute double-double = double-double + double-double. */
460 	    double bv;
461 	    double s1, s2, t1, t2;
462 
463 	    /* Add two hi words. */
464 	    s1 = head_sum2 + head_prod2;
465 	    bv = s1 - head_sum2;
466 	    s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
467 
468 	    /* Add two lo words. */
469 	    t1 = tail_sum2 + tail_prod2;
470 	    bv = t1 - tail_sum2;
471 	    t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
472 
473 	    s2 += t1;
474 
475 	    /* Renormalize (s1, s2)  to  (t1, s2) */
476 	    t1 = s1 + s2;
477 	    s2 = s2 - (t1 - s1);
478 
479 	    t2 += s2;
480 
481 	    /* Renormalize (t1, t2)  */
482 	    head_sum2 = t1 + t2;
483 	    tail_sum2 = t2 - (head_sum2 - t1);
484 	  }
485 	}
486 	{
487 	  /* Compute double-double = double-double + double-double. */
488 	  double bv;
489 	  double s1, s2, t1, t2;
490 
491 	  /* Add two hi words. */
492 	  s1 = head_sum1 + head_sum2;
493 	  bv = s1 - head_sum1;
494 	  s2 = ((head_sum2 - bv) + (head_sum1 - (s1 - bv)));
495 
496 	  /* Add two lo words. */
497 	  t1 = tail_sum1 + tail_sum2;
498 	  bv = t1 - tail_sum1;
499 	  t2 = ((tail_sum2 - bv) + (tail_sum1 - (t1 - bv)));
500 
501 	  s2 += t1;
502 
503 	  /* Renormalize (s1, s2)  to  (t1, s2) */
504 	  t1 = s1 + s2;
505 	  s2 = s2 - (t1 - s1);
506 
507 	  t2 += s2;
508 
509 	  /* Renormalize (t1, t2)  */
510 	  head_sum1 = t1 + t2;
511 	  tail_sum1 = t2 - (head_sum1 - t1);
512 	}
513 	{
514 	  /* Compute double-double = double-double * double. */
515 	  double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
516 
517 	  con = head_sum1 * split;
518 	  a11 = con - head_sum1;
519 	  a11 = con - a11;
520 	  a21 = head_sum1 - a11;
521 	  con = alpha_i * split;
522 	  b1 = con - alpha_i;
523 	  b1 = con - b1;
524 	  b2 = alpha_i - b1;
525 
526 	  c11 = head_sum1 * alpha_i;
527 	  c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
528 
529 	  c2 = tail_sum1 * alpha_i;
530 	  t1 = c11 + c2;
531 	  t2 = (c2 - (t1 - c11)) + c21;
532 
533 	  head_tmp1 = t1 + t2;
534 	  tail_tmp1 = t2 - (head_tmp1 - t1);
535 	}
536 	y_elem = y_i[yi];
537 	{
538 	  /* Compute double_double = double * double. */
539 	  double a1, a2, b1, b2, con;
540 
541 	  con = y_elem * split;
542 	  a1 = con - y_elem;
543 	  a1 = con - a1;
544 	  a2 = y_elem - a1;
545 	  con = beta_i * split;
546 	  b1 = con - beta_i;
547 	  b1 = con - b1;
548 	  b2 = beta_i - b1;
549 
550 	  head_tmp2 = y_elem * beta_i;
551 	  tail_tmp2 = (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
552 	}
553 	{
554 	  /* Compute double-double = double-double + double-double. */
555 	  double bv;
556 	  double s1, s2, t1, t2;
557 
558 	  /* Add two hi words. */
559 	  s1 = head_tmp1 + head_tmp2;
560 	  bv = s1 - head_tmp1;
561 	  s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
562 
563 	  /* Add two lo words. */
564 	  t1 = tail_tmp1 + tail_tmp2;
565 	  bv = t1 - tail_tmp1;
566 	  t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
567 
568 	  s2 += t1;
569 
570 	  /* Renormalize (s1, s2)  to  (t1, s2) */
571 	  t1 = s1 + s2;
572 	  s2 = s2 - (t1 - s1);
573 
574 	  t2 += s2;
575 
576 	  /* Renormalize (t1, t2)  */
577 	  head_tmp3 = t1 + t2;
578 	  tail_tmp3 = t2 - (head_tmp3 - t1);
579 	}
580 	y_i[yi] = head_tmp3;
581       }
582 
583       FPU_FIX_STOP;
584 
585       break;
586     }
587   }
588 }				/* end BLAS_dsymv2_s_s_x */
589