1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_dgemv_s_d_x(enum blas_order_type order,enum blas_trans_type trans,int m,int n,double alpha,const float * a,int lda,const double * x,int incx,double beta,double * y,int incy,enum blas_prec_type prec)3 void BLAS_dgemv_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
4 		      int m, int n, double alpha, const float *a, int lda,
5 		      const double *x, int incx, double beta, double *y,
6 		      int incy, enum blas_prec_type prec)
7 
8 /*
9  * Purpose
10  * =======
11  *
12  * Computes y = alpha * A * x + beta * y, where A is a general matrix.
13  *
14  * Arguments
15  * =========
16  *
17  * order        (input) blas_order_type
18  *              Order of AP; row or column major
19  *
20  * trans        (input) blas_trans_type
21  *              Transpose of AB; no trans,
22  *              trans, or conjugate trans
23  *
24  * m            (input) int
25  *              Dimension of AB
26  *
27  * n            (input) int
28  *              Dimension of AB and the length of vector x
29  *
30  * alpha        (input) double
31  *
32  * A            (input) const float*
33  *
34  * lda          (input) int
35  *              Leading dimension of A
36  *
37  * x            (input) const double*
38  *
39  * incx         (input) int
40  *              The stride for vector x.
41  *
42  * beta         (input) double
43  *
44  * y            (input/output) double*
45  *
46  * incy         (input) int
47  *              The stride for vector y.
48  *
49  * prec   (input) enum blas_prec_type
50  *        Specifies the internal precision to be used.
51  *        = blas_prec_single: single precision.
52  *        = blas_prec_double: double precision.
53  *        = blas_prec_extra : anything at least 1.5 times as accurate
54  *                            than double, and wider than 80-bits.
55  *                            We use double-double in our implementation.
56  *
57  */
58 {
59   static const char routine_name[] = "BLAS_dgemv_s_d_x";
60   switch (prec) {
61   case blas_prec_single:
62   case blas_prec_double:
63   case blas_prec_indigenous:{
64 
65       int i, j;
66       int iy, jx, kx, ky;
67       int lenx, leny;
68       int ai, aij;
69       int incai, incaij;
70 
71       const float *a_i = a;
72       const double *x_i = x;
73       double *y_i = y;
74       double alpha_i = alpha;
75       double beta_i = beta;
76       float a_elem;
77       double x_elem;
78       double y_elem;
79       double prod;
80       double sum;
81       double tmp1;
82       double tmp2;
83 
84 
85       /* all error calls */
86       if (m < 0)
87 	BLAS_error(routine_name, -3, m, 0);
88       else if (n <= 0)
89 	BLAS_error(routine_name, -4, n, 0);
90       else if (incx == 0)
91 	BLAS_error(routine_name, -9, incx, 0);
92       else if (incy == 0)
93 	BLAS_error(routine_name, -12, incy, 0);
94 
95       if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
96 	lenx = n;
97 	leny = m;
98 	incai = lda;
99 	incaij = 1;
100       } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
101 	lenx = m;
102 	leny = n;
103 	incai = 1;
104 	incaij = lda;
105       } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
106 	lenx = n;
107 	leny = m;
108 	incai = 1;
109 	incaij = lda;
110       } else {			/* colmajor and blas_trans */
111 	lenx = m;
112 	leny = n;
113 	incai = lda;
114 	incaij = 1;
115       }
116       if ((order == blas_colmajor && lda < m) ||
117 	  (order == blas_rowmajor && lda < n))
118 	BLAS_error(routine_name, -7, lda, NULL);
119 
120 
121 
122 
123 
124 
125 
126 
127       if (incx > 0)
128 	kx = 0;
129       else
130 	kx = (1 - lenx) * incx;
131       if (incy > 0)
132 	ky = 0;
133       else
134 	ky = (1 - leny) * incy;
135 
136       /* No extra-precision needed for alpha = 0 */
137       if (alpha_i == 0.0) {
138 	if (beta_i == 0.0) {
139 	  iy = ky;
140 	  for (i = 0; i < leny; i++) {
141 	    y_i[iy] = 0.0;
142 	    iy += incy;
143 	  }
144 	} else if (!(beta_i == 0.0)) {
145 	  iy = ky;
146 	  for (i = 0; i < leny; i++) {
147 	    y_elem = y_i[iy];
148 	    tmp1 = y_elem * beta_i;
149 	    y_i[iy] = tmp1;
150 	    iy += incy;
151 	  }
152 	}
153       } else {
154 
155 	/* if beta = 0, we can save m multiplies: y = alpha*A*x */
156 	if (beta_i == 0.0) {
157 	  /* save m more multiplies if alpha = 1 */
158 	  if (alpha_i == 1.0) {
159 	    ai = 0;
160 	    iy = ky;
161 	    for (i = 0; i < leny; i++) {
162 	      sum = 0.0;
163 	      aij = ai;
164 	      jx = kx;
165 	      for (j = 0; j < lenx; j++) {
166 		a_elem = a_i[aij];
167 
168 		x_elem = x_i[jx];
169 		prod = a_elem * x_elem;
170 		sum = sum + prod;
171 		aij += incaij;
172 		jx += incx;
173 	      }
174 	      y_i[iy] = sum;
175 	      ai += incai;
176 	      iy += incy;
177 	    }
178 	  } else {
179 	    ai = 0;
180 	    iy = ky;
181 	    for (i = 0; i < leny; i++) {
182 	      sum = 0.0;
183 	      aij = ai;
184 	      jx = kx;
185 	      for (j = 0; j < lenx; j++) {
186 		a_elem = a_i[aij];
187 
188 		x_elem = x_i[jx];
189 		prod = a_elem * x_elem;
190 		sum = sum + prod;
191 		aij += incaij;
192 		jx += incx;
193 	      }
194 	      tmp1 = sum * alpha_i;
195 	      y_i[iy] = tmp1;
196 	      ai += incai;
197 	      iy += incy;
198 	    }
199 	  }
200 	} else {
201 	  /* the most general form, y = alpha*A*x + beta*y */
202 	  ai = 0;
203 	  iy = ky;
204 	  for (i = 0; i < leny; i++) {
205 	    sum = 0.0;;
206 	    aij = ai;
207 	    jx = kx;
208 	    for (j = 0; j < lenx; j++) {
209 	      a_elem = a_i[aij];
210 
211 	      x_elem = x_i[jx];
212 	      prod = a_elem * x_elem;
213 	      sum = sum + prod;
214 	      aij += incaij;
215 	      jx += incx;
216 	    }
217 	    tmp1 = sum * alpha_i;
218 	    y_elem = y_i[iy];
219 	    tmp2 = y_elem * beta_i;
220 	    tmp1 = tmp1 + tmp2;
221 	    y_i[iy] = tmp1;
222 	    ai += incai;
223 	    iy += incy;
224 	  }
225 	}
226 
227       }
228 
229 
230 
231       break;
232     }
233   case blas_prec_extra:{
234 
235       int i, j;
236       int iy, jx, kx, ky;
237       int lenx, leny;
238       int ai, aij;
239       int incai, incaij;
240 
241       const float *a_i = a;
242       const double *x_i = x;
243       double *y_i = y;
244       double alpha_i = alpha;
245       double beta_i = beta;
246       float a_elem;
247       double x_elem;
248       double y_elem;
249       double head_prod, tail_prod;
250       double head_sum, tail_sum;
251       double head_tmp1, tail_tmp1;
252       double head_tmp2, tail_tmp2;
253       FPU_FIX_DECL;
254 
255       /* all error calls */
256       if (m < 0)
257 	BLAS_error(routine_name, -3, m, 0);
258       else if (n <= 0)
259 	BLAS_error(routine_name, -4, n, 0);
260       else if (incx == 0)
261 	BLAS_error(routine_name, -9, incx, 0);
262       else if (incy == 0)
263 	BLAS_error(routine_name, -12, incy, 0);
264 
265       if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
266 	lenx = n;
267 	leny = m;
268 	incai = lda;
269 	incaij = 1;
270       } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
271 	lenx = m;
272 	leny = n;
273 	incai = 1;
274 	incaij = lda;
275       } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
276 	lenx = n;
277 	leny = m;
278 	incai = 1;
279 	incaij = lda;
280       } else {			/* colmajor and blas_trans */
281 	lenx = m;
282 	leny = n;
283 	incai = lda;
284 	incaij = 1;
285       }
286       if ((order == blas_colmajor && lda < m) ||
287 	  (order == blas_rowmajor && lda < n))
288 	BLAS_error(routine_name, -7, lda, NULL);
289 
290       FPU_FIX_START;
291 
292 
293 
294 
295 
296 
297       if (incx > 0)
298 	kx = 0;
299       else
300 	kx = (1 - lenx) * incx;
301       if (incy > 0)
302 	ky = 0;
303       else
304 	ky = (1 - leny) * incy;
305 
306       /* No extra-precision needed for alpha = 0 */
307       if (alpha_i == 0.0) {
308 	if (beta_i == 0.0) {
309 	  iy = ky;
310 	  for (i = 0; i < leny; i++) {
311 	    y_i[iy] = 0.0;
312 	    iy += incy;
313 	  }
314 	} else if (!(beta_i == 0.0)) {
315 	  iy = ky;
316 	  for (i = 0; i < leny; i++) {
317 	    y_elem = y_i[iy];
318 	    {
319 	      /* Compute double_double = double * double. */
320 	      double a1, a2, b1, b2, con;
321 
322 	      con = y_elem * split;
323 	      a1 = con - y_elem;
324 	      a1 = con - a1;
325 	      a2 = y_elem - a1;
326 	      con = beta_i * split;
327 	      b1 = con - beta_i;
328 	      b1 = con - b1;
329 	      b2 = beta_i - b1;
330 
331 	      head_tmp1 = y_elem * beta_i;
332 	      tail_tmp1 =
333 		(((a1 * b1 - head_tmp1) + a1 * b2) + a2 * b1) + a2 * b2;
334 	    }
335 	    y_i[iy] = head_tmp1;
336 	    iy += incy;
337 	  }
338 	}
339       } else {
340 
341 	/* if beta = 0, we can save m multiplies: y = alpha*A*x */
342 	if (beta_i == 0.0) {
343 	  /* save m more multiplies if alpha = 1 */
344 	  if (alpha_i == 1.0) {
345 	    ai = 0;
346 	    iy = ky;
347 	    for (i = 0; i < leny; i++) {
348 	      head_sum = tail_sum = 0.0;
349 	      aij = ai;
350 	      jx = kx;
351 	      for (j = 0; j < lenx; j++) {
352 		a_elem = a_i[aij];
353 
354 		x_elem = x_i[jx];
355 		{
356 		  double dt = (double) a_elem;
357 		  {
358 		    /* Compute double_double = double * double. */
359 		    double a1, a2, b1, b2, con;
360 
361 		    con = dt * split;
362 		    a1 = con - dt;
363 		    a1 = con - a1;
364 		    a2 = dt - a1;
365 		    con = x_elem * split;
366 		    b1 = con - x_elem;
367 		    b1 = con - b1;
368 		    b2 = x_elem - b1;
369 
370 		    head_prod = dt * x_elem;
371 		    tail_prod =
372 		      (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
373 		  }
374 		}
375 		{
376 		  /* Compute double-double = double-double + double-double. */
377 		  double bv;
378 		  double s1, s2, t1, t2;
379 
380 		  /* Add two hi words. */
381 		  s1 = head_sum + head_prod;
382 		  bv = s1 - head_sum;
383 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
384 
385 		  /* Add two lo words. */
386 		  t1 = tail_sum + tail_prod;
387 		  bv = t1 - tail_sum;
388 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
389 
390 		  s2 += t1;
391 
392 		  /* Renormalize (s1, s2)  to  (t1, s2) */
393 		  t1 = s1 + s2;
394 		  s2 = s2 - (t1 - s1);
395 
396 		  t2 += s2;
397 
398 		  /* Renormalize (t1, t2)  */
399 		  head_sum = t1 + t2;
400 		  tail_sum = t2 - (head_sum - t1);
401 		}
402 		aij += incaij;
403 		jx += incx;
404 	      }
405 	      y_i[iy] = head_sum;
406 	      ai += incai;
407 	      iy += incy;
408 	    }
409 	  } else {
410 	    ai = 0;
411 	    iy = ky;
412 	    for (i = 0; i < leny; i++) {
413 	      head_sum = tail_sum = 0.0;
414 	      aij = ai;
415 	      jx = kx;
416 	      for (j = 0; j < lenx; j++) {
417 		a_elem = a_i[aij];
418 
419 		x_elem = x_i[jx];
420 		{
421 		  double dt = (double) a_elem;
422 		  {
423 		    /* Compute double_double = double * double. */
424 		    double a1, a2, b1, b2, con;
425 
426 		    con = dt * split;
427 		    a1 = con - dt;
428 		    a1 = con - a1;
429 		    a2 = dt - a1;
430 		    con = x_elem * split;
431 		    b1 = con - x_elem;
432 		    b1 = con - b1;
433 		    b2 = x_elem - b1;
434 
435 		    head_prod = dt * x_elem;
436 		    tail_prod =
437 		      (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
438 		  }
439 		}
440 		{
441 		  /* Compute double-double = double-double + double-double. */
442 		  double bv;
443 		  double s1, s2, t1, t2;
444 
445 		  /* Add two hi words. */
446 		  s1 = head_sum + head_prod;
447 		  bv = s1 - head_sum;
448 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
449 
450 		  /* Add two lo words. */
451 		  t1 = tail_sum + tail_prod;
452 		  bv = t1 - tail_sum;
453 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
454 
455 		  s2 += t1;
456 
457 		  /* Renormalize (s1, s2)  to  (t1, s2) */
458 		  t1 = s1 + s2;
459 		  s2 = s2 - (t1 - s1);
460 
461 		  t2 += s2;
462 
463 		  /* Renormalize (t1, t2)  */
464 		  head_sum = t1 + t2;
465 		  tail_sum = t2 - (head_sum - t1);
466 		}
467 		aij += incaij;
468 		jx += incx;
469 	      }
470 	      {
471 		/* Compute double-double = double-double * double. */
472 		double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
473 
474 		con = head_sum * split;
475 		a11 = con - head_sum;
476 		a11 = con - a11;
477 		a21 = head_sum - a11;
478 		con = alpha_i * split;
479 		b1 = con - alpha_i;
480 		b1 = con - b1;
481 		b2 = alpha_i - b1;
482 
483 		c11 = head_sum * alpha_i;
484 		c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
485 
486 		c2 = tail_sum * alpha_i;
487 		t1 = c11 + c2;
488 		t2 = (c2 - (t1 - c11)) + c21;
489 
490 		head_tmp1 = t1 + t2;
491 		tail_tmp1 = t2 - (head_tmp1 - t1);
492 	      }
493 	      y_i[iy] = head_tmp1;
494 	      ai += incai;
495 	      iy += incy;
496 	    }
497 	  }
498 	} else {
499 	  /* the most general form, y = alpha*A*x + beta*y */
500 	  ai = 0;
501 	  iy = ky;
502 	  for (i = 0; i < leny; i++) {
503 	    head_sum = tail_sum = 0.0;;
504 	    aij = ai;
505 	    jx = kx;
506 	    for (j = 0; j < lenx; j++) {
507 	      a_elem = a_i[aij];
508 
509 	      x_elem = x_i[jx];
510 	      {
511 		double dt = (double) a_elem;
512 		{
513 		  /* Compute double_double = double * double. */
514 		  double a1, a2, b1, b2, con;
515 
516 		  con = dt * split;
517 		  a1 = con - dt;
518 		  a1 = con - a1;
519 		  a2 = dt - a1;
520 		  con = x_elem * split;
521 		  b1 = con - x_elem;
522 		  b1 = con - b1;
523 		  b2 = x_elem - b1;
524 
525 		  head_prod = dt * x_elem;
526 		  tail_prod =
527 		    (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
528 		}
529 	      }
530 	      {
531 		/* Compute double-double = double-double + double-double. */
532 		double bv;
533 		double s1, s2, t1, t2;
534 
535 		/* Add two hi words. */
536 		s1 = head_sum + head_prod;
537 		bv = s1 - head_sum;
538 		s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
539 
540 		/* Add two lo words. */
541 		t1 = tail_sum + tail_prod;
542 		bv = t1 - tail_sum;
543 		t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
544 
545 		s2 += t1;
546 
547 		/* Renormalize (s1, s2)  to  (t1, s2) */
548 		t1 = s1 + s2;
549 		s2 = s2 - (t1 - s1);
550 
551 		t2 += s2;
552 
553 		/* Renormalize (t1, t2)  */
554 		head_sum = t1 + t2;
555 		tail_sum = t2 - (head_sum - t1);
556 	      }
557 	      aij += incaij;
558 	      jx += incx;
559 	    }
560 	    {
561 	      /* Compute double-double = double-double * double. */
562 	      double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
563 
564 	      con = head_sum * split;
565 	      a11 = con - head_sum;
566 	      a11 = con - a11;
567 	      a21 = head_sum - a11;
568 	      con = alpha_i * split;
569 	      b1 = con - alpha_i;
570 	      b1 = con - b1;
571 	      b2 = alpha_i - b1;
572 
573 	      c11 = head_sum * alpha_i;
574 	      c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
575 
576 	      c2 = tail_sum * alpha_i;
577 	      t1 = c11 + c2;
578 	      t2 = (c2 - (t1 - c11)) + c21;
579 
580 	      head_tmp1 = t1 + t2;
581 	      tail_tmp1 = t2 - (head_tmp1 - t1);
582 	    }
583 	    y_elem = y_i[iy];
584 	    {
585 	      /* Compute double_double = double * double. */
586 	      double a1, a2, b1, b2, con;
587 
588 	      con = y_elem * split;
589 	      a1 = con - y_elem;
590 	      a1 = con - a1;
591 	      a2 = y_elem - a1;
592 	      con = beta_i * split;
593 	      b1 = con - beta_i;
594 	      b1 = con - b1;
595 	      b2 = beta_i - b1;
596 
597 	      head_tmp2 = y_elem * beta_i;
598 	      tail_tmp2 =
599 		(((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
600 	    }
601 	    {
602 	      /* Compute double-double = double-double + double-double. */
603 	      double bv;
604 	      double s1, s2, t1, t2;
605 
606 	      /* Add two hi words. */
607 	      s1 = head_tmp1 + head_tmp2;
608 	      bv = s1 - head_tmp1;
609 	      s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
610 
611 	      /* Add two lo words. */
612 	      t1 = tail_tmp1 + tail_tmp2;
613 	      bv = t1 - tail_tmp1;
614 	      t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
615 
616 	      s2 += t1;
617 
618 	      /* Renormalize (s1, s2)  to  (t1, s2) */
619 	      t1 = s1 + s2;
620 	      s2 = s2 - (t1 - s1);
621 
622 	      t2 += s2;
623 
624 	      /* Renormalize (t1, t2)  */
625 	      head_tmp1 = t1 + t2;
626 	      tail_tmp1 = t2 - (head_tmp1 - t1);
627 	    }
628 	    y_i[iy] = head_tmp1;
629 	    ai += incai;
630 	    iy += incy;
631 	  }
632 	}
633 
634       }
635 
636       FPU_FIX_STOP;
637     }
638     break;
639   }
640 }
641