1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_dgemv_s_d_x(enum blas_order_type order,enum blas_trans_type trans,int m,int n,double alpha,const float * a,int lda,const double * x,int incx,double beta,double * y,int incy,enum blas_prec_type prec)3 void BLAS_dgemv_s_d_x(enum blas_order_type order, enum blas_trans_type trans,
4 int m, int n, double alpha, const float *a, int lda,
5 const double *x, int incx, double beta, double *y,
6 int incy, enum blas_prec_type prec)
7
8 /*
9 * Purpose
10 * =======
11 *
12 * Computes y = alpha * A * x + beta * y, where A is a general matrix.
13 *
14 * Arguments
15 * =========
16 *
17 * order (input) blas_order_type
18 * Order of AP; row or column major
19 *
20 * trans (input) blas_trans_type
21 * Transpose of AB; no trans,
22 * trans, or conjugate trans
23 *
24 * m (input) int
25 * Dimension of AB
26 *
27 * n (input) int
28 * Dimension of AB and the length of vector x
29 *
30 * alpha (input) double
31 *
32 * A (input) const float*
33 *
34 * lda (input) int
35 * Leading dimension of A
36 *
37 * x (input) const double*
38 *
39 * incx (input) int
40 * The stride for vector x.
41 *
42 * beta (input) double
43 *
44 * y (input/output) double*
45 *
46 * incy (input) int
47 * The stride for vector y.
48 *
49 * prec (input) enum blas_prec_type
50 * Specifies the internal precision to be used.
51 * = blas_prec_single: single precision.
52 * = blas_prec_double: double precision.
53 * = blas_prec_extra : anything at least 1.5 times as accurate
54 * than double, and wider than 80-bits.
55 * We use double-double in our implementation.
56 *
57 */
58 {
59 static const char routine_name[] = "BLAS_dgemv_s_d_x";
60 switch (prec) {
61 case blas_prec_single:
62 case blas_prec_double:
63 case blas_prec_indigenous:{
64
65 int i, j;
66 int iy, jx, kx, ky;
67 int lenx, leny;
68 int ai, aij;
69 int incai, incaij;
70
71 const float *a_i = a;
72 const double *x_i = x;
73 double *y_i = y;
74 double alpha_i = alpha;
75 double beta_i = beta;
76 float a_elem;
77 double x_elem;
78 double y_elem;
79 double prod;
80 double sum;
81 double tmp1;
82 double tmp2;
83
84
85 /* all error calls */
86 if (m < 0)
87 BLAS_error(routine_name, -3, m, 0);
88 else if (n <= 0)
89 BLAS_error(routine_name, -4, n, 0);
90 else if (incx == 0)
91 BLAS_error(routine_name, -9, incx, 0);
92 else if (incy == 0)
93 BLAS_error(routine_name, -12, incy, 0);
94
95 if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
96 lenx = n;
97 leny = m;
98 incai = lda;
99 incaij = 1;
100 } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
101 lenx = m;
102 leny = n;
103 incai = 1;
104 incaij = lda;
105 } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
106 lenx = n;
107 leny = m;
108 incai = 1;
109 incaij = lda;
110 } else { /* colmajor and blas_trans */
111 lenx = m;
112 leny = n;
113 incai = lda;
114 incaij = 1;
115 }
116 if ((order == blas_colmajor && lda < m) ||
117 (order == blas_rowmajor && lda < n))
118 BLAS_error(routine_name, -7, lda, NULL);
119
120
121
122
123
124
125
126
127 if (incx > 0)
128 kx = 0;
129 else
130 kx = (1 - lenx) * incx;
131 if (incy > 0)
132 ky = 0;
133 else
134 ky = (1 - leny) * incy;
135
136 /* No extra-precision needed for alpha = 0 */
137 if (alpha_i == 0.0) {
138 if (beta_i == 0.0) {
139 iy = ky;
140 for (i = 0; i < leny; i++) {
141 y_i[iy] = 0.0;
142 iy += incy;
143 }
144 } else if (!(beta_i == 0.0)) {
145 iy = ky;
146 for (i = 0; i < leny; i++) {
147 y_elem = y_i[iy];
148 tmp1 = y_elem * beta_i;
149 y_i[iy] = tmp1;
150 iy += incy;
151 }
152 }
153 } else {
154
155 /* if beta = 0, we can save m multiplies: y = alpha*A*x */
156 if (beta_i == 0.0) {
157 /* save m more multiplies if alpha = 1 */
158 if (alpha_i == 1.0) {
159 ai = 0;
160 iy = ky;
161 for (i = 0; i < leny; i++) {
162 sum = 0.0;
163 aij = ai;
164 jx = kx;
165 for (j = 0; j < lenx; j++) {
166 a_elem = a_i[aij];
167
168 x_elem = x_i[jx];
169 prod = a_elem * x_elem;
170 sum = sum + prod;
171 aij += incaij;
172 jx += incx;
173 }
174 y_i[iy] = sum;
175 ai += incai;
176 iy += incy;
177 }
178 } else {
179 ai = 0;
180 iy = ky;
181 for (i = 0; i < leny; i++) {
182 sum = 0.0;
183 aij = ai;
184 jx = kx;
185 for (j = 0; j < lenx; j++) {
186 a_elem = a_i[aij];
187
188 x_elem = x_i[jx];
189 prod = a_elem * x_elem;
190 sum = sum + prod;
191 aij += incaij;
192 jx += incx;
193 }
194 tmp1 = sum * alpha_i;
195 y_i[iy] = tmp1;
196 ai += incai;
197 iy += incy;
198 }
199 }
200 } else {
201 /* the most general form, y = alpha*A*x + beta*y */
202 ai = 0;
203 iy = ky;
204 for (i = 0; i < leny; i++) {
205 sum = 0.0;;
206 aij = ai;
207 jx = kx;
208 for (j = 0; j < lenx; j++) {
209 a_elem = a_i[aij];
210
211 x_elem = x_i[jx];
212 prod = a_elem * x_elem;
213 sum = sum + prod;
214 aij += incaij;
215 jx += incx;
216 }
217 tmp1 = sum * alpha_i;
218 y_elem = y_i[iy];
219 tmp2 = y_elem * beta_i;
220 tmp1 = tmp1 + tmp2;
221 y_i[iy] = tmp1;
222 ai += incai;
223 iy += incy;
224 }
225 }
226
227 }
228
229
230
231 break;
232 }
233 case blas_prec_extra:{
234
235 int i, j;
236 int iy, jx, kx, ky;
237 int lenx, leny;
238 int ai, aij;
239 int incai, incaij;
240
241 const float *a_i = a;
242 const double *x_i = x;
243 double *y_i = y;
244 double alpha_i = alpha;
245 double beta_i = beta;
246 float a_elem;
247 double x_elem;
248 double y_elem;
249 double head_prod, tail_prod;
250 double head_sum, tail_sum;
251 double head_tmp1, tail_tmp1;
252 double head_tmp2, tail_tmp2;
253 FPU_FIX_DECL;
254
255 /* all error calls */
256 if (m < 0)
257 BLAS_error(routine_name, -3, m, 0);
258 else if (n <= 0)
259 BLAS_error(routine_name, -4, n, 0);
260 else if (incx == 0)
261 BLAS_error(routine_name, -9, incx, 0);
262 else if (incy == 0)
263 BLAS_error(routine_name, -12, incy, 0);
264
265 if ((order == blas_rowmajor) && (trans == blas_no_trans)) {
266 lenx = n;
267 leny = m;
268 incai = lda;
269 incaij = 1;
270 } else if ((order == blas_rowmajor) && (trans != blas_no_trans)) {
271 lenx = m;
272 leny = n;
273 incai = 1;
274 incaij = lda;
275 } else if ((order == blas_colmajor) && (trans == blas_no_trans)) {
276 lenx = n;
277 leny = m;
278 incai = 1;
279 incaij = lda;
280 } else { /* colmajor and blas_trans */
281 lenx = m;
282 leny = n;
283 incai = lda;
284 incaij = 1;
285 }
286 if ((order == blas_colmajor && lda < m) ||
287 (order == blas_rowmajor && lda < n))
288 BLAS_error(routine_name, -7, lda, NULL);
289
290 FPU_FIX_START;
291
292
293
294
295
296
297 if (incx > 0)
298 kx = 0;
299 else
300 kx = (1 - lenx) * incx;
301 if (incy > 0)
302 ky = 0;
303 else
304 ky = (1 - leny) * incy;
305
306 /* No extra-precision needed for alpha = 0 */
307 if (alpha_i == 0.0) {
308 if (beta_i == 0.0) {
309 iy = ky;
310 for (i = 0; i < leny; i++) {
311 y_i[iy] = 0.0;
312 iy += incy;
313 }
314 } else if (!(beta_i == 0.0)) {
315 iy = ky;
316 for (i = 0; i < leny; i++) {
317 y_elem = y_i[iy];
318 {
319 /* Compute double_double = double * double. */
320 double a1, a2, b1, b2, con;
321
322 con = y_elem * split;
323 a1 = con - y_elem;
324 a1 = con - a1;
325 a2 = y_elem - a1;
326 con = beta_i * split;
327 b1 = con - beta_i;
328 b1 = con - b1;
329 b2 = beta_i - b1;
330
331 head_tmp1 = y_elem * beta_i;
332 tail_tmp1 =
333 (((a1 * b1 - head_tmp1) + a1 * b2) + a2 * b1) + a2 * b2;
334 }
335 y_i[iy] = head_tmp1;
336 iy += incy;
337 }
338 }
339 } else {
340
341 /* if beta = 0, we can save m multiplies: y = alpha*A*x */
342 if (beta_i == 0.0) {
343 /* save m more multiplies if alpha = 1 */
344 if (alpha_i == 1.0) {
345 ai = 0;
346 iy = ky;
347 for (i = 0; i < leny; i++) {
348 head_sum = tail_sum = 0.0;
349 aij = ai;
350 jx = kx;
351 for (j = 0; j < lenx; j++) {
352 a_elem = a_i[aij];
353
354 x_elem = x_i[jx];
355 {
356 double dt = (double) a_elem;
357 {
358 /* Compute double_double = double * double. */
359 double a1, a2, b1, b2, con;
360
361 con = dt * split;
362 a1 = con - dt;
363 a1 = con - a1;
364 a2 = dt - a1;
365 con = x_elem * split;
366 b1 = con - x_elem;
367 b1 = con - b1;
368 b2 = x_elem - b1;
369
370 head_prod = dt * x_elem;
371 tail_prod =
372 (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
373 }
374 }
375 {
376 /* Compute double-double = double-double + double-double. */
377 double bv;
378 double s1, s2, t1, t2;
379
380 /* Add two hi words. */
381 s1 = head_sum + head_prod;
382 bv = s1 - head_sum;
383 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
384
385 /* Add two lo words. */
386 t1 = tail_sum + tail_prod;
387 bv = t1 - tail_sum;
388 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
389
390 s2 += t1;
391
392 /* Renormalize (s1, s2) to (t1, s2) */
393 t1 = s1 + s2;
394 s2 = s2 - (t1 - s1);
395
396 t2 += s2;
397
398 /* Renormalize (t1, t2) */
399 head_sum = t1 + t2;
400 tail_sum = t2 - (head_sum - t1);
401 }
402 aij += incaij;
403 jx += incx;
404 }
405 y_i[iy] = head_sum;
406 ai += incai;
407 iy += incy;
408 }
409 } else {
410 ai = 0;
411 iy = ky;
412 for (i = 0; i < leny; i++) {
413 head_sum = tail_sum = 0.0;
414 aij = ai;
415 jx = kx;
416 for (j = 0; j < lenx; j++) {
417 a_elem = a_i[aij];
418
419 x_elem = x_i[jx];
420 {
421 double dt = (double) a_elem;
422 {
423 /* Compute double_double = double * double. */
424 double a1, a2, b1, b2, con;
425
426 con = dt * split;
427 a1 = con - dt;
428 a1 = con - a1;
429 a2 = dt - a1;
430 con = x_elem * split;
431 b1 = con - x_elem;
432 b1 = con - b1;
433 b2 = x_elem - b1;
434
435 head_prod = dt * x_elem;
436 tail_prod =
437 (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
438 }
439 }
440 {
441 /* Compute double-double = double-double + double-double. */
442 double bv;
443 double s1, s2, t1, t2;
444
445 /* Add two hi words. */
446 s1 = head_sum + head_prod;
447 bv = s1 - head_sum;
448 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
449
450 /* Add two lo words. */
451 t1 = tail_sum + tail_prod;
452 bv = t1 - tail_sum;
453 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
454
455 s2 += t1;
456
457 /* Renormalize (s1, s2) to (t1, s2) */
458 t1 = s1 + s2;
459 s2 = s2 - (t1 - s1);
460
461 t2 += s2;
462
463 /* Renormalize (t1, t2) */
464 head_sum = t1 + t2;
465 tail_sum = t2 - (head_sum - t1);
466 }
467 aij += incaij;
468 jx += incx;
469 }
470 {
471 /* Compute double-double = double-double * double. */
472 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
473
474 con = head_sum * split;
475 a11 = con - head_sum;
476 a11 = con - a11;
477 a21 = head_sum - a11;
478 con = alpha_i * split;
479 b1 = con - alpha_i;
480 b1 = con - b1;
481 b2 = alpha_i - b1;
482
483 c11 = head_sum * alpha_i;
484 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
485
486 c2 = tail_sum * alpha_i;
487 t1 = c11 + c2;
488 t2 = (c2 - (t1 - c11)) + c21;
489
490 head_tmp1 = t1 + t2;
491 tail_tmp1 = t2 - (head_tmp1 - t1);
492 }
493 y_i[iy] = head_tmp1;
494 ai += incai;
495 iy += incy;
496 }
497 }
498 } else {
499 /* the most general form, y = alpha*A*x + beta*y */
500 ai = 0;
501 iy = ky;
502 for (i = 0; i < leny; i++) {
503 head_sum = tail_sum = 0.0;;
504 aij = ai;
505 jx = kx;
506 for (j = 0; j < lenx; j++) {
507 a_elem = a_i[aij];
508
509 x_elem = x_i[jx];
510 {
511 double dt = (double) a_elem;
512 {
513 /* Compute double_double = double * double. */
514 double a1, a2, b1, b2, con;
515
516 con = dt * split;
517 a1 = con - dt;
518 a1 = con - a1;
519 a2 = dt - a1;
520 con = x_elem * split;
521 b1 = con - x_elem;
522 b1 = con - b1;
523 b2 = x_elem - b1;
524
525 head_prod = dt * x_elem;
526 tail_prod =
527 (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
528 }
529 }
530 {
531 /* Compute double-double = double-double + double-double. */
532 double bv;
533 double s1, s2, t1, t2;
534
535 /* Add two hi words. */
536 s1 = head_sum + head_prod;
537 bv = s1 - head_sum;
538 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
539
540 /* Add two lo words. */
541 t1 = tail_sum + tail_prod;
542 bv = t1 - tail_sum;
543 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
544
545 s2 += t1;
546
547 /* Renormalize (s1, s2) to (t1, s2) */
548 t1 = s1 + s2;
549 s2 = s2 - (t1 - s1);
550
551 t2 += s2;
552
553 /* Renormalize (t1, t2) */
554 head_sum = t1 + t2;
555 tail_sum = t2 - (head_sum - t1);
556 }
557 aij += incaij;
558 jx += incx;
559 }
560 {
561 /* Compute double-double = double-double * double. */
562 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
563
564 con = head_sum * split;
565 a11 = con - head_sum;
566 a11 = con - a11;
567 a21 = head_sum - a11;
568 con = alpha_i * split;
569 b1 = con - alpha_i;
570 b1 = con - b1;
571 b2 = alpha_i - b1;
572
573 c11 = head_sum * alpha_i;
574 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
575
576 c2 = tail_sum * alpha_i;
577 t1 = c11 + c2;
578 t2 = (c2 - (t1 - c11)) + c21;
579
580 head_tmp1 = t1 + t2;
581 tail_tmp1 = t2 - (head_tmp1 - t1);
582 }
583 y_elem = y_i[iy];
584 {
585 /* Compute double_double = double * double. */
586 double a1, a2, b1, b2, con;
587
588 con = y_elem * split;
589 a1 = con - y_elem;
590 a1 = con - a1;
591 a2 = y_elem - a1;
592 con = beta_i * split;
593 b1 = con - beta_i;
594 b1 = con - b1;
595 b2 = beta_i - b1;
596
597 head_tmp2 = y_elem * beta_i;
598 tail_tmp2 =
599 (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
600 }
601 {
602 /* Compute double-double = double-double + double-double. */
603 double bv;
604 double s1, s2, t1, t2;
605
606 /* Add two hi words. */
607 s1 = head_tmp1 + head_tmp2;
608 bv = s1 - head_tmp1;
609 s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
610
611 /* Add two lo words. */
612 t1 = tail_tmp1 + tail_tmp2;
613 bv = t1 - tail_tmp1;
614 t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
615
616 s2 += t1;
617
618 /* Renormalize (s1, s2) to (t1, s2) */
619 t1 = s1 + s2;
620 s2 = s2 - (t1 - s1);
621
622 t2 += s2;
623
624 /* Renormalize (t1, t2) */
625 head_tmp1 = t1 + t2;
626 tail_tmp1 = t2 - (head_tmp1 - t1);
627 }
628 y_i[iy] = head_tmp1;
629 ai += incai;
630 iy += incy;
631 }
632 }
633
634 }
635
636 FPU_FIX_STOP;
637 }
638 break;
639 }
640 }
641