1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_cgemm_s_s_x(enum blas_order_type order,enum blas_trans_type transa,enum blas_trans_type transb,int m,int n,int k,const void * alpha,const float * a,int lda,const float * b,int ldb,const void * beta,void * c,int ldc,enum blas_prec_type prec)3 void BLAS_cgemm_s_s_x(enum blas_order_type order, enum blas_trans_type transa,
4 enum blas_trans_type transb, int m, int n, int k,
5 const void *alpha, const float *a, int lda,
6 const float *b, int ldb, const void *beta, void *c,
7 int ldc, enum blas_prec_type prec)
8
9 /*
10 * Purpose
11 * =======
12 *
13 * This routine computes the matrix product:
14 *
15 * C <- alpha * op(A) * op(B) + beta * C .
16 *
17 * where op(M) represents either M, M transpose,
18 * or M conjugate transpose.
19 *
20 * Arguments
21 * =========
22 *
23 * order (input) enum blas_order_type
24 * Storage format of input matrices A, B, and C.
25 *
26 * transa (input) enum blas_trans_type
27 * Operation to be done on matrix A before multiplication.
28 * Can be no operation, transposition, or conjugate transposition.
29 *
30 * transb (input) enum blas_trans_type
31 * Operation to be done on matrix B before multiplication.
32 * Can be no operation, transposition, or conjugate transposition.
33 *
34 * m n k (input) int
35 * The dimensions of matrices A, B, and C.
36 * Matrix C is m-by-n matrix.
37 * Matrix A is m-by-k if A is not transposed,
38 * k-by-m otherwise.
39 * Matrix B is k-by-n if B is not transposed,
40 * n-by-k otherwise.
41 *
42 * alpha (input) const void*
43 *
44 * a (input) const float*
45 * matrix A.
46 *
47 * lda (input) int
48 * leading dimension of A.
49 *
50 * b (input) const float*
51 * matrix B
52 *
53 * ldb (input) int
54 * leading dimension of B.
55 *
56 * beta (input) const void*
57 *
58 * c (input/output) void*
59 * matrix C
60 *
61 * ldc (input) int
62 * leading dimension of C.
63 *
64 * prec (input) enum blas_prec_type
65 * Specifies the internal precision to be used.
66 * = blas_prec_single: single precision.
67 * = blas_prec_double: double precision.
68 * = blas_prec_extra : anything at least 1.5 times as accurate
69 * than double, and wider than 80-bits.
70 * We use double-double in our implementation.
71 *
72 */
73 {
74 static const char routine_name[] = "BLAS_cgemm_s_s_x";
75 switch (prec) {
76
77 case blas_prec_single:{
78
79
80 /* Integer Index Variables */
81 int i, j, h;
82
83 int ai, bj, ci;
84 int aih, bhj, cij; /* Index into matrices a, b, c during multiply */
85
86 int incai, incaih; /* Index increments for matrix a */
87 int incbj, incbhj; /* Index increments for matrix b */
88 int incci, inccij; /* Index increments for matrix c */
89
90 /* Input Matrices */
91 const float *a_i = a;
92 const float *b_i = b;
93
94 /* Output Matrix */
95 float *c_i = (float *) c;
96
97 /* Input Scalars */
98 float *alpha_i = (float *) alpha;
99 float *beta_i = (float *) beta;
100
101 /* Temporary Floating-Point Variables */
102 float a_elem;
103 float b_elem;
104 float c_elem[2];
105 float prod;
106 float sum;
107 float tmp1[2];
108 float tmp2[2];
109
110
111
112 /* Test for error conditions */
113 if (m < 0)
114 BLAS_error(routine_name, -4, m, NULL);
115 if (n < 0)
116 BLAS_error(routine_name, -5, n, NULL);
117 if (k < 0)
118 BLAS_error(routine_name, -6, k, NULL);
119
120 if (order == blas_colmajor) {
121
122 if (ldc < m)
123 BLAS_error(routine_name, -14, ldc, NULL);
124
125 if (transa == blas_no_trans) {
126 if (lda < m)
127 BLAS_error(routine_name, -9, lda, NULL);
128 } else {
129 if (lda < k)
130 BLAS_error(routine_name, -9, lda, NULL);
131 }
132
133 if (transb == blas_no_trans) {
134 if (ldb < k)
135 BLAS_error(routine_name, -11, ldb, NULL);
136 } else {
137 if (ldb < n)
138 BLAS_error(routine_name, -11, ldb, NULL);
139 }
140
141 } else {
142 /* row major */
143 if (ldc < n)
144 BLAS_error(routine_name, -14, ldc, NULL);
145
146 if (transa == blas_no_trans) {
147 if (lda < k)
148 BLAS_error(routine_name, -9, lda, NULL);
149 } else {
150 if (lda < m)
151 BLAS_error(routine_name, -9, lda, NULL);
152 }
153
154 if (transb == blas_no_trans) {
155 if (ldb < n)
156 BLAS_error(routine_name, -11, ldb, NULL);
157 } else {
158 if (ldb < k)
159 BLAS_error(routine_name, -11, ldb, NULL);
160 }
161 }
162
163 /* Test for no-op */
164 if (n == 0 || m == 0 || k == 0)
165 return;
166 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
167 && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
168 return;
169 }
170
171 /* Set Index Parameters */
172 if (order == blas_colmajor) {
173 incci = 1;
174 inccij = ldc;
175
176 if (transa == blas_no_trans) {
177 incai = 1;
178 incaih = lda;
179 } else {
180 incai = lda;
181 incaih = 1;
182 }
183
184 if (transb == blas_no_trans) {
185 incbj = ldb;
186 incbhj = 1;
187 } else {
188 incbj = 1;
189 incbhj = ldb;
190 }
191
192 } else {
193 /* row major */
194 incci = ldc;
195 inccij = 1;
196
197 if (transa == blas_no_trans) {
198 incai = lda;
199 incaih = 1;
200 } else {
201 incai = 1;
202 incaih = lda;
203 }
204
205 if (transb == blas_no_trans) {
206 incbj = 1;
207 incbhj = ldb;
208 } else {
209 incbj = ldb;
210 incbhj = 1;
211 }
212
213 }
214
215
216
217 /* Ajustment to increments */
218 incci *= 2;
219 inccij *= 2;
220
221
222
223
224
225 /* alpha = 0. In this case, just return beta * C */
226 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
227
228 ci = 0;
229 for (i = 0; i < m; i++, ci += incci) {
230 cij = ci;
231 for (j = 0; j < n; j++, cij += inccij) {
232 c_elem[0] = c_i[cij];
233 c_elem[1] = c_i[cij + 1];
234 {
235 tmp1[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
236 tmp1[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
237 }
238
239 c_i[cij] = tmp1[0];
240 c_i[cij + 1] = tmp1[1];
241 }
242 }
243
244 } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
245
246 /* Case alpha == 1. */
247
248 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
249 /* Case alpha == 1, beta == 0. We compute C <--- A * B */
250
251 ci = 0;
252 ai = 0;
253 for (i = 0; i < m; i++, ci += incci, ai += incai) {
254
255 cij = ci;
256 bj = 0;
257
258 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
259
260 aih = ai;
261 bhj = bj;
262
263 sum = 0.0;
264
265 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
266 a_elem = a_i[aih];
267 b_elem = b_i[bhj];
268 if (transa == blas_conj_trans) {
269
270 }
271 if (transb == blas_conj_trans) {
272
273 }
274 prod = a_elem * b_elem;
275 sum = sum + prod;
276 }
277 tmp1[0] = sum;
278 tmp1[1] = 0.0;
279 c_i[cij] = tmp1[0];
280 c_i[cij + 1] = tmp1[1];
281 }
282 }
283
284 } else {
285 /* Case alpha == 1, but beta != 0.
286 We compute C <--- A * B + beta * C */
287
288 ci = 0;
289 ai = 0;
290 for (i = 0; i < m; i++, ci += incci, ai += incai) {
291
292 cij = ci;
293 bj = 0;
294
295 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
296
297 aih = ai;
298 bhj = bj;
299
300 sum = 0.0;
301
302 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
303 a_elem = a_i[aih];
304 b_elem = b_i[bhj];
305 if (transa == blas_conj_trans) {
306
307 }
308 if (transb == blas_conj_trans) {
309
310 }
311 prod = a_elem * b_elem;
312 sum = sum + prod;
313 }
314
315 c_elem[0] = c_i[cij];
316 c_elem[1] = c_i[cij + 1];
317 {
318 tmp2[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
319 tmp2[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
320 }
321
322 tmp1[0] = sum;
323 tmp1[1] = 0.0;
324 tmp1[0] = tmp2[0] + tmp1[0];
325 tmp1[1] = tmp2[1] + tmp1[1];
326 c_i[cij] = tmp1[0];
327 c_i[cij + 1] = tmp1[1];
328 }
329 }
330 }
331
332 } else {
333
334 /* The most general form, C <-- alpha * A * B + beta * C */
335 ci = 0;
336 ai = 0;
337 for (i = 0; i < m; i++, ci += incci, ai += incai) {
338
339 cij = ci;
340 bj = 0;
341
342 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
343
344 aih = ai;
345 bhj = bj;
346
347 sum = 0.0;
348
349 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
350 a_elem = a_i[aih];
351 b_elem = b_i[bhj];
352 if (transa == blas_conj_trans) {
353
354 }
355 if (transb == blas_conj_trans) {
356
357 }
358 prod = a_elem * b_elem;
359 sum = sum + prod;
360 }
361
362 {
363 tmp1[0] = alpha_i[0] * sum;
364 tmp1[1] = alpha_i[1] * sum;
365 }
366 c_elem[0] = c_i[cij];
367 c_elem[1] = c_i[cij + 1];
368 {
369 tmp2[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
370 tmp2[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
371 }
372
373 tmp1[0] = tmp1[0] + tmp2[0];
374 tmp1[1] = tmp1[1] + tmp2[1];
375 c_i[cij] = tmp1[0];
376 c_i[cij + 1] = tmp1[1];
377 }
378 }
379
380 }
381
382
383
384 break;
385 }
386 case blas_prec_double:
387 case blas_prec_indigenous:{
388
389
390 /* Integer Index Variables */
391 int i, j, h;
392
393 int ai, bj, ci;
394 int aih, bhj, cij; /* Index into matrices a, b, c during multiply */
395
396 int incai, incaih; /* Index increments for matrix a */
397 int incbj, incbhj; /* Index increments for matrix b */
398 int incci, inccij; /* Index increments for matrix c */
399
400 /* Input Matrices */
401 const float *a_i = a;
402 const float *b_i = b;
403
404 /* Output Matrix */
405 float *c_i = (float *) c;
406
407 /* Input Scalars */
408 float *alpha_i = (float *) alpha;
409 float *beta_i = (float *) beta;
410
411 /* Temporary Floating-Point Variables */
412 float a_elem;
413 float b_elem;
414 float c_elem[2];
415 double prod;
416 double sum;
417 double tmp1[2];
418 double tmp2[2];
419
420
421
422 /* Test for error conditions */
423 if (m < 0)
424 BLAS_error(routine_name, -4, m, NULL);
425 if (n < 0)
426 BLAS_error(routine_name, -5, n, NULL);
427 if (k < 0)
428 BLAS_error(routine_name, -6, k, NULL);
429
430 if (order == blas_colmajor) {
431
432 if (ldc < m)
433 BLAS_error(routine_name, -14, ldc, NULL);
434
435 if (transa == blas_no_trans) {
436 if (lda < m)
437 BLAS_error(routine_name, -9, lda, NULL);
438 } else {
439 if (lda < k)
440 BLAS_error(routine_name, -9, lda, NULL);
441 }
442
443 if (transb == blas_no_trans) {
444 if (ldb < k)
445 BLAS_error(routine_name, -11, ldb, NULL);
446 } else {
447 if (ldb < n)
448 BLAS_error(routine_name, -11, ldb, NULL);
449 }
450
451 } else {
452 /* row major */
453 if (ldc < n)
454 BLAS_error(routine_name, -14, ldc, NULL);
455
456 if (transa == blas_no_trans) {
457 if (lda < k)
458 BLAS_error(routine_name, -9, lda, NULL);
459 } else {
460 if (lda < m)
461 BLAS_error(routine_name, -9, lda, NULL);
462 }
463
464 if (transb == blas_no_trans) {
465 if (ldb < n)
466 BLAS_error(routine_name, -11, ldb, NULL);
467 } else {
468 if (ldb < k)
469 BLAS_error(routine_name, -11, ldb, NULL);
470 }
471 }
472
473 /* Test for no-op */
474 if (n == 0 || m == 0 || k == 0)
475 return;
476 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
477 && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
478 return;
479 }
480
481 /* Set Index Parameters */
482 if (order == blas_colmajor) {
483 incci = 1;
484 inccij = ldc;
485
486 if (transa == blas_no_trans) {
487 incai = 1;
488 incaih = lda;
489 } else {
490 incai = lda;
491 incaih = 1;
492 }
493
494 if (transb == blas_no_trans) {
495 incbj = ldb;
496 incbhj = 1;
497 } else {
498 incbj = 1;
499 incbhj = ldb;
500 }
501
502 } else {
503 /* row major */
504 incci = ldc;
505 inccij = 1;
506
507 if (transa == blas_no_trans) {
508 incai = lda;
509 incaih = 1;
510 } else {
511 incai = 1;
512 incaih = lda;
513 }
514
515 if (transb == blas_no_trans) {
516 incbj = 1;
517 incbhj = ldb;
518 } else {
519 incbj = ldb;
520 incbhj = 1;
521 }
522
523 }
524
525
526
527 /* Ajustment to increments */
528 incci *= 2;
529 inccij *= 2;
530
531
532
533
534
535 /* alpha = 0. In this case, just return beta * C */
536 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
537
538 ci = 0;
539 for (i = 0; i < m; i++, ci += incci) {
540 cij = ci;
541 for (j = 0; j < n; j++, cij += inccij) {
542 c_elem[0] = c_i[cij];
543 c_elem[1] = c_i[cij + 1];
544 {
545 tmp1[0] =
546 (double) c_elem[0] * beta_i[0] -
547 (double) c_elem[1] * beta_i[1];
548 tmp1[1] =
549 (double) c_elem[0] * beta_i[1] +
550 (double) c_elem[1] * beta_i[0];
551 }
552 c_i[cij] = tmp1[0];
553 c_i[cij + 1] = tmp1[1];
554 }
555 }
556
557 } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
558
559 /* Case alpha == 1. */
560
561 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
562 /* Case alpha == 1, beta == 0. We compute C <--- A * B */
563
564 ci = 0;
565 ai = 0;
566 for (i = 0; i < m; i++, ci += incci, ai += incai) {
567
568 cij = ci;
569 bj = 0;
570
571 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
572
573 aih = ai;
574 bhj = bj;
575
576 sum = 0.0;
577
578 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
579 a_elem = a_i[aih];
580 b_elem = b_i[bhj];
581 if (transa == blas_conj_trans) {
582
583 }
584 if (transb == blas_conj_trans) {
585
586 }
587 prod = (double) a_elem *b_elem;
588 sum = sum + prod;
589 }
590 tmp1[0] = sum;
591 tmp1[1] = 0.0;
592 c_i[cij] = tmp1[0];
593 c_i[cij + 1] = tmp1[1];
594 }
595 }
596
597 } else {
598 /* Case alpha == 1, but beta != 0.
599 We compute C <--- A * B + beta * C */
600
601 ci = 0;
602 ai = 0;
603 for (i = 0; i < m; i++, ci += incci, ai += incai) {
604
605 cij = ci;
606 bj = 0;
607
608 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
609
610 aih = ai;
611 bhj = bj;
612
613 sum = 0.0;
614
615 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
616 a_elem = a_i[aih];
617 b_elem = b_i[bhj];
618 if (transa == blas_conj_trans) {
619
620 }
621 if (transb == blas_conj_trans) {
622
623 }
624 prod = (double) a_elem *b_elem;
625 sum = sum + prod;
626 }
627
628 c_elem[0] = c_i[cij];
629 c_elem[1] = c_i[cij + 1];
630 {
631 tmp2[0] =
632 (double) c_elem[0] * beta_i[0] -
633 (double) c_elem[1] * beta_i[1];
634 tmp2[1] =
635 (double) c_elem[0] * beta_i[1] +
636 (double) c_elem[1] * beta_i[0];
637 }
638 tmp1[0] = sum;
639 tmp1[1] = 0.0;
640 tmp1[0] = tmp2[0] + tmp1[0];
641 tmp1[1] = tmp2[1] + tmp1[1];
642 c_i[cij] = tmp1[0];
643 c_i[cij + 1] = tmp1[1];
644 }
645 }
646 }
647
648 } else {
649
650 /* The most general form, C <-- alpha * A * B + beta * C */
651 ci = 0;
652 ai = 0;
653 for (i = 0; i < m; i++, ci += incci, ai += incai) {
654
655 cij = ci;
656 bj = 0;
657
658 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
659
660 aih = ai;
661 bhj = bj;
662
663 sum = 0.0;
664
665 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
666 a_elem = a_i[aih];
667 b_elem = b_i[bhj];
668 if (transa == blas_conj_trans) {
669
670 }
671 if (transb == blas_conj_trans) {
672
673 }
674 prod = (double) a_elem *b_elem;
675 sum = sum + prod;
676 }
677
678 {
679 tmp1[0] = alpha_i[0] * sum;
680 tmp1[1] = alpha_i[1] * sum;
681 }
682 c_elem[0] = c_i[cij];
683 c_elem[1] = c_i[cij + 1];
684 {
685 tmp2[0] =
686 (double) c_elem[0] * beta_i[0] -
687 (double) c_elem[1] * beta_i[1];
688 tmp2[1] =
689 (double) c_elem[0] * beta_i[1] +
690 (double) c_elem[1] * beta_i[0];
691 }
692 tmp1[0] = tmp1[0] + tmp2[0];
693 tmp1[1] = tmp1[1] + tmp2[1];
694 c_i[cij] = tmp1[0];
695 c_i[cij + 1] = tmp1[1];
696 }
697 }
698
699 }
700
701
702
703 break;
704 }
705
706 case blas_prec_extra:{
707
708
709 /* Integer Index Variables */
710 int i, j, h;
711
712 int ai, bj, ci;
713 int aih, bhj, cij; /* Index into matrices a, b, c during multiply */
714
715 int incai, incaih; /* Index increments for matrix a */
716 int incbj, incbhj; /* Index increments for matrix b */
717 int incci, inccij; /* Index increments for matrix c */
718
719 /* Input Matrices */
720 const float *a_i = a;
721 const float *b_i = b;
722
723 /* Output Matrix */
724 float *c_i = (float *) c;
725
726 /* Input Scalars */
727 float *alpha_i = (float *) alpha;
728 float *beta_i = (float *) beta;
729
730 /* Temporary Floating-Point Variables */
731 float a_elem;
732 float b_elem;
733 float c_elem[2];
734 double head_prod, tail_prod;
735 double head_sum, tail_sum;
736 double head_tmp1[2], tail_tmp1[2];
737 double head_tmp2[2], tail_tmp2[2];
738
739 FPU_FIX_DECL;
740
741 /* Test for error conditions */
742 if (m < 0)
743 BLAS_error(routine_name, -4, m, NULL);
744 if (n < 0)
745 BLAS_error(routine_name, -5, n, NULL);
746 if (k < 0)
747 BLAS_error(routine_name, -6, k, NULL);
748
749 if (order == blas_colmajor) {
750
751 if (ldc < m)
752 BLAS_error(routine_name, -14, ldc, NULL);
753
754 if (transa == blas_no_trans) {
755 if (lda < m)
756 BLAS_error(routine_name, -9, lda, NULL);
757 } else {
758 if (lda < k)
759 BLAS_error(routine_name, -9, lda, NULL);
760 }
761
762 if (transb == blas_no_trans) {
763 if (ldb < k)
764 BLAS_error(routine_name, -11, ldb, NULL);
765 } else {
766 if (ldb < n)
767 BLAS_error(routine_name, -11, ldb, NULL);
768 }
769
770 } else {
771 /* row major */
772 if (ldc < n)
773 BLAS_error(routine_name, -14, ldc, NULL);
774
775 if (transa == blas_no_trans) {
776 if (lda < k)
777 BLAS_error(routine_name, -9, lda, NULL);
778 } else {
779 if (lda < m)
780 BLAS_error(routine_name, -9, lda, NULL);
781 }
782
783 if (transb == blas_no_trans) {
784 if (ldb < n)
785 BLAS_error(routine_name, -11, ldb, NULL);
786 } else {
787 if (ldb < k)
788 BLAS_error(routine_name, -11, ldb, NULL);
789 }
790 }
791
792 /* Test for no-op */
793 if (n == 0 || m == 0 || k == 0)
794 return;
795 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
796 && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
797 return;
798 }
799
800 /* Set Index Parameters */
801 if (order == blas_colmajor) {
802 incci = 1;
803 inccij = ldc;
804
805 if (transa == blas_no_trans) {
806 incai = 1;
807 incaih = lda;
808 } else {
809 incai = lda;
810 incaih = 1;
811 }
812
813 if (transb == blas_no_trans) {
814 incbj = ldb;
815 incbhj = 1;
816 } else {
817 incbj = 1;
818 incbhj = ldb;
819 }
820
821 } else {
822 /* row major */
823 incci = ldc;
824 inccij = 1;
825
826 if (transa == blas_no_trans) {
827 incai = lda;
828 incaih = 1;
829 } else {
830 incai = 1;
831 incaih = lda;
832 }
833
834 if (transb == blas_no_trans) {
835 incbj = 1;
836 incbhj = ldb;
837 } else {
838 incbj = ldb;
839 incbhj = 1;
840 }
841
842 }
843
844 FPU_FIX_START;
845
846 /* Ajustment to increments */
847 incci *= 2;
848 inccij *= 2;
849
850
851
852
853
854 /* alpha = 0. In this case, just return beta * C */
855 if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
856
857 ci = 0;
858 for (i = 0; i < m; i++, ci += incci) {
859 cij = ci;
860 for (j = 0; j < n; j++, cij += inccij) {
861 c_elem[0] = c_i[cij];
862 c_elem[1] = c_i[cij + 1];
863 {
864 double head_e1, tail_e1;
865 double d1;
866 double d2;
867 /* Real part */
868 d1 = (double) c_elem[0] * beta_i[0];
869 d2 = (double) -c_elem[1] * beta_i[1];
870 {
871 /* Compute double-double = double + double. */
872 double e, t1, t2;
873
874 /* Knuth trick. */
875 t1 = d1 + d2;
876 e = t1 - d1;
877 t2 = ((d2 - e) + (d1 - (t1 - e)));
878
879 /* The result is t1 + t2, after normalization. */
880 head_e1 = t1 + t2;
881 tail_e1 = t2 - (head_e1 - t1);
882 }
883 head_tmp1[0] = head_e1;
884 tail_tmp1[0] = tail_e1;
885 /* imaginary part */
886 d1 = (double) c_elem[0] * beta_i[1];
887 d2 = (double) c_elem[1] * beta_i[0];
888 {
889 /* Compute double-double = double + double. */
890 double e, t1, t2;
891
892 /* Knuth trick. */
893 t1 = d1 + d2;
894 e = t1 - d1;
895 t2 = ((d2 - e) + (d1 - (t1 - e)));
896
897 /* The result is t1 + t2, after normalization. */
898 head_e1 = t1 + t2;
899 tail_e1 = t2 - (head_e1 - t1);
900 }
901 head_tmp1[1] = head_e1;
902 tail_tmp1[1] = tail_e1;
903 }
904 c_i[cij] = head_tmp1[0];
905 c_i[cij + 1] = head_tmp1[1];
906 }
907 }
908
909 } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
910
911 /* Case alpha == 1. */
912
913 if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
914 /* Case alpha == 1, beta == 0. We compute C <--- A * B */
915
916 ci = 0;
917 ai = 0;
918 for (i = 0; i < m; i++, ci += incci, ai += incai) {
919
920 cij = ci;
921 bj = 0;
922
923 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
924
925 aih = ai;
926 bhj = bj;
927
928 head_sum = tail_sum = 0.0;
929
930 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
931 a_elem = a_i[aih];
932 b_elem = b_i[bhj];
933 if (transa == blas_conj_trans) {
934
935 }
936 if (transb == blas_conj_trans) {
937
938 }
939 head_prod = (double) a_elem *b_elem;
940 tail_prod = 0.0;
941 {
942 /* Compute double-double = double-double + double-double. */
943 double bv;
944 double s1, s2, t1, t2;
945
946 /* Add two hi words. */
947 s1 = head_sum + head_prod;
948 bv = s1 - head_sum;
949 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
950
951 /* Add two lo words. */
952 t1 = tail_sum + tail_prod;
953 bv = t1 - tail_sum;
954 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
955
956 s2 += t1;
957
958 /* Renormalize (s1, s2) to (t1, s2) */
959 t1 = s1 + s2;
960 s2 = s2 - (t1 - s1);
961
962 t2 += s2;
963
964 /* Renormalize (t1, t2) */
965 head_sum = t1 + t2;
966 tail_sum = t2 - (head_sum - t1);
967 }
968 }
969 head_tmp1[0] = head_sum;
970 tail_tmp1[0] = tail_sum;
971 head_tmp1[1] = tail_tmp1[1] = 0.0;
972 c_i[cij] = head_tmp1[0];
973 c_i[cij + 1] = head_tmp1[1];
974 }
975 }
976
977 } else {
978 /* Case alpha == 1, but beta != 0.
979 We compute C <--- A * B + beta * C */
980
981 ci = 0;
982 ai = 0;
983 for (i = 0; i < m; i++, ci += incci, ai += incai) {
984
985 cij = ci;
986 bj = 0;
987
988 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
989
990 aih = ai;
991 bhj = bj;
992
993 head_sum = tail_sum = 0.0;
994
995 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
996 a_elem = a_i[aih];
997 b_elem = b_i[bhj];
998 if (transa == blas_conj_trans) {
999
1000 }
1001 if (transb == blas_conj_trans) {
1002
1003 }
1004 head_prod = (double) a_elem *b_elem;
1005 tail_prod = 0.0;
1006 {
1007 /* Compute double-double = double-double + double-double. */
1008 double bv;
1009 double s1, s2, t1, t2;
1010
1011 /* Add two hi words. */
1012 s1 = head_sum + head_prod;
1013 bv = s1 - head_sum;
1014 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
1015
1016 /* Add two lo words. */
1017 t1 = tail_sum + tail_prod;
1018 bv = t1 - tail_sum;
1019 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
1020
1021 s2 += t1;
1022
1023 /* Renormalize (s1, s2) to (t1, s2) */
1024 t1 = s1 + s2;
1025 s2 = s2 - (t1 - s1);
1026
1027 t2 += s2;
1028
1029 /* Renormalize (t1, t2) */
1030 head_sum = t1 + t2;
1031 tail_sum = t2 - (head_sum - t1);
1032 }
1033 }
1034
1035 c_elem[0] = c_i[cij];
1036 c_elem[1] = c_i[cij + 1];
1037 {
1038 double head_e1, tail_e1;
1039 double d1;
1040 double d2;
1041 /* Real part */
1042 d1 = (double) c_elem[0] * beta_i[0];
1043 d2 = (double) -c_elem[1] * beta_i[1];
1044 {
1045 /* Compute double-double = double + double. */
1046 double e, t1, t2;
1047
1048 /* Knuth trick. */
1049 t1 = d1 + d2;
1050 e = t1 - d1;
1051 t2 = ((d2 - e) + (d1 - (t1 - e)));
1052
1053 /* The result is t1 + t2, after normalization. */
1054 head_e1 = t1 + t2;
1055 tail_e1 = t2 - (head_e1 - t1);
1056 }
1057 head_tmp2[0] = head_e1;
1058 tail_tmp2[0] = tail_e1;
1059 /* imaginary part */
1060 d1 = (double) c_elem[0] * beta_i[1];
1061 d2 = (double) c_elem[1] * beta_i[0];
1062 {
1063 /* Compute double-double = double + double. */
1064 double e, t1, t2;
1065
1066 /* Knuth trick. */
1067 t1 = d1 + d2;
1068 e = t1 - d1;
1069 t2 = ((d2 - e) + (d1 - (t1 - e)));
1070
1071 /* The result is t1 + t2, after normalization. */
1072 head_e1 = t1 + t2;
1073 tail_e1 = t2 - (head_e1 - t1);
1074 }
1075 head_tmp2[1] = head_e1;
1076 tail_tmp2[1] = tail_e1;
1077 }
1078 head_tmp1[0] = head_sum;
1079 tail_tmp1[0] = tail_sum;
1080 head_tmp1[1] = tail_tmp1[1] = 0.0;
1081 {
1082 double head_t, tail_t;
1083 double head_a, tail_a;
1084 double head_b, tail_b;
1085 /* Real part */
1086 head_a = head_tmp2[0];
1087 tail_a = tail_tmp2[0];
1088 head_b = head_tmp1[0];
1089 tail_b = tail_tmp1[0];
1090 {
1091 /* Compute double-double = double-double + double-double. */
1092 double bv;
1093 double s1, s2, t1, t2;
1094
1095 /* Add two hi words. */
1096 s1 = head_a + head_b;
1097 bv = s1 - head_a;
1098 s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1099
1100 /* Add two lo words. */
1101 t1 = tail_a + tail_b;
1102 bv = t1 - tail_a;
1103 t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1104
1105 s2 += t1;
1106
1107 /* Renormalize (s1, s2) to (t1, s2) */
1108 t1 = s1 + s2;
1109 s2 = s2 - (t1 - s1);
1110
1111 t2 += s2;
1112
1113 /* Renormalize (t1, t2) */
1114 head_t = t1 + t2;
1115 tail_t = t2 - (head_t - t1);
1116 }
1117 head_tmp1[0] = head_t;
1118 tail_tmp1[0] = tail_t;
1119 /* Imaginary part */
1120 head_a = head_tmp2[1];
1121 tail_a = tail_tmp2[1];
1122 head_b = head_tmp1[1];
1123 tail_b = tail_tmp1[1];
1124 {
1125 /* Compute double-double = double-double + double-double. */
1126 double bv;
1127 double s1, s2, t1, t2;
1128
1129 /* Add two hi words. */
1130 s1 = head_a + head_b;
1131 bv = s1 - head_a;
1132 s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1133
1134 /* Add two lo words. */
1135 t1 = tail_a + tail_b;
1136 bv = t1 - tail_a;
1137 t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1138
1139 s2 += t1;
1140
1141 /* Renormalize (s1, s2) to (t1, s2) */
1142 t1 = s1 + s2;
1143 s2 = s2 - (t1 - s1);
1144
1145 t2 += s2;
1146
1147 /* Renormalize (t1, t2) */
1148 head_t = t1 + t2;
1149 tail_t = t2 - (head_t - t1);
1150 }
1151 head_tmp1[1] = head_t;
1152 tail_tmp1[1] = tail_t;
1153 }
1154 c_i[cij] = head_tmp1[0];
1155 c_i[cij + 1] = head_tmp1[1];
1156 }
1157 }
1158 }
1159
1160 } else {
1161
1162 /* The most general form, C <-- alpha * A * B + beta * C */
1163 ci = 0;
1164 ai = 0;
1165 for (i = 0; i < m; i++, ci += incci, ai += incai) {
1166
1167 cij = ci;
1168 bj = 0;
1169
1170 for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
1171
1172 aih = ai;
1173 bhj = bj;
1174
1175 head_sum = tail_sum = 0.0;
1176
1177 for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
1178 a_elem = a_i[aih];
1179 b_elem = b_i[bhj];
1180 if (transa == blas_conj_trans) {
1181
1182 }
1183 if (transb == blas_conj_trans) {
1184
1185 }
1186 head_prod = (double) a_elem *b_elem;
1187 tail_prod = 0.0;
1188 {
1189 /* Compute double-double = double-double + double-double. */
1190 double bv;
1191 double s1, s2, t1, t2;
1192
1193 /* Add two hi words. */
1194 s1 = head_sum + head_prod;
1195 bv = s1 - head_sum;
1196 s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
1197
1198 /* Add two lo words. */
1199 t1 = tail_sum + tail_prod;
1200 bv = t1 - tail_sum;
1201 t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
1202
1203 s2 += t1;
1204
1205 /* Renormalize (s1, s2) to (t1, s2) */
1206 t1 = s1 + s2;
1207 s2 = s2 - (t1 - s1);
1208
1209 t2 += s2;
1210
1211 /* Renormalize (t1, t2) */
1212 head_sum = t1 + t2;
1213 tail_sum = t2 - (head_sum - t1);
1214 }
1215 }
1216
1217 {
1218 double head_e1, tail_e1;
1219 double dt;
1220 dt = (double) alpha_i[0];
1221 {
1222 /* Compute double-double = double-double * double. */
1223 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
1224
1225 con = head_sum * split;
1226 a11 = con - head_sum;
1227 a11 = con - a11;
1228 a21 = head_sum - a11;
1229 con = dt * split;
1230 b1 = con - dt;
1231 b1 = con - b1;
1232 b2 = dt - b1;
1233
1234 c11 = head_sum * dt;
1235 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
1236
1237 c2 = tail_sum * dt;
1238 t1 = c11 + c2;
1239 t2 = (c2 - (t1 - c11)) + c21;
1240
1241 head_e1 = t1 + t2;
1242 tail_e1 = t2 - (head_e1 - t1);
1243 }
1244 head_tmp1[0] = head_e1;
1245 tail_tmp1[0] = tail_e1;
1246 dt = (double) alpha_i[1];
1247 {
1248 /* Compute double-double = double-double * double. */
1249 double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
1250
1251 con = head_sum * split;
1252 a11 = con - head_sum;
1253 a11 = con - a11;
1254 a21 = head_sum - a11;
1255 con = dt * split;
1256 b1 = con - dt;
1257 b1 = con - b1;
1258 b2 = dt - b1;
1259
1260 c11 = head_sum * dt;
1261 c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
1262
1263 c2 = tail_sum * dt;
1264 t1 = c11 + c2;
1265 t2 = (c2 - (t1 - c11)) + c21;
1266
1267 head_e1 = t1 + t2;
1268 tail_e1 = t2 - (head_e1 - t1);
1269 }
1270 head_tmp1[1] = head_e1;
1271 tail_tmp1[1] = tail_e1;
1272 }
1273 c_elem[0] = c_i[cij];
1274 c_elem[1] = c_i[cij + 1];
1275 {
1276 double head_e1, tail_e1;
1277 double d1;
1278 double d2;
1279 /* Real part */
1280 d1 = (double) c_elem[0] * beta_i[0];
1281 d2 = (double) -c_elem[1] * beta_i[1];
1282 {
1283 /* Compute double-double = double + double. */
1284 double e, t1, t2;
1285
1286 /* Knuth trick. */
1287 t1 = d1 + d2;
1288 e = t1 - d1;
1289 t2 = ((d2 - e) + (d1 - (t1 - e)));
1290
1291 /* The result is t1 + t2, after normalization. */
1292 head_e1 = t1 + t2;
1293 tail_e1 = t2 - (head_e1 - t1);
1294 }
1295 head_tmp2[0] = head_e1;
1296 tail_tmp2[0] = tail_e1;
1297 /* imaginary part */
1298 d1 = (double) c_elem[0] * beta_i[1];
1299 d2 = (double) c_elem[1] * beta_i[0];
1300 {
1301 /* Compute double-double = double + double. */
1302 double e, t1, t2;
1303
1304 /* Knuth trick. */
1305 t1 = d1 + d2;
1306 e = t1 - d1;
1307 t2 = ((d2 - e) + (d1 - (t1 - e)));
1308
1309 /* The result is t1 + t2, after normalization. */
1310 head_e1 = t1 + t2;
1311 tail_e1 = t2 - (head_e1 - t1);
1312 }
1313 head_tmp2[1] = head_e1;
1314 tail_tmp2[1] = tail_e1;
1315 }
1316 {
1317 double head_t, tail_t;
1318 double head_a, tail_a;
1319 double head_b, tail_b;
1320 /* Real part */
1321 head_a = head_tmp1[0];
1322 tail_a = tail_tmp1[0];
1323 head_b = head_tmp2[0];
1324 tail_b = tail_tmp2[0];
1325 {
1326 /* Compute double-double = double-double + double-double. */
1327 double bv;
1328 double s1, s2, t1, t2;
1329
1330 /* Add two hi words. */
1331 s1 = head_a + head_b;
1332 bv = s1 - head_a;
1333 s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1334
1335 /* Add two lo words. */
1336 t1 = tail_a + tail_b;
1337 bv = t1 - tail_a;
1338 t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1339
1340 s2 += t1;
1341
1342 /* Renormalize (s1, s2) to (t1, s2) */
1343 t1 = s1 + s2;
1344 s2 = s2 - (t1 - s1);
1345
1346 t2 += s2;
1347
1348 /* Renormalize (t1, t2) */
1349 head_t = t1 + t2;
1350 tail_t = t2 - (head_t - t1);
1351 }
1352 head_tmp1[0] = head_t;
1353 tail_tmp1[0] = tail_t;
1354 /* Imaginary part */
1355 head_a = head_tmp1[1];
1356 tail_a = tail_tmp1[1];
1357 head_b = head_tmp2[1];
1358 tail_b = tail_tmp2[1];
1359 {
1360 /* Compute double-double = double-double + double-double. */
1361 double bv;
1362 double s1, s2, t1, t2;
1363
1364 /* Add two hi words. */
1365 s1 = head_a + head_b;
1366 bv = s1 - head_a;
1367 s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1368
1369 /* Add two lo words. */
1370 t1 = tail_a + tail_b;
1371 bv = t1 - tail_a;
1372 t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1373
1374 s2 += t1;
1375
1376 /* Renormalize (s1, s2) to (t1, s2) */
1377 t1 = s1 + s2;
1378 s2 = s2 - (t1 - s1);
1379
1380 t2 += s2;
1381
1382 /* Renormalize (t1, t2) */
1383 head_t = t1 + t2;
1384 tail_t = t2 - (head_t - t1);
1385 }
1386 head_tmp1[1] = head_t;
1387 tail_tmp1[1] = tail_t;
1388 }
1389 c_i[cij] = head_tmp1[0];
1390 c_i[cij + 1] = head_tmp1[1];
1391 }
1392 }
1393
1394 }
1395
1396 FPU_FIX_STOP;
1397
1398 break;
1399 }
1400 }
1401 }
1402