1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_cgemm_s_s_x(enum blas_order_type order,enum blas_trans_type transa,enum blas_trans_type transb,int m,int n,int k,const void * alpha,const float * a,int lda,const float * b,int ldb,const void * beta,void * c,int ldc,enum blas_prec_type prec)3 void BLAS_cgemm_s_s_x(enum blas_order_type order, enum blas_trans_type transa,
4 		      enum blas_trans_type transb, int m, int n, int k,
5 		      const void *alpha, const float *a, int lda,
6 		      const float *b, int ldb, const void *beta, void *c,
7 		      int ldc, enum blas_prec_type prec)
8 
9 /*
10  * Purpose
11  * =======
12  *
13  * This routine computes the matrix product:
14  *
15  *      C   <-  alpha * op(A) * op(B)  +  beta * C .
16  *
17  * where op(M) represents either M, M transpose,
18  * or M conjugate transpose.
19  *
20  * Arguments
21  * =========
22  *
23  * order   (input) enum blas_order_type
24  *         Storage format of input matrices A, B, and C.
25  *
26  * transa  (input) enum blas_trans_type
27  *         Operation to be done on matrix A before multiplication.
28  *         Can be no operation, transposition, or conjugate transposition.
29  *
30  * transb  (input) enum blas_trans_type
31  *         Operation to be done on matrix B before multiplication.
32  *         Can be no operation, transposition, or conjugate transposition.
33  *
34  * m n k   (input) int
35  *         The dimensions of matrices A, B, and C.
36  *         Matrix C is m-by-n matrix.
37  *         Matrix A is m-by-k if A is not transposed,
38  *                     k-by-m otherwise.
39  *         Matrix B is k-by-n if B is not transposed,
40  *                     n-by-k otherwise.
41  *
42  * alpha   (input) const void*
43  *
44  * a       (input) const float*
45  *         matrix A.
46  *
47  * lda     (input) int
48  *         leading dimension of A.
49  *
50  * b       (input) const float*
51  *         matrix B
52  *
53  * ldb     (input) int
54  *         leading dimension of B.
55  *
56  * beta    (input) const void*
57  *
58  * c       (input/output) void*
59  *         matrix C
60  *
61  * ldc     (input) int
62  *         leading dimension of C.
63  *
64  * prec   (input) enum blas_prec_type
65  *        Specifies the internal precision to be used.
66  *        = blas_prec_single: single precision.
67  *        = blas_prec_double: double precision.
68  *        = blas_prec_extra : anything at least 1.5 times as accurate
69  *                            than double, and wider than 80-bits.
70  *                            We use double-double in our implementation.
71  *
72  */
73 {
74   static const char routine_name[] = "BLAS_cgemm_s_s_x";
75   switch (prec) {
76 
77   case blas_prec_single:{
78 
79 
80       /* Integer Index Variables */
81       int i, j, h;
82 
83       int ai, bj, ci;
84       int aih, bhj, cij;	/* Index into matrices a, b, c during multiply */
85 
86       int incai, incaih;	/* Index increments for matrix a */
87       int incbj, incbhj;	/* Index increments for matrix b */
88       int incci, inccij;	/* Index increments for matrix c */
89 
90       /* Input Matrices */
91       const float *a_i = a;
92       const float *b_i = b;
93 
94       /* Output Matrix */
95       float *c_i = (float *) c;
96 
97       /* Input Scalars */
98       float *alpha_i = (float *) alpha;
99       float *beta_i = (float *) beta;
100 
101       /* Temporary Floating-Point Variables */
102       float a_elem;
103       float b_elem;
104       float c_elem[2];
105       float prod;
106       float sum;
107       float tmp1[2];
108       float tmp2[2];
109 
110 
111 
112       /* Test for error conditions */
113       if (m < 0)
114 	BLAS_error(routine_name, -4, m, NULL);
115       if (n < 0)
116 	BLAS_error(routine_name, -5, n, NULL);
117       if (k < 0)
118 	BLAS_error(routine_name, -6, k, NULL);
119 
120       if (order == blas_colmajor) {
121 
122 	if (ldc < m)
123 	  BLAS_error(routine_name, -14, ldc, NULL);
124 
125 	if (transa == blas_no_trans) {
126 	  if (lda < m)
127 	    BLAS_error(routine_name, -9, lda, NULL);
128 	} else {
129 	  if (lda < k)
130 	    BLAS_error(routine_name, -9, lda, NULL);
131 	}
132 
133 	if (transb == blas_no_trans) {
134 	  if (ldb < k)
135 	    BLAS_error(routine_name, -11, ldb, NULL);
136 	} else {
137 	  if (ldb < n)
138 	    BLAS_error(routine_name, -11, ldb, NULL);
139 	}
140 
141       } else {
142 	/* row major */
143 	if (ldc < n)
144 	  BLAS_error(routine_name, -14, ldc, NULL);
145 
146 	if (transa == blas_no_trans) {
147 	  if (lda < k)
148 	    BLAS_error(routine_name, -9, lda, NULL);
149 	} else {
150 	  if (lda < m)
151 	    BLAS_error(routine_name, -9, lda, NULL);
152 	}
153 
154 	if (transb == blas_no_trans) {
155 	  if (ldb < n)
156 	    BLAS_error(routine_name, -11, ldb, NULL);
157 	} else {
158 	  if (ldb < k)
159 	    BLAS_error(routine_name, -11, ldb, NULL);
160 	}
161       }
162 
163       /* Test for no-op */
164       if (n == 0 || m == 0 || k == 0)
165 	return;
166       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
167 	  && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
168 	return;
169       }
170 
171       /* Set Index Parameters */
172       if (order == blas_colmajor) {
173 	incci = 1;
174 	inccij = ldc;
175 
176 	if (transa == blas_no_trans) {
177 	  incai = 1;
178 	  incaih = lda;
179 	} else {
180 	  incai = lda;
181 	  incaih = 1;
182 	}
183 
184 	if (transb == blas_no_trans) {
185 	  incbj = ldb;
186 	  incbhj = 1;
187 	} else {
188 	  incbj = 1;
189 	  incbhj = ldb;
190 	}
191 
192       } else {
193 	/* row major */
194 	incci = ldc;
195 	inccij = 1;
196 
197 	if (transa == blas_no_trans) {
198 	  incai = lda;
199 	  incaih = 1;
200 	} else {
201 	  incai = 1;
202 	  incaih = lda;
203 	}
204 
205 	if (transb == blas_no_trans) {
206 	  incbj = 1;
207 	  incbhj = ldb;
208 	} else {
209 	  incbj = ldb;
210 	  incbhj = 1;
211 	}
212 
213       }
214 
215 
216 
217       /* Ajustment to increments */
218       incci *= 2;
219       inccij *= 2;
220 
221 
222 
223 
224 
225       /* alpha = 0.  In this case, just return beta * C */
226       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
227 
228 	ci = 0;
229 	for (i = 0; i < m; i++, ci += incci) {
230 	  cij = ci;
231 	  for (j = 0; j < n; j++, cij += inccij) {
232 	    c_elem[0] = c_i[cij];
233 	    c_elem[1] = c_i[cij + 1];
234 	    {
235 	      tmp1[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
236 	      tmp1[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
237 	    }
238 
239 	    c_i[cij] = tmp1[0];
240 	    c_i[cij + 1] = tmp1[1];
241 	  }
242 	}
243 
244       } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
245 
246 	/* Case alpha == 1. */
247 
248 	if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
249 	  /* Case alpha == 1, beta == 0.   We compute  C <--- A * B */
250 
251 	  ci = 0;
252 	  ai = 0;
253 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
254 
255 	    cij = ci;
256 	    bj = 0;
257 
258 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
259 
260 	      aih = ai;
261 	      bhj = bj;
262 
263 	      sum = 0.0;
264 
265 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
266 		a_elem = a_i[aih];
267 		b_elem = b_i[bhj];
268 		if (transa == blas_conj_trans) {
269 
270 		}
271 		if (transb == blas_conj_trans) {
272 
273 		}
274 		prod = a_elem * b_elem;
275 		sum = sum + prod;
276 	      }
277 	      tmp1[0] = sum;
278 	      tmp1[1] = 0.0;
279 	      c_i[cij] = tmp1[0];
280 	      c_i[cij + 1] = tmp1[1];
281 	    }
282 	  }
283 
284 	} else {
285 	  /* Case alpha == 1, but beta != 0.
286 	     We compute   C <--- A * B + beta * C   */
287 
288 	  ci = 0;
289 	  ai = 0;
290 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
291 
292 	    cij = ci;
293 	    bj = 0;
294 
295 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
296 
297 	      aih = ai;
298 	      bhj = bj;
299 
300 	      sum = 0.0;
301 
302 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
303 		a_elem = a_i[aih];
304 		b_elem = b_i[bhj];
305 		if (transa == blas_conj_trans) {
306 
307 		}
308 		if (transb == blas_conj_trans) {
309 
310 		}
311 		prod = a_elem * b_elem;
312 		sum = sum + prod;
313 	      }
314 
315 	      c_elem[0] = c_i[cij];
316 	      c_elem[1] = c_i[cij + 1];
317 	      {
318 		tmp2[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
319 		tmp2[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
320 	      }
321 
322 	      tmp1[0] = sum;
323 	      tmp1[1] = 0.0;
324 	      tmp1[0] = tmp2[0] + tmp1[0];
325 	      tmp1[1] = tmp2[1] + tmp1[1];
326 	      c_i[cij] = tmp1[0];
327 	      c_i[cij + 1] = tmp1[1];
328 	    }
329 	  }
330 	}
331 
332       } else {
333 
334 	/* The most general form,   C <-- alpha * A * B + beta * C  */
335 	ci = 0;
336 	ai = 0;
337 	for (i = 0; i < m; i++, ci += incci, ai += incai) {
338 
339 	  cij = ci;
340 	  bj = 0;
341 
342 	  for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
343 
344 	    aih = ai;
345 	    bhj = bj;
346 
347 	    sum = 0.0;
348 
349 	    for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
350 	      a_elem = a_i[aih];
351 	      b_elem = b_i[bhj];
352 	      if (transa == blas_conj_trans) {
353 
354 	      }
355 	      if (transb == blas_conj_trans) {
356 
357 	      }
358 	      prod = a_elem * b_elem;
359 	      sum = sum + prod;
360 	    }
361 
362 	    {
363 	      tmp1[0] = alpha_i[0] * sum;
364 	      tmp1[1] = alpha_i[1] * sum;
365 	    }
366 	    c_elem[0] = c_i[cij];
367 	    c_elem[1] = c_i[cij + 1];
368 	    {
369 	      tmp2[0] = c_elem[0] * beta_i[0] - c_elem[1] * beta_i[1];
370 	      tmp2[1] = c_elem[0] * beta_i[1] + c_elem[1] * beta_i[0];
371 	    }
372 
373 	    tmp1[0] = tmp1[0] + tmp2[0];
374 	    tmp1[1] = tmp1[1] + tmp2[1];
375 	    c_i[cij] = tmp1[0];
376 	    c_i[cij + 1] = tmp1[1];
377 	  }
378 	}
379 
380       }
381 
382 
383 
384       break;
385     }
386   case blas_prec_double:
387   case blas_prec_indigenous:{
388 
389 
390       /* Integer Index Variables */
391       int i, j, h;
392 
393       int ai, bj, ci;
394       int aih, bhj, cij;	/* Index into matrices a, b, c during multiply */
395 
396       int incai, incaih;	/* Index increments for matrix a */
397       int incbj, incbhj;	/* Index increments for matrix b */
398       int incci, inccij;	/* Index increments for matrix c */
399 
400       /* Input Matrices */
401       const float *a_i = a;
402       const float *b_i = b;
403 
404       /* Output Matrix */
405       float *c_i = (float *) c;
406 
407       /* Input Scalars */
408       float *alpha_i = (float *) alpha;
409       float *beta_i = (float *) beta;
410 
411       /* Temporary Floating-Point Variables */
412       float a_elem;
413       float b_elem;
414       float c_elem[2];
415       double prod;
416       double sum;
417       double tmp1[2];
418       double tmp2[2];
419 
420 
421 
422       /* Test for error conditions */
423       if (m < 0)
424 	BLAS_error(routine_name, -4, m, NULL);
425       if (n < 0)
426 	BLAS_error(routine_name, -5, n, NULL);
427       if (k < 0)
428 	BLAS_error(routine_name, -6, k, NULL);
429 
430       if (order == blas_colmajor) {
431 
432 	if (ldc < m)
433 	  BLAS_error(routine_name, -14, ldc, NULL);
434 
435 	if (transa == blas_no_trans) {
436 	  if (lda < m)
437 	    BLAS_error(routine_name, -9, lda, NULL);
438 	} else {
439 	  if (lda < k)
440 	    BLAS_error(routine_name, -9, lda, NULL);
441 	}
442 
443 	if (transb == blas_no_trans) {
444 	  if (ldb < k)
445 	    BLAS_error(routine_name, -11, ldb, NULL);
446 	} else {
447 	  if (ldb < n)
448 	    BLAS_error(routine_name, -11, ldb, NULL);
449 	}
450 
451       } else {
452 	/* row major */
453 	if (ldc < n)
454 	  BLAS_error(routine_name, -14, ldc, NULL);
455 
456 	if (transa == blas_no_trans) {
457 	  if (lda < k)
458 	    BLAS_error(routine_name, -9, lda, NULL);
459 	} else {
460 	  if (lda < m)
461 	    BLAS_error(routine_name, -9, lda, NULL);
462 	}
463 
464 	if (transb == blas_no_trans) {
465 	  if (ldb < n)
466 	    BLAS_error(routine_name, -11, ldb, NULL);
467 	} else {
468 	  if (ldb < k)
469 	    BLAS_error(routine_name, -11, ldb, NULL);
470 	}
471       }
472 
473       /* Test for no-op */
474       if (n == 0 || m == 0 || k == 0)
475 	return;
476       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
477 	  && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
478 	return;
479       }
480 
481       /* Set Index Parameters */
482       if (order == blas_colmajor) {
483 	incci = 1;
484 	inccij = ldc;
485 
486 	if (transa == blas_no_trans) {
487 	  incai = 1;
488 	  incaih = lda;
489 	} else {
490 	  incai = lda;
491 	  incaih = 1;
492 	}
493 
494 	if (transb == blas_no_trans) {
495 	  incbj = ldb;
496 	  incbhj = 1;
497 	} else {
498 	  incbj = 1;
499 	  incbhj = ldb;
500 	}
501 
502       } else {
503 	/* row major */
504 	incci = ldc;
505 	inccij = 1;
506 
507 	if (transa == blas_no_trans) {
508 	  incai = lda;
509 	  incaih = 1;
510 	} else {
511 	  incai = 1;
512 	  incaih = lda;
513 	}
514 
515 	if (transb == blas_no_trans) {
516 	  incbj = 1;
517 	  incbhj = ldb;
518 	} else {
519 	  incbj = ldb;
520 	  incbhj = 1;
521 	}
522 
523       }
524 
525 
526 
527       /* Ajustment to increments */
528       incci *= 2;
529       inccij *= 2;
530 
531 
532 
533 
534 
535       /* alpha = 0.  In this case, just return beta * C */
536       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
537 
538 	ci = 0;
539 	for (i = 0; i < m; i++, ci += incci) {
540 	  cij = ci;
541 	  for (j = 0; j < n; j++, cij += inccij) {
542 	    c_elem[0] = c_i[cij];
543 	    c_elem[1] = c_i[cij + 1];
544 	    {
545 	      tmp1[0] =
546 		(double) c_elem[0] * beta_i[0] -
547 		(double) c_elem[1] * beta_i[1];
548 	      tmp1[1] =
549 		(double) c_elem[0] * beta_i[1] +
550 		(double) c_elem[1] * beta_i[0];
551 	    }
552 	    c_i[cij] = tmp1[0];
553 	    c_i[cij + 1] = tmp1[1];
554 	  }
555 	}
556 
557       } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
558 
559 	/* Case alpha == 1. */
560 
561 	if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
562 	  /* Case alpha == 1, beta == 0.   We compute  C <--- A * B */
563 
564 	  ci = 0;
565 	  ai = 0;
566 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
567 
568 	    cij = ci;
569 	    bj = 0;
570 
571 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
572 
573 	      aih = ai;
574 	      bhj = bj;
575 
576 	      sum = 0.0;
577 
578 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
579 		a_elem = a_i[aih];
580 		b_elem = b_i[bhj];
581 		if (transa == blas_conj_trans) {
582 
583 		}
584 		if (transb == blas_conj_trans) {
585 
586 		}
587 		prod = (double) a_elem *b_elem;
588 		sum = sum + prod;
589 	      }
590 	      tmp1[0] = sum;
591 	      tmp1[1] = 0.0;
592 	      c_i[cij] = tmp1[0];
593 	      c_i[cij + 1] = tmp1[1];
594 	    }
595 	  }
596 
597 	} else {
598 	  /* Case alpha == 1, but beta != 0.
599 	     We compute   C <--- A * B + beta * C   */
600 
601 	  ci = 0;
602 	  ai = 0;
603 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
604 
605 	    cij = ci;
606 	    bj = 0;
607 
608 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
609 
610 	      aih = ai;
611 	      bhj = bj;
612 
613 	      sum = 0.0;
614 
615 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
616 		a_elem = a_i[aih];
617 		b_elem = b_i[bhj];
618 		if (transa == blas_conj_trans) {
619 
620 		}
621 		if (transb == blas_conj_trans) {
622 
623 		}
624 		prod = (double) a_elem *b_elem;
625 		sum = sum + prod;
626 	      }
627 
628 	      c_elem[0] = c_i[cij];
629 	      c_elem[1] = c_i[cij + 1];
630 	      {
631 		tmp2[0] =
632 		  (double) c_elem[0] * beta_i[0] -
633 		  (double) c_elem[1] * beta_i[1];
634 		tmp2[1] =
635 		  (double) c_elem[0] * beta_i[1] +
636 		  (double) c_elem[1] * beta_i[0];
637 	      }
638 	      tmp1[0] = sum;
639 	      tmp1[1] = 0.0;
640 	      tmp1[0] = tmp2[0] + tmp1[0];
641 	      tmp1[1] = tmp2[1] + tmp1[1];
642 	      c_i[cij] = tmp1[0];
643 	      c_i[cij + 1] = tmp1[1];
644 	    }
645 	  }
646 	}
647 
648       } else {
649 
650 	/* The most general form,   C <-- alpha * A * B + beta * C  */
651 	ci = 0;
652 	ai = 0;
653 	for (i = 0; i < m; i++, ci += incci, ai += incai) {
654 
655 	  cij = ci;
656 	  bj = 0;
657 
658 	  for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
659 
660 	    aih = ai;
661 	    bhj = bj;
662 
663 	    sum = 0.0;
664 
665 	    for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
666 	      a_elem = a_i[aih];
667 	      b_elem = b_i[bhj];
668 	      if (transa == blas_conj_trans) {
669 
670 	      }
671 	      if (transb == blas_conj_trans) {
672 
673 	      }
674 	      prod = (double) a_elem *b_elem;
675 	      sum = sum + prod;
676 	    }
677 
678 	    {
679 	      tmp1[0] = alpha_i[0] * sum;
680 	      tmp1[1] = alpha_i[1] * sum;
681 	    }
682 	    c_elem[0] = c_i[cij];
683 	    c_elem[1] = c_i[cij + 1];
684 	    {
685 	      tmp2[0] =
686 		(double) c_elem[0] * beta_i[0] -
687 		(double) c_elem[1] * beta_i[1];
688 	      tmp2[1] =
689 		(double) c_elem[0] * beta_i[1] +
690 		(double) c_elem[1] * beta_i[0];
691 	    }
692 	    tmp1[0] = tmp1[0] + tmp2[0];
693 	    tmp1[1] = tmp1[1] + tmp2[1];
694 	    c_i[cij] = tmp1[0];
695 	    c_i[cij + 1] = tmp1[1];
696 	  }
697 	}
698 
699       }
700 
701 
702 
703       break;
704     }
705 
706   case blas_prec_extra:{
707 
708 
709       /* Integer Index Variables */
710       int i, j, h;
711 
712       int ai, bj, ci;
713       int aih, bhj, cij;	/* Index into matrices a, b, c during multiply */
714 
715       int incai, incaih;	/* Index increments for matrix a */
716       int incbj, incbhj;	/* Index increments for matrix b */
717       int incci, inccij;	/* Index increments for matrix c */
718 
719       /* Input Matrices */
720       const float *a_i = a;
721       const float *b_i = b;
722 
723       /* Output Matrix */
724       float *c_i = (float *) c;
725 
726       /* Input Scalars */
727       float *alpha_i = (float *) alpha;
728       float *beta_i = (float *) beta;
729 
730       /* Temporary Floating-Point Variables */
731       float a_elem;
732       float b_elem;
733       float c_elem[2];
734       double head_prod, tail_prod;
735       double head_sum, tail_sum;
736       double head_tmp1[2], tail_tmp1[2];
737       double head_tmp2[2], tail_tmp2[2];
738 
739       FPU_FIX_DECL;
740 
741       /* Test for error conditions */
742       if (m < 0)
743 	BLAS_error(routine_name, -4, m, NULL);
744       if (n < 0)
745 	BLAS_error(routine_name, -5, n, NULL);
746       if (k < 0)
747 	BLAS_error(routine_name, -6, k, NULL);
748 
749       if (order == blas_colmajor) {
750 
751 	if (ldc < m)
752 	  BLAS_error(routine_name, -14, ldc, NULL);
753 
754 	if (transa == blas_no_trans) {
755 	  if (lda < m)
756 	    BLAS_error(routine_name, -9, lda, NULL);
757 	} else {
758 	  if (lda < k)
759 	    BLAS_error(routine_name, -9, lda, NULL);
760 	}
761 
762 	if (transb == blas_no_trans) {
763 	  if (ldb < k)
764 	    BLAS_error(routine_name, -11, ldb, NULL);
765 	} else {
766 	  if (ldb < n)
767 	    BLAS_error(routine_name, -11, ldb, NULL);
768 	}
769 
770       } else {
771 	/* row major */
772 	if (ldc < n)
773 	  BLAS_error(routine_name, -14, ldc, NULL);
774 
775 	if (transa == blas_no_trans) {
776 	  if (lda < k)
777 	    BLAS_error(routine_name, -9, lda, NULL);
778 	} else {
779 	  if (lda < m)
780 	    BLAS_error(routine_name, -9, lda, NULL);
781 	}
782 
783 	if (transb == blas_no_trans) {
784 	  if (ldb < n)
785 	    BLAS_error(routine_name, -11, ldb, NULL);
786 	} else {
787 	  if (ldb < k)
788 	    BLAS_error(routine_name, -11, ldb, NULL);
789 	}
790       }
791 
792       /* Test for no-op */
793       if (n == 0 || m == 0 || k == 0)
794 	return;
795       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0
796 	  && (beta_i[0] == 1.0 && beta_i[1] == 0.0)) {
797 	return;
798       }
799 
800       /* Set Index Parameters */
801       if (order == blas_colmajor) {
802 	incci = 1;
803 	inccij = ldc;
804 
805 	if (transa == blas_no_trans) {
806 	  incai = 1;
807 	  incaih = lda;
808 	} else {
809 	  incai = lda;
810 	  incaih = 1;
811 	}
812 
813 	if (transb == blas_no_trans) {
814 	  incbj = ldb;
815 	  incbhj = 1;
816 	} else {
817 	  incbj = 1;
818 	  incbhj = ldb;
819 	}
820 
821       } else {
822 	/* row major */
823 	incci = ldc;
824 	inccij = 1;
825 
826 	if (transa == blas_no_trans) {
827 	  incai = lda;
828 	  incaih = 1;
829 	} else {
830 	  incai = 1;
831 	  incaih = lda;
832 	}
833 
834 	if (transb == blas_no_trans) {
835 	  incbj = 1;
836 	  incbhj = ldb;
837 	} else {
838 	  incbj = ldb;
839 	  incbhj = 1;
840 	}
841 
842       }
843 
844       FPU_FIX_START;
845 
846       /* Ajustment to increments */
847       incci *= 2;
848       inccij *= 2;
849 
850 
851 
852 
853 
854       /* alpha = 0.  In this case, just return beta * C */
855       if (alpha_i[0] == 0.0 && alpha_i[1] == 0.0) {
856 
857 	ci = 0;
858 	for (i = 0; i < m; i++, ci += incci) {
859 	  cij = ci;
860 	  for (j = 0; j < n; j++, cij += inccij) {
861 	    c_elem[0] = c_i[cij];
862 	    c_elem[1] = c_i[cij + 1];
863 	    {
864 	      double head_e1, tail_e1;
865 	      double d1;
866 	      double d2;
867 	      /* Real part */
868 	      d1 = (double) c_elem[0] * beta_i[0];
869 	      d2 = (double) -c_elem[1] * beta_i[1];
870 	      {
871 		/* Compute double-double = double + double. */
872 		double e, t1, t2;
873 
874 		/* Knuth trick. */
875 		t1 = d1 + d2;
876 		e = t1 - d1;
877 		t2 = ((d2 - e) + (d1 - (t1 - e)));
878 
879 		/* The result is t1 + t2, after normalization. */
880 		head_e1 = t1 + t2;
881 		tail_e1 = t2 - (head_e1 - t1);
882 	      }
883 	      head_tmp1[0] = head_e1;
884 	      tail_tmp1[0] = tail_e1;
885 	      /* imaginary part */
886 	      d1 = (double) c_elem[0] * beta_i[1];
887 	      d2 = (double) c_elem[1] * beta_i[0];
888 	      {
889 		/* Compute double-double = double + double. */
890 		double e, t1, t2;
891 
892 		/* Knuth trick. */
893 		t1 = d1 + d2;
894 		e = t1 - d1;
895 		t2 = ((d2 - e) + (d1 - (t1 - e)));
896 
897 		/* The result is t1 + t2, after normalization. */
898 		head_e1 = t1 + t2;
899 		tail_e1 = t2 - (head_e1 - t1);
900 	      }
901 	      head_tmp1[1] = head_e1;
902 	      tail_tmp1[1] = tail_e1;
903 	    }
904 	    c_i[cij] = head_tmp1[0];
905 	    c_i[cij + 1] = head_tmp1[1];
906 	  }
907 	}
908 
909       } else if ((alpha_i[0] == 1.0 && alpha_i[1] == 0.0)) {
910 
911 	/* Case alpha == 1. */
912 
913 	if (beta_i[0] == 0.0 && beta_i[1] == 0.0) {
914 	  /* Case alpha == 1, beta == 0.   We compute  C <--- A * B */
915 
916 	  ci = 0;
917 	  ai = 0;
918 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
919 
920 	    cij = ci;
921 	    bj = 0;
922 
923 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
924 
925 	      aih = ai;
926 	      bhj = bj;
927 
928 	      head_sum = tail_sum = 0.0;
929 
930 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
931 		a_elem = a_i[aih];
932 		b_elem = b_i[bhj];
933 		if (transa == blas_conj_trans) {
934 
935 		}
936 		if (transb == blas_conj_trans) {
937 
938 		}
939 		head_prod = (double) a_elem *b_elem;
940 		tail_prod = 0.0;
941 		{
942 		  /* Compute double-double = double-double + double-double. */
943 		  double bv;
944 		  double s1, s2, t1, t2;
945 
946 		  /* Add two hi words. */
947 		  s1 = head_sum + head_prod;
948 		  bv = s1 - head_sum;
949 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
950 
951 		  /* Add two lo words. */
952 		  t1 = tail_sum + tail_prod;
953 		  bv = t1 - tail_sum;
954 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
955 
956 		  s2 += t1;
957 
958 		  /* Renormalize (s1, s2)  to  (t1, s2) */
959 		  t1 = s1 + s2;
960 		  s2 = s2 - (t1 - s1);
961 
962 		  t2 += s2;
963 
964 		  /* Renormalize (t1, t2)  */
965 		  head_sum = t1 + t2;
966 		  tail_sum = t2 - (head_sum - t1);
967 		}
968 	      }
969 	      head_tmp1[0] = head_sum;
970 	      tail_tmp1[0] = tail_sum;
971 	      head_tmp1[1] = tail_tmp1[1] = 0.0;
972 	      c_i[cij] = head_tmp1[0];
973 	      c_i[cij + 1] = head_tmp1[1];
974 	    }
975 	  }
976 
977 	} else {
978 	  /* Case alpha == 1, but beta != 0.
979 	     We compute   C <--- A * B + beta * C   */
980 
981 	  ci = 0;
982 	  ai = 0;
983 	  for (i = 0; i < m; i++, ci += incci, ai += incai) {
984 
985 	    cij = ci;
986 	    bj = 0;
987 
988 	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
989 
990 	      aih = ai;
991 	      bhj = bj;
992 
993 	      head_sum = tail_sum = 0.0;
994 
995 	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
996 		a_elem = a_i[aih];
997 		b_elem = b_i[bhj];
998 		if (transa == blas_conj_trans) {
999 
1000 		}
1001 		if (transb == blas_conj_trans) {
1002 
1003 		}
1004 		head_prod = (double) a_elem *b_elem;
1005 		tail_prod = 0.0;
1006 		{
1007 		  /* Compute double-double = double-double + double-double. */
1008 		  double bv;
1009 		  double s1, s2, t1, t2;
1010 
1011 		  /* Add two hi words. */
1012 		  s1 = head_sum + head_prod;
1013 		  bv = s1 - head_sum;
1014 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
1015 
1016 		  /* Add two lo words. */
1017 		  t1 = tail_sum + tail_prod;
1018 		  bv = t1 - tail_sum;
1019 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
1020 
1021 		  s2 += t1;
1022 
1023 		  /* Renormalize (s1, s2)  to  (t1, s2) */
1024 		  t1 = s1 + s2;
1025 		  s2 = s2 - (t1 - s1);
1026 
1027 		  t2 += s2;
1028 
1029 		  /* Renormalize (t1, t2)  */
1030 		  head_sum = t1 + t2;
1031 		  tail_sum = t2 - (head_sum - t1);
1032 		}
1033 	      }
1034 
1035 	      c_elem[0] = c_i[cij];
1036 	      c_elem[1] = c_i[cij + 1];
1037 	      {
1038 		double head_e1, tail_e1;
1039 		double d1;
1040 		double d2;
1041 		/* Real part */
1042 		d1 = (double) c_elem[0] * beta_i[0];
1043 		d2 = (double) -c_elem[1] * beta_i[1];
1044 		{
1045 		  /* Compute double-double = double + double. */
1046 		  double e, t1, t2;
1047 
1048 		  /* Knuth trick. */
1049 		  t1 = d1 + d2;
1050 		  e = t1 - d1;
1051 		  t2 = ((d2 - e) + (d1 - (t1 - e)));
1052 
1053 		  /* The result is t1 + t2, after normalization. */
1054 		  head_e1 = t1 + t2;
1055 		  tail_e1 = t2 - (head_e1 - t1);
1056 		}
1057 		head_tmp2[0] = head_e1;
1058 		tail_tmp2[0] = tail_e1;
1059 		/* imaginary part */
1060 		d1 = (double) c_elem[0] * beta_i[1];
1061 		d2 = (double) c_elem[1] * beta_i[0];
1062 		{
1063 		  /* Compute double-double = double + double. */
1064 		  double e, t1, t2;
1065 
1066 		  /* Knuth trick. */
1067 		  t1 = d1 + d2;
1068 		  e = t1 - d1;
1069 		  t2 = ((d2 - e) + (d1 - (t1 - e)));
1070 
1071 		  /* The result is t1 + t2, after normalization. */
1072 		  head_e1 = t1 + t2;
1073 		  tail_e1 = t2 - (head_e1 - t1);
1074 		}
1075 		head_tmp2[1] = head_e1;
1076 		tail_tmp2[1] = tail_e1;
1077 	      }
1078 	      head_tmp1[0] = head_sum;
1079 	      tail_tmp1[0] = tail_sum;
1080 	      head_tmp1[1] = tail_tmp1[1] = 0.0;
1081 	      {
1082 		double head_t, tail_t;
1083 		double head_a, tail_a;
1084 		double head_b, tail_b;
1085 		/* Real part */
1086 		head_a = head_tmp2[0];
1087 		tail_a = tail_tmp2[0];
1088 		head_b = head_tmp1[0];
1089 		tail_b = tail_tmp1[0];
1090 		{
1091 		  /* Compute double-double = double-double + double-double. */
1092 		  double bv;
1093 		  double s1, s2, t1, t2;
1094 
1095 		  /* Add two hi words. */
1096 		  s1 = head_a + head_b;
1097 		  bv = s1 - head_a;
1098 		  s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1099 
1100 		  /* Add two lo words. */
1101 		  t1 = tail_a + tail_b;
1102 		  bv = t1 - tail_a;
1103 		  t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1104 
1105 		  s2 += t1;
1106 
1107 		  /* Renormalize (s1, s2)  to  (t1, s2) */
1108 		  t1 = s1 + s2;
1109 		  s2 = s2 - (t1 - s1);
1110 
1111 		  t2 += s2;
1112 
1113 		  /* Renormalize (t1, t2)  */
1114 		  head_t = t1 + t2;
1115 		  tail_t = t2 - (head_t - t1);
1116 		}
1117 		head_tmp1[0] = head_t;
1118 		tail_tmp1[0] = tail_t;
1119 		/* Imaginary part */
1120 		head_a = head_tmp2[1];
1121 		tail_a = tail_tmp2[1];
1122 		head_b = head_tmp1[1];
1123 		tail_b = tail_tmp1[1];
1124 		{
1125 		  /* Compute double-double = double-double + double-double. */
1126 		  double bv;
1127 		  double s1, s2, t1, t2;
1128 
1129 		  /* Add two hi words. */
1130 		  s1 = head_a + head_b;
1131 		  bv = s1 - head_a;
1132 		  s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1133 
1134 		  /* Add two lo words. */
1135 		  t1 = tail_a + tail_b;
1136 		  bv = t1 - tail_a;
1137 		  t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1138 
1139 		  s2 += t1;
1140 
1141 		  /* Renormalize (s1, s2)  to  (t1, s2) */
1142 		  t1 = s1 + s2;
1143 		  s2 = s2 - (t1 - s1);
1144 
1145 		  t2 += s2;
1146 
1147 		  /* Renormalize (t1, t2)  */
1148 		  head_t = t1 + t2;
1149 		  tail_t = t2 - (head_t - t1);
1150 		}
1151 		head_tmp1[1] = head_t;
1152 		tail_tmp1[1] = tail_t;
1153 	      }
1154 	      c_i[cij] = head_tmp1[0];
1155 	      c_i[cij + 1] = head_tmp1[1];
1156 	    }
1157 	  }
1158 	}
1159 
1160       } else {
1161 
1162 	/* The most general form,   C <-- alpha * A * B + beta * C  */
1163 	ci = 0;
1164 	ai = 0;
1165 	for (i = 0; i < m; i++, ci += incci, ai += incai) {
1166 
1167 	  cij = ci;
1168 	  bj = 0;
1169 
1170 	  for (j = 0; j < n; j++, cij += inccij, bj += incbj) {
1171 
1172 	    aih = ai;
1173 	    bhj = bj;
1174 
1175 	    head_sum = tail_sum = 0.0;
1176 
1177 	    for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
1178 	      a_elem = a_i[aih];
1179 	      b_elem = b_i[bhj];
1180 	      if (transa == blas_conj_trans) {
1181 
1182 	      }
1183 	      if (transb == blas_conj_trans) {
1184 
1185 	      }
1186 	      head_prod = (double) a_elem *b_elem;
1187 	      tail_prod = 0.0;
1188 	      {
1189 		/* Compute double-double = double-double + double-double. */
1190 		double bv;
1191 		double s1, s2, t1, t2;
1192 
1193 		/* Add two hi words. */
1194 		s1 = head_sum + head_prod;
1195 		bv = s1 - head_sum;
1196 		s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
1197 
1198 		/* Add two lo words. */
1199 		t1 = tail_sum + tail_prod;
1200 		bv = t1 - tail_sum;
1201 		t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
1202 
1203 		s2 += t1;
1204 
1205 		/* Renormalize (s1, s2)  to  (t1, s2) */
1206 		t1 = s1 + s2;
1207 		s2 = s2 - (t1 - s1);
1208 
1209 		t2 += s2;
1210 
1211 		/* Renormalize (t1, t2)  */
1212 		head_sum = t1 + t2;
1213 		tail_sum = t2 - (head_sum - t1);
1214 	      }
1215 	    }
1216 
1217 	    {
1218 	      double head_e1, tail_e1;
1219 	      double dt;
1220 	      dt = (double) alpha_i[0];
1221 	      {
1222 		/* Compute double-double = double-double * double. */
1223 		double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
1224 
1225 		con = head_sum * split;
1226 		a11 = con - head_sum;
1227 		a11 = con - a11;
1228 		a21 = head_sum - a11;
1229 		con = dt * split;
1230 		b1 = con - dt;
1231 		b1 = con - b1;
1232 		b2 = dt - b1;
1233 
1234 		c11 = head_sum * dt;
1235 		c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
1236 
1237 		c2 = tail_sum * dt;
1238 		t1 = c11 + c2;
1239 		t2 = (c2 - (t1 - c11)) + c21;
1240 
1241 		head_e1 = t1 + t2;
1242 		tail_e1 = t2 - (head_e1 - t1);
1243 	      }
1244 	      head_tmp1[0] = head_e1;
1245 	      tail_tmp1[0] = tail_e1;
1246 	      dt = (double) alpha_i[1];
1247 	      {
1248 		/* Compute double-double = double-double * double. */
1249 		double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
1250 
1251 		con = head_sum * split;
1252 		a11 = con - head_sum;
1253 		a11 = con - a11;
1254 		a21 = head_sum - a11;
1255 		con = dt * split;
1256 		b1 = con - dt;
1257 		b1 = con - b1;
1258 		b2 = dt - b1;
1259 
1260 		c11 = head_sum * dt;
1261 		c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
1262 
1263 		c2 = tail_sum * dt;
1264 		t1 = c11 + c2;
1265 		t2 = (c2 - (t1 - c11)) + c21;
1266 
1267 		head_e1 = t1 + t2;
1268 		tail_e1 = t2 - (head_e1 - t1);
1269 	      }
1270 	      head_tmp1[1] = head_e1;
1271 	      tail_tmp1[1] = tail_e1;
1272 	    }
1273 	    c_elem[0] = c_i[cij];
1274 	    c_elem[1] = c_i[cij + 1];
1275 	    {
1276 	      double head_e1, tail_e1;
1277 	      double d1;
1278 	      double d2;
1279 	      /* Real part */
1280 	      d1 = (double) c_elem[0] * beta_i[0];
1281 	      d2 = (double) -c_elem[1] * beta_i[1];
1282 	      {
1283 		/* Compute double-double = double + double. */
1284 		double e, t1, t2;
1285 
1286 		/* Knuth trick. */
1287 		t1 = d1 + d2;
1288 		e = t1 - d1;
1289 		t2 = ((d2 - e) + (d1 - (t1 - e)));
1290 
1291 		/* The result is t1 + t2, after normalization. */
1292 		head_e1 = t1 + t2;
1293 		tail_e1 = t2 - (head_e1 - t1);
1294 	      }
1295 	      head_tmp2[0] = head_e1;
1296 	      tail_tmp2[0] = tail_e1;
1297 	      /* imaginary part */
1298 	      d1 = (double) c_elem[0] * beta_i[1];
1299 	      d2 = (double) c_elem[1] * beta_i[0];
1300 	      {
1301 		/* Compute double-double = double + double. */
1302 		double e, t1, t2;
1303 
1304 		/* Knuth trick. */
1305 		t1 = d1 + d2;
1306 		e = t1 - d1;
1307 		t2 = ((d2 - e) + (d1 - (t1 - e)));
1308 
1309 		/* The result is t1 + t2, after normalization. */
1310 		head_e1 = t1 + t2;
1311 		tail_e1 = t2 - (head_e1 - t1);
1312 	      }
1313 	      head_tmp2[1] = head_e1;
1314 	      tail_tmp2[1] = tail_e1;
1315 	    }
1316 	    {
1317 	      double head_t, tail_t;
1318 	      double head_a, tail_a;
1319 	      double head_b, tail_b;
1320 	      /* Real part */
1321 	      head_a = head_tmp1[0];
1322 	      tail_a = tail_tmp1[0];
1323 	      head_b = head_tmp2[0];
1324 	      tail_b = tail_tmp2[0];
1325 	      {
1326 		/* Compute double-double = double-double + double-double. */
1327 		double bv;
1328 		double s1, s2, t1, t2;
1329 
1330 		/* Add two hi words. */
1331 		s1 = head_a + head_b;
1332 		bv = s1 - head_a;
1333 		s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1334 
1335 		/* Add two lo words. */
1336 		t1 = tail_a + tail_b;
1337 		bv = t1 - tail_a;
1338 		t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1339 
1340 		s2 += t1;
1341 
1342 		/* Renormalize (s1, s2)  to  (t1, s2) */
1343 		t1 = s1 + s2;
1344 		s2 = s2 - (t1 - s1);
1345 
1346 		t2 += s2;
1347 
1348 		/* Renormalize (t1, t2)  */
1349 		head_t = t1 + t2;
1350 		tail_t = t2 - (head_t - t1);
1351 	      }
1352 	      head_tmp1[0] = head_t;
1353 	      tail_tmp1[0] = tail_t;
1354 	      /* Imaginary part */
1355 	      head_a = head_tmp1[1];
1356 	      tail_a = tail_tmp1[1];
1357 	      head_b = head_tmp2[1];
1358 	      tail_b = tail_tmp2[1];
1359 	      {
1360 		/* Compute double-double = double-double + double-double. */
1361 		double bv;
1362 		double s1, s2, t1, t2;
1363 
1364 		/* Add two hi words. */
1365 		s1 = head_a + head_b;
1366 		bv = s1 - head_a;
1367 		s2 = ((head_b - bv) + (head_a - (s1 - bv)));
1368 
1369 		/* Add two lo words. */
1370 		t1 = tail_a + tail_b;
1371 		bv = t1 - tail_a;
1372 		t2 = ((tail_b - bv) + (tail_a - (t1 - bv)));
1373 
1374 		s2 += t1;
1375 
1376 		/* Renormalize (s1, s2)  to  (t1, s2) */
1377 		t1 = s1 + s2;
1378 		s2 = s2 - (t1 - s1);
1379 
1380 		t2 += s2;
1381 
1382 		/* Renormalize (t1, t2)  */
1383 		head_t = t1 + t2;
1384 		tail_t = t2 - (head_t - t1);
1385 	      }
1386 	      head_tmp1[1] = head_t;
1387 	      tail_tmp1[1] = tail_t;
1388 	    }
1389 	    c_i[cij] = head_tmp1[0];
1390 	    c_i[cij + 1] = head_tmp1[1];
1391 	  }
1392 	}
1393 
1394       }
1395 
1396       FPU_FIX_STOP;
1397 
1398       break;
1399     }
1400   }
1401 }
1402