1 #include "blas_extended.h"
2 #include "blas_extended_private.h"
BLAS_ssymm_x(enum blas_order_type order,enum blas_side_type side,enum blas_uplo_type uplo,int m,int n,float alpha,const float * a,int lda,const float * b,int ldb,float beta,float * c,int ldc,enum blas_prec_type prec)3 void BLAS_ssymm_x(enum blas_order_type order, enum blas_side_type side,
4 		  enum blas_uplo_type uplo, int m, int n,
5 		  float alpha, const float *a, int lda,
6 		  const float *b, int ldb, float beta,
7 		  float *c, int ldc, enum blas_prec_type prec)
8 
9 /*
10  * Purpose
11  * =======
12  *
13  * This routines computes one of the matrix product:
14  *
15  *     C  <-  alpha * A * B  +  beta * C
16  *     C  <-  alpha * B * A  +  beta * C
17  *
18  * where A is a symmetric matrix.
19  *
20  * Arguments
21  * =========
22  *
23  * order   (input) enum blas_order_type
24  *         Storage format of input matrices A, B, and C.
25  *
26  * side    (input) enum blas_side_type
27  *         Determines which side of matrix B is matrix A is multiplied.
28  *
29  * uplo    (input) enum blas_uplo_type
30  *         Determines which half of matrix A (upper or lower triangle)
31  *         is accessed.
32  *
33  * m n     (input) int
34  *         Size of matrices A, B, and C.
35  *         Matrix A is m-by-m if it is multiplied on the left,
36  *                     n-by-n otherwise.
37  *         Matrices B and C are m-by-n.
38  *
39  * alpha   (input) float
40  *
41  * a       (input) const float*
42  *         Matrix A.
43  *
44  * lda     (input) int
45  *         Leading dimension of matrix A.
46  *
47  * b       (input) const float*
48  *         Matrix B.
49  *
50  * ldb     (input) int
51  *         Leading dimension of matrix B.
52  *
53  * beta    (input) float
54  *
55  * c       (input/output) float*
56  *         Matrix C.
57  *
58  * ldc     (input) int
59  *         Leading dimension of matrix C.
60  *
61  * prec   (input) enum blas_prec_type
62  *        Specifies the internal precision to be used.
63  *        = blas_prec_single: single precision.
64  *        = blas_prec_double: double precision.
65  *        = blas_prec_extra : anything at least 1.5 times as accurate
66  *                            than double, and wider than 80-bits.
67  *                            We use double-double in our implementation.
68  *
69  */
70 {
71   switch (prec) {
72 
73   case blas_prec_single:{
74 
75       /* Integer Index Variables */
76       int i, j, k;
77 
78       int ai, bj, ci;
79       int aik, bkj, cij;
80 
81       int incai, incbj, incci;
82       int incaik1, incaik2, incbkj, inccij;
83 
84       int m_i, n_i;
85 
86       /* Input Matrices */
87       const float *a_i = a;
88       const float *b_i = b;
89 
90       /* Output Matrix */
91       float *c_i = c;
92 
93       /* Input Scalars */
94       float alpha_i = alpha;
95       float beta_i = beta;
96 
97       /* Temporary Floating-Point Variables */
98       float a_elem;
99       float b_elem;
100       float c_elem;
101       float prod;
102       float sum;
103       float tmp1;
104       float tmp2;
105 
106 
107 
108       /* Check for error conditions. */
109       if (m <= 0 || n <= 0) {
110 	return;
111       }
112 
113       if (order == blas_colmajor && (ldb < m || ldc < m)) {
114 	return;
115       }
116       if (order == blas_rowmajor && (ldb < n || ldc < n)) {
117 	return;
118       }
119       if (side == blas_left_side && lda < m) {
120 	return;
121       }
122       if (side == blas_right_side && lda < n) {
123 	return;
124       }
125 
126       /* Test for no-op */
127       if (alpha_i == 0.0 && beta_i == 1.0) {
128 	return;
129       }
130 
131       /* Set Index Parameters */
132       if (side == blas_left_side) {
133 	m_i = m;
134 	n_i = n;
135       } else {
136 	m_i = n;
137 	n_i = m;
138       }
139 
140       if ((order == blas_colmajor && side == blas_left_side) ||
141 	  (order == blas_rowmajor && side == blas_right_side)) {
142 	incbj = ldb;
143 	incbkj = 1;
144 	incci = 1;
145 	inccij = ldc;
146       } else {
147 	incbj = 1;
148 	incbkj = ldb;
149 	incci = ldc;
150 	inccij = 1;
151       }
152 
153       if ((order == blas_colmajor && uplo == blas_upper) ||
154 	  (order == blas_rowmajor && uplo == blas_lower)) {
155 	incai = lda;
156 	incaik1 = 1;
157 	incaik2 = lda;
158       } else {
159 	incai = 1;
160 	incaik1 = lda;
161 	incaik2 = 1;
162       }
163 
164 
165 
166       /* Adjustment to increments (if any) */
167 
168 
169 
170 
171 
172 
173 
174 
175       /* alpha = 0.  In this case, just return beta * C */
176       if (alpha_i == 0.0) {
177 	for (i = 0, ci = 0; i < m_i; i++, ci += incci) {
178 	  for (j = 0, cij = ci; j < n_i; j++, cij += inccij) {
179 	    c_elem = c_i[cij];
180 	    tmp1 = c_elem * beta_i;
181 	    c_i[cij] = tmp1;
182 	  }
183 	}
184       } else if (alpha_i == 1.0) {
185 
186 	/* Case alpha == 1. */
187 
188 	if (beta_i == 0.0) {
189 	  /* Case alpha = 1, beta = 0.  We compute  C <--- A * B   or  B * A */
190 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
191 	    for (j = 0, cij = ci, bj = 0; j < n_i;
192 		 j++, cij += inccij, bj += incbj) {
193 
194 	      sum = 0.0;
195 
196 	      for (k = 0, aik = ai, bkj = bj; k < i;
197 		   k++, aik += incaik1, bkj += incbkj) {
198 		a_elem = a_i[aik];
199 		b_elem = b_i[bkj];
200 		prod = a_elem * b_elem;
201 		sum = sum + prod;
202 	      }
203 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
204 		a_elem = a_i[aik];
205 		b_elem = b_i[bkj];
206 		prod = a_elem * b_elem;
207 		sum = sum + prod;
208 	      }
209 	      c_i[cij] = sum;
210 	    }
211 	  }
212 	} else {
213 	  /* Case alpha = 1, but beta != 0.
214 	     We compute  C  <--- A * B + beta * C
215 	     or  C  <--- B * A + beta * C  */
216 
217 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
218 	    for (j = 0, cij = ci, bj = 0; j < n_i;
219 		 j++, cij += inccij, bj += incbj) {
220 
221 	      sum = 0.0;
222 
223 	      for (k = 0, aik = ai, bkj = bj; k < i;
224 		   k++, aik += incaik1, bkj += incbkj) {
225 		a_elem = a_i[aik];
226 		b_elem = b_i[bkj];
227 		prod = a_elem * b_elem;
228 		sum = sum + prod;
229 	      }
230 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
231 		a_elem = a_i[aik];
232 		b_elem = b_i[bkj];
233 		prod = a_elem * b_elem;
234 		sum = sum + prod;
235 	      }
236 	      c_elem = c_i[cij];
237 	      tmp2 = c_elem * beta_i;
238 	      tmp1 = sum;
239 	      tmp1 = tmp2 + tmp1;
240 	      c_i[cij] = tmp1;
241 	    }
242 	  }
243 	}
244 
245       } else {
246 	/* The most general form,   C <--- alpha * A * B + beta * C
247 	   or   C <--- alpha * B * A + beta * C  */
248 
249 	for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
250 	  for (j = 0, cij = ci, bj = 0; j < n_i;
251 	       j++, cij += inccij, bj += incbj) {
252 
253 	    sum = 0.0;
254 
255 	    for (k = 0, aik = ai, bkj = bj; k < i;
256 		 k++, aik += incaik1, bkj += incbkj) {
257 	      a_elem = a_i[aik];
258 	      b_elem = b_i[bkj];
259 	      prod = a_elem * b_elem;
260 	      sum = sum + prod;
261 	    }
262 	    for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
263 	      a_elem = a_i[aik];
264 	      b_elem = b_i[bkj];
265 	      prod = a_elem * b_elem;
266 	      sum = sum + prod;
267 	    }
268 	    tmp1 = sum * alpha_i;
269 	    c_elem = c_i[cij];
270 	    tmp2 = c_elem * beta_i;
271 	    tmp1 = tmp1 + tmp2;
272 	    c_i[cij] = tmp1;
273 	  }
274 	}
275       }
276 
277 
278 
279       break;
280     }
281   case blas_prec_double:
282   case blas_prec_indigenous:{
283 
284       /* Integer Index Variables */
285       int i, j, k;
286 
287       int ai, bj, ci;
288       int aik, bkj, cij;
289 
290       int incai, incbj, incci;
291       int incaik1, incaik2, incbkj, inccij;
292 
293       int m_i, n_i;
294 
295       /* Input Matrices */
296       const float *a_i = a;
297       const float *b_i = b;
298 
299       /* Output Matrix */
300       float *c_i = c;
301 
302       /* Input Scalars */
303       float alpha_i = alpha;
304       float beta_i = beta;
305 
306       /* Temporary Floating-Point Variables */
307       float a_elem;
308       float b_elem;
309       float c_elem;
310       double prod;
311       double sum;
312       double tmp1;
313       double tmp2;
314 
315 
316 
317       /* Check for error conditions. */
318       if (m <= 0 || n <= 0) {
319 	return;
320       }
321 
322       if (order == blas_colmajor && (ldb < m || ldc < m)) {
323 	return;
324       }
325       if (order == blas_rowmajor && (ldb < n || ldc < n)) {
326 	return;
327       }
328       if (side == blas_left_side && lda < m) {
329 	return;
330       }
331       if (side == blas_right_side && lda < n) {
332 	return;
333       }
334 
335       /* Test for no-op */
336       if (alpha_i == 0.0 && beta_i == 1.0) {
337 	return;
338       }
339 
340       /* Set Index Parameters */
341       if (side == blas_left_side) {
342 	m_i = m;
343 	n_i = n;
344       } else {
345 	m_i = n;
346 	n_i = m;
347       }
348 
349       if ((order == blas_colmajor && side == blas_left_side) ||
350 	  (order == blas_rowmajor && side == blas_right_side)) {
351 	incbj = ldb;
352 	incbkj = 1;
353 	incci = 1;
354 	inccij = ldc;
355       } else {
356 	incbj = 1;
357 	incbkj = ldb;
358 	incci = ldc;
359 	inccij = 1;
360       }
361 
362       if ((order == blas_colmajor && uplo == blas_upper) ||
363 	  (order == blas_rowmajor && uplo == blas_lower)) {
364 	incai = lda;
365 	incaik1 = 1;
366 	incaik2 = lda;
367       } else {
368 	incai = 1;
369 	incaik1 = lda;
370 	incaik2 = 1;
371       }
372 
373 
374 
375       /* Adjustment to increments (if any) */
376 
377 
378 
379 
380 
381 
382 
383 
384       /* alpha = 0.  In this case, just return beta * C */
385       if (alpha_i == 0.0) {
386 	for (i = 0, ci = 0; i < m_i; i++, ci += incci) {
387 	  for (j = 0, cij = ci; j < n_i; j++, cij += inccij) {
388 	    c_elem = c_i[cij];
389 	    tmp1 = (double) c_elem *beta_i;
390 	    c_i[cij] = tmp1;
391 	  }
392 	}
393       } else if (alpha_i == 1.0) {
394 
395 	/* Case alpha == 1. */
396 
397 	if (beta_i == 0.0) {
398 	  /* Case alpha = 1, beta = 0.  We compute  C <--- A * B   or  B * A */
399 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
400 	    for (j = 0, cij = ci, bj = 0; j < n_i;
401 		 j++, cij += inccij, bj += incbj) {
402 
403 	      sum = 0.0;
404 
405 	      for (k = 0, aik = ai, bkj = bj; k < i;
406 		   k++, aik += incaik1, bkj += incbkj) {
407 		a_elem = a_i[aik];
408 		b_elem = b_i[bkj];
409 		prod = (double) a_elem *b_elem;
410 		sum = sum + prod;
411 	      }
412 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
413 		a_elem = a_i[aik];
414 		b_elem = b_i[bkj];
415 		prod = (double) a_elem *b_elem;
416 		sum = sum + prod;
417 	      }
418 	      c_i[cij] = sum;
419 	    }
420 	  }
421 	} else {
422 	  /* Case alpha = 1, but beta != 0.
423 	     We compute  C  <--- A * B + beta * C
424 	     or  C  <--- B * A + beta * C  */
425 
426 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
427 	    for (j = 0, cij = ci, bj = 0; j < n_i;
428 		 j++, cij += inccij, bj += incbj) {
429 
430 	      sum = 0.0;
431 
432 	      for (k = 0, aik = ai, bkj = bj; k < i;
433 		   k++, aik += incaik1, bkj += incbkj) {
434 		a_elem = a_i[aik];
435 		b_elem = b_i[bkj];
436 		prod = (double) a_elem *b_elem;
437 		sum = sum + prod;
438 	      }
439 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
440 		a_elem = a_i[aik];
441 		b_elem = b_i[bkj];
442 		prod = (double) a_elem *b_elem;
443 		sum = sum + prod;
444 	      }
445 	      c_elem = c_i[cij];
446 	      tmp2 = (double) c_elem *beta_i;
447 	      tmp1 = sum;
448 	      tmp1 = tmp2 + tmp1;
449 	      c_i[cij] = tmp1;
450 	    }
451 	  }
452 	}
453 
454       } else {
455 	/* The most general form,   C <--- alpha * A * B + beta * C
456 	   or   C <--- alpha * B * A + beta * C  */
457 
458 	for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
459 	  for (j = 0, cij = ci, bj = 0; j < n_i;
460 	       j++, cij += inccij, bj += incbj) {
461 
462 	    sum = 0.0;
463 
464 	    for (k = 0, aik = ai, bkj = bj; k < i;
465 		 k++, aik += incaik1, bkj += incbkj) {
466 	      a_elem = a_i[aik];
467 	      b_elem = b_i[bkj];
468 	      prod = (double) a_elem *b_elem;
469 	      sum = sum + prod;
470 	    }
471 	    for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
472 	      a_elem = a_i[aik];
473 	      b_elem = b_i[bkj];
474 	      prod = (double) a_elem *b_elem;
475 	      sum = sum + prod;
476 	    }
477 	    tmp1 = sum * alpha_i;
478 	    c_elem = c_i[cij];
479 	    tmp2 = (double) c_elem *beta_i;
480 	    tmp1 = tmp1 + tmp2;
481 	    c_i[cij] = tmp1;
482 	  }
483 	}
484       }
485 
486 
487 
488       break;
489     }
490 
491   case blas_prec_extra:{
492 
493       /* Integer Index Variables */
494       int i, j, k;
495 
496       int ai, bj, ci;
497       int aik, bkj, cij;
498 
499       int incai, incbj, incci;
500       int incaik1, incaik2, incbkj, inccij;
501 
502       int m_i, n_i;
503 
504       /* Input Matrices */
505       const float *a_i = a;
506       const float *b_i = b;
507 
508       /* Output Matrix */
509       float *c_i = c;
510 
511       /* Input Scalars */
512       float alpha_i = alpha;
513       float beta_i = beta;
514 
515       /* Temporary Floating-Point Variables */
516       float a_elem;
517       float b_elem;
518       float c_elem;
519       double head_prod, tail_prod;
520       double head_sum, tail_sum;
521       double head_tmp1, tail_tmp1;
522       double head_tmp2, tail_tmp2;
523 
524       FPU_FIX_DECL;
525 
526       /* Check for error conditions. */
527       if (m <= 0 || n <= 0) {
528 	return;
529       }
530 
531       if (order == blas_colmajor && (ldb < m || ldc < m)) {
532 	return;
533       }
534       if (order == blas_rowmajor && (ldb < n || ldc < n)) {
535 	return;
536       }
537       if (side == blas_left_side && lda < m) {
538 	return;
539       }
540       if (side == blas_right_side && lda < n) {
541 	return;
542       }
543 
544       /* Test for no-op */
545       if (alpha_i == 0.0 && beta_i == 1.0) {
546 	return;
547       }
548 
549       /* Set Index Parameters */
550       if (side == blas_left_side) {
551 	m_i = m;
552 	n_i = n;
553       } else {
554 	m_i = n;
555 	n_i = m;
556       }
557 
558       if ((order == blas_colmajor && side == blas_left_side) ||
559 	  (order == blas_rowmajor && side == blas_right_side)) {
560 	incbj = ldb;
561 	incbkj = 1;
562 	incci = 1;
563 	inccij = ldc;
564       } else {
565 	incbj = 1;
566 	incbkj = ldb;
567 	incci = ldc;
568 	inccij = 1;
569       }
570 
571       if ((order == blas_colmajor && uplo == blas_upper) ||
572 	  (order == blas_rowmajor && uplo == blas_lower)) {
573 	incai = lda;
574 	incaik1 = 1;
575 	incaik2 = lda;
576       } else {
577 	incai = 1;
578 	incaik1 = lda;
579 	incaik2 = 1;
580       }
581 
582       FPU_FIX_START;
583 
584       /* Adjustment to increments (if any) */
585 
586 
587 
588 
589 
590 
591 
592 
593       /* alpha = 0.  In this case, just return beta * C */
594       if (alpha_i == 0.0) {
595 	for (i = 0, ci = 0; i < m_i; i++, ci += incci) {
596 	  for (j = 0, cij = ci; j < n_i; j++, cij += inccij) {
597 	    c_elem = c_i[cij];
598 	    head_tmp1 = (double) c_elem *beta_i;
599 	    tail_tmp1 = 0.0;
600 	    c_i[cij] = head_tmp1;
601 	  }
602 	}
603       } else if (alpha_i == 1.0) {
604 
605 	/* Case alpha == 1. */
606 
607 	if (beta_i == 0.0) {
608 	  /* Case alpha = 1, beta = 0.  We compute  C <--- A * B   or  B * A */
609 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
610 	    for (j = 0, cij = ci, bj = 0; j < n_i;
611 		 j++, cij += inccij, bj += incbj) {
612 
613 	      head_sum = tail_sum = 0.0;
614 
615 	      for (k = 0, aik = ai, bkj = bj; k < i;
616 		   k++, aik += incaik1, bkj += incbkj) {
617 		a_elem = a_i[aik];
618 		b_elem = b_i[bkj];
619 		head_prod = (double) a_elem *b_elem;
620 		tail_prod = 0.0;
621 		{
622 		  /* Compute double-double = double-double + double-double. */
623 		  double bv;
624 		  double s1, s2, t1, t2;
625 
626 		  /* Add two hi words. */
627 		  s1 = head_sum + head_prod;
628 		  bv = s1 - head_sum;
629 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
630 
631 		  /* Add two lo words. */
632 		  t1 = tail_sum + tail_prod;
633 		  bv = t1 - tail_sum;
634 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
635 
636 		  s2 += t1;
637 
638 		  /* Renormalize (s1, s2)  to  (t1, s2) */
639 		  t1 = s1 + s2;
640 		  s2 = s2 - (t1 - s1);
641 
642 		  t2 += s2;
643 
644 		  /* Renormalize (t1, t2)  */
645 		  head_sum = t1 + t2;
646 		  tail_sum = t2 - (head_sum - t1);
647 		}
648 	      }
649 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
650 		a_elem = a_i[aik];
651 		b_elem = b_i[bkj];
652 		head_prod = (double) a_elem *b_elem;
653 		tail_prod = 0.0;
654 		{
655 		  /* Compute double-double = double-double + double-double. */
656 		  double bv;
657 		  double s1, s2, t1, t2;
658 
659 		  /* Add two hi words. */
660 		  s1 = head_sum + head_prod;
661 		  bv = s1 - head_sum;
662 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
663 
664 		  /* Add two lo words. */
665 		  t1 = tail_sum + tail_prod;
666 		  bv = t1 - tail_sum;
667 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
668 
669 		  s2 += t1;
670 
671 		  /* Renormalize (s1, s2)  to  (t1, s2) */
672 		  t1 = s1 + s2;
673 		  s2 = s2 - (t1 - s1);
674 
675 		  t2 += s2;
676 
677 		  /* Renormalize (t1, t2)  */
678 		  head_sum = t1 + t2;
679 		  tail_sum = t2 - (head_sum - t1);
680 		}
681 	      }
682 	      c_i[cij] = head_sum;
683 	    }
684 	  }
685 	} else {
686 	  /* Case alpha = 1, but beta != 0.
687 	     We compute  C  <--- A * B + beta * C
688 	     or  C  <--- B * A + beta * C  */
689 
690 	  for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
691 	    for (j = 0, cij = ci, bj = 0; j < n_i;
692 		 j++, cij += inccij, bj += incbj) {
693 
694 	      head_sum = tail_sum = 0.0;
695 
696 	      for (k = 0, aik = ai, bkj = bj; k < i;
697 		   k++, aik += incaik1, bkj += incbkj) {
698 		a_elem = a_i[aik];
699 		b_elem = b_i[bkj];
700 		head_prod = (double) a_elem *b_elem;
701 		tail_prod = 0.0;
702 		{
703 		  /* Compute double-double = double-double + double-double. */
704 		  double bv;
705 		  double s1, s2, t1, t2;
706 
707 		  /* Add two hi words. */
708 		  s1 = head_sum + head_prod;
709 		  bv = s1 - head_sum;
710 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
711 
712 		  /* Add two lo words. */
713 		  t1 = tail_sum + tail_prod;
714 		  bv = t1 - tail_sum;
715 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
716 
717 		  s2 += t1;
718 
719 		  /* Renormalize (s1, s2)  to  (t1, s2) */
720 		  t1 = s1 + s2;
721 		  s2 = s2 - (t1 - s1);
722 
723 		  t2 += s2;
724 
725 		  /* Renormalize (t1, t2)  */
726 		  head_sum = t1 + t2;
727 		  tail_sum = t2 - (head_sum - t1);
728 		}
729 	      }
730 	      for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
731 		a_elem = a_i[aik];
732 		b_elem = b_i[bkj];
733 		head_prod = (double) a_elem *b_elem;
734 		tail_prod = 0.0;
735 		{
736 		  /* Compute double-double = double-double + double-double. */
737 		  double bv;
738 		  double s1, s2, t1, t2;
739 
740 		  /* Add two hi words. */
741 		  s1 = head_sum + head_prod;
742 		  bv = s1 - head_sum;
743 		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
744 
745 		  /* Add two lo words. */
746 		  t1 = tail_sum + tail_prod;
747 		  bv = t1 - tail_sum;
748 		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
749 
750 		  s2 += t1;
751 
752 		  /* Renormalize (s1, s2)  to  (t1, s2) */
753 		  t1 = s1 + s2;
754 		  s2 = s2 - (t1 - s1);
755 
756 		  t2 += s2;
757 
758 		  /* Renormalize (t1, t2)  */
759 		  head_sum = t1 + t2;
760 		  tail_sum = t2 - (head_sum - t1);
761 		}
762 	      }
763 	      c_elem = c_i[cij];
764 	      head_tmp2 = (double) c_elem *beta_i;
765 	      tail_tmp2 = 0.0;
766 	      head_tmp1 = head_sum;
767 	      tail_tmp1 = tail_sum;
768 	      {
769 		/* Compute double-double = double-double + double-double. */
770 		double bv;
771 		double s1, s2, t1, t2;
772 
773 		/* Add two hi words. */
774 		s1 = head_tmp2 + head_tmp1;
775 		bv = s1 - head_tmp2;
776 		s2 = ((head_tmp1 - bv) + (head_tmp2 - (s1 - bv)));
777 
778 		/* Add two lo words. */
779 		t1 = tail_tmp2 + tail_tmp1;
780 		bv = t1 - tail_tmp2;
781 		t2 = ((tail_tmp1 - bv) + (tail_tmp2 - (t1 - bv)));
782 
783 		s2 += t1;
784 
785 		/* Renormalize (s1, s2)  to  (t1, s2) */
786 		t1 = s1 + s2;
787 		s2 = s2 - (t1 - s1);
788 
789 		t2 += s2;
790 
791 		/* Renormalize (t1, t2)  */
792 		head_tmp1 = t1 + t2;
793 		tail_tmp1 = t2 - (head_tmp1 - t1);
794 	      }
795 	      c_i[cij] = head_tmp1;
796 	    }
797 	  }
798 	}
799 
800       } else {
801 	/* The most general form,   C <--- alpha * A * B + beta * C
802 	   or   C <--- alpha * B * A + beta * C  */
803 
804 	for (i = 0, ci = 0, ai = 0; i < m_i; i++, ci += incci, ai += incai) {
805 	  for (j = 0, cij = ci, bj = 0; j < n_i;
806 	       j++, cij += inccij, bj += incbj) {
807 
808 	    head_sum = tail_sum = 0.0;
809 
810 	    for (k = 0, aik = ai, bkj = bj; k < i;
811 		 k++, aik += incaik1, bkj += incbkj) {
812 	      a_elem = a_i[aik];
813 	      b_elem = b_i[bkj];
814 	      head_prod = (double) a_elem *b_elem;
815 	      tail_prod = 0.0;
816 	      {
817 		/* Compute double-double = double-double + double-double. */
818 		double bv;
819 		double s1, s2, t1, t2;
820 
821 		/* Add two hi words. */
822 		s1 = head_sum + head_prod;
823 		bv = s1 - head_sum;
824 		s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
825 
826 		/* Add two lo words. */
827 		t1 = tail_sum + tail_prod;
828 		bv = t1 - tail_sum;
829 		t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
830 
831 		s2 += t1;
832 
833 		/* Renormalize (s1, s2)  to  (t1, s2) */
834 		t1 = s1 + s2;
835 		s2 = s2 - (t1 - s1);
836 
837 		t2 += s2;
838 
839 		/* Renormalize (t1, t2)  */
840 		head_sum = t1 + t2;
841 		tail_sum = t2 - (head_sum - t1);
842 	      }
843 	    }
844 	    for (; k < m_i; k++, aik += incaik2, bkj += incbkj) {
845 	      a_elem = a_i[aik];
846 	      b_elem = b_i[bkj];
847 	      head_prod = (double) a_elem *b_elem;
848 	      tail_prod = 0.0;
849 	      {
850 		/* Compute double-double = double-double + double-double. */
851 		double bv;
852 		double s1, s2, t1, t2;
853 
854 		/* Add two hi words. */
855 		s1 = head_sum + head_prod;
856 		bv = s1 - head_sum;
857 		s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));
858 
859 		/* Add two lo words. */
860 		t1 = tail_sum + tail_prod;
861 		bv = t1 - tail_sum;
862 		t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));
863 
864 		s2 += t1;
865 
866 		/* Renormalize (s1, s2)  to  (t1, s2) */
867 		t1 = s1 + s2;
868 		s2 = s2 - (t1 - s1);
869 
870 		t2 += s2;
871 
872 		/* Renormalize (t1, t2)  */
873 		head_sum = t1 + t2;
874 		tail_sum = t2 - (head_sum - t1);
875 	      }
876 	    }
877 	    {
878 	      double dt = (double) alpha_i;
879 	      {
880 		/* Compute double-double = double-double * double. */
881 		double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
882 
883 		con = head_sum * split;
884 		a11 = con - head_sum;
885 		a11 = con - a11;
886 		a21 = head_sum - a11;
887 		con = dt * split;
888 		b1 = con - dt;
889 		b1 = con - b1;
890 		b2 = dt - b1;
891 
892 		c11 = head_sum * dt;
893 		c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
894 
895 		c2 = tail_sum * dt;
896 		t1 = c11 + c2;
897 		t2 = (c2 - (t1 - c11)) + c21;
898 
899 		head_tmp1 = t1 + t2;
900 		tail_tmp1 = t2 - (head_tmp1 - t1);
901 	      }
902 	    }
903 	    c_elem = c_i[cij];
904 	    head_tmp2 = (double) c_elem *beta_i;
905 	    tail_tmp2 = 0.0;
906 	    {
907 	      /* Compute double-double = double-double + double-double. */
908 	      double bv;
909 	      double s1, s2, t1, t2;
910 
911 	      /* Add two hi words. */
912 	      s1 = head_tmp1 + head_tmp2;
913 	      bv = s1 - head_tmp1;
914 	      s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
915 
916 	      /* Add two lo words. */
917 	      t1 = tail_tmp1 + tail_tmp2;
918 	      bv = t1 - tail_tmp1;
919 	      t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
920 
921 	      s2 += t1;
922 
923 	      /* Renormalize (s1, s2)  to  (t1, s2) */
924 	      t1 = s1 + s2;
925 	      s2 = s2 - (t1 - s1);
926 
927 	      t2 += s2;
928 
929 	      /* Renormalize (t1, t2)  */
930 	      head_tmp1 = t1 + t2;
931 	      tail_tmp1 = t2 - (head_tmp1 - t1);
932 	    }
933 	    c_i[cij] = head_tmp1;
934 	  }
935 	}
936       }
937 
938       FPU_FIX_STOP;
939 
940       break;
941     }
942   }
943 }
944