1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
25 #ifdef UNIT
26 #define TRTI2 TRTI2_LU
27 #define TRMM TRMM_LNLU
28 #define TRSM TRSM_RNLU
29 #else
30 #define TRTI2 TRTI2_LN
31 #define TRMM TRMM_LNLN
32 #define TRSM TRSM_RNLN
33 #endif
34
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)35 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
36
37 BLASLONG n, info;
38 BLASLONG bk, i, blocking, start_i;
39 int mode;
40 BLASLONG lda, range_N[2];
41 blas_arg_t newarg;
42 FLOAT *a;
43 FLOAT alpha[2] = { ONE, ZERO};
44 FLOAT beta [2] = {-ONE, ZERO};
45
46 #ifndef COMPLEX
47 #ifdef XDOUBLE
48 mode = BLAS_XDOUBLE | BLAS_REAL;
49 #elif defined(DOUBLE)
50 mode = BLAS_DOUBLE | BLAS_REAL;
51 #else
52 mode = BLAS_SINGLE | BLAS_REAL;
53 #endif
54 #else
55 #ifdef XDOUBLE
56 mode = BLAS_XDOUBLE | BLAS_COMPLEX;
57 #elif defined(DOUBLE)
58 mode = BLAS_DOUBLE | BLAS_COMPLEX;
59 #else
60 mode = BLAS_SINGLE | BLAS_COMPLEX;
61 #endif
62 #endif
63
64 n = args -> n;
65 a = (FLOAT *)args -> a;
66 lda = args -> lda;
67
68 if (range_n) n = range_n[1] - range_n[0];
69
70 if (n <= DTB_ENTRIES) {
71 info = TRTI2(args, NULL, range_n, sa, sb, 0);
72 return info;
73 }
74
75 blocking = GEMM_Q;
76 if (n < 4 * GEMM_Q) blocking = (n + 3) / 4;
77
78 start_i = 0;
79 while (start_i < n) start_i += blocking;
80 start_i -= blocking;
81
82 for (i = start_i; i >= 0; i -= blocking) {
83 bk = n - i;
84 if (bk > blocking) bk = blocking;
85
86 range_N[0] = i;
87 range_N[1] = i + bk;
88
89 newarg.lda = lda;
90 newarg.ldb = lda;
91 newarg.ldc = lda;
92 newarg.alpha = alpha;
93
94 newarg.m = n - bk - i;
95 newarg.n = bk;
96 newarg.a = a + ( i + i * lda) * COMPSIZE;
97 newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
98
99 newarg.beta = beta;
100 newarg.nthreads = args -> nthreads;
101
102 gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads);
103
104 newarg.m = bk;
105 newarg.n = bk;
106
107 newarg.a = a + (i + i * lda) * COMPSIZE;
108
109 CNAME (&newarg, NULL, NULL, sa, sb, 0);
110
111 newarg.m = n - bk - i;
112 newarg.n = i;
113 newarg.k = bk;
114
115 newarg.a = a + (i + bk + i * lda) * COMPSIZE;
116 newarg.b = a + (i ) * COMPSIZE;
117 newarg.c = a + (i + bk ) * COMPSIZE;
118
119 newarg.beta = NULL;
120
121 gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads);
122
123 newarg.a = a + (i + i * lda) * COMPSIZE;
124 newarg.b = a + (i ) * COMPSIZE;
125
126 newarg.m = bk;
127 newarg.n = i;
128
129 gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads);
130 }
131
132
133 return 0;
134 }
135