1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 #ifndef CONJA
43 #ifndef CONJB
44 #define GEMM_KERNEL GEMM_KERNEL_N
45 #else
46 #define GEMM_KERNEL GEMM_KERNEL_R
47 #endif
48 #else
49 #ifndef CONJB
50 #define GEMM_KERNEL GEMM_KERNEL_L
51 #else
52 #define GEMM_KERNEL GEMM_KERNEL_B
53 #endif
54 #endif
55
CNAME(BLASLONG m,BLASLONG n,BLASLONG k,FLOAT alpha_r,FLOAT alpha_i,FLOAT * a,FLOAT * b,FLOAT * c,BLASLONG ldc,BLASLONG offset)56 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
57 #ifdef COMPLEX
58 FLOAT alpha_i,
59 #endif
60 FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
61
62 BLASLONG i, j;
63 BLASLONG loop;
64 FLOAT *cc, *ss;
65 FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE];
66
67 if (m + offset < 0) {
68 #ifndef LOWER
69 GEMM_KERNEL(m, n, k,
70 alpha_r,
71 #ifdef COMPLEX
72 alpha_i,
73 #endif
74 a, b, c, ldc);
75 #endif
76 return 0;
77 }
78
79 if (n < offset) {
80 #ifdef LOWER
81 GEMM_KERNEL(m, n, k,
82 alpha_r,
83 #ifdef COMPLEX
84 alpha_i,
85 #endif
86 a, b, c, ldc);
87 #endif
88 return 0;
89 }
90
91 if (offset > 0) {
92 #ifdef LOWER
93 GEMM_KERNEL(m, offset, k,
94 alpha_r,
95 #ifdef COMPLEX
96 alpha_i,
97 #endif
98 a, b, c, ldc);
99 #endif
100 b += offset * k * COMPSIZE;
101 c += offset * ldc * COMPSIZE;
102 n -= offset;
103 offset = 0;
104
105 if (n <= 0) return 0;
106 }
107
108 if (n > m + offset) {
109 #ifndef LOWER
110 GEMM_KERNEL(m, n - m - offset, k,
111 alpha_r,
112 #ifdef COMPLEX
113 alpha_i,
114 #endif
115 a,
116 b + (m + offset) * k * COMPSIZE,
117 c + (m + offset) * ldc * COMPSIZE, ldc);
118 #endif
119
120 n = m + offset;
121 if (n <= 0) return 0;
122 }
123
124 if (offset < 0) {
125 #ifndef LOWER
126 GEMM_KERNEL(-offset, n, k,
127 alpha_r,
128 #ifdef COMPLEX
129 alpha_i,
130 #endif
131 a, b, c, ldc);
132 #endif
133 a -= offset * k * COMPSIZE;
134 c -= offset * COMPSIZE;
135 m += offset;
136 offset = 0;
137
138 if (m <= 0) return 0;
139 }
140
141 if (m > n - offset) {
142 #ifdef LOWER
143 GEMM_KERNEL(m - n + offset, n, k,
144 alpha_r,
145 #ifdef COMPLEX
146 alpha_i,
147 #endif
148 a + (n - offset) * k * COMPSIZE,
149 b,
150 c + (n - offset) * COMPSIZE, ldc);
151 #endif
152 m = n + offset;
153
154 if (m <= 0) return 0;
155 }
156
157 for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
158
159 int mm, nn;
160
161 mm = (loop & ~(GEMM_UNROLL_MN - 1));
162 nn = MIN(GEMM_UNROLL_MN, n - loop);
163
164 #ifndef LOWER
165 GEMM_KERNEL(mm, nn, k,
166 alpha_r,
167 #ifdef COMPLEX
168 alpha_i,
169 #endif
170 a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
171 #endif
172
173 GEMM_BETA(nn, nn, 0, ZERO,
174 #ifdef COMPLEX
175 ZERO,
176 #endif
177 NULL, 0, NULL, 0, subbuffer, nn);
178
179 GEMM_KERNEL(nn, nn, k,
180 alpha_r,
181 #ifdef COMPLEX
182 alpha_i,
183 #endif
184 a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
185
186 cc = c + (loop + loop * ldc) * COMPSIZE;
187 ss = subbuffer;
188
189 #ifndef LOWER
190 for (j = 0; j < nn; j ++) {
191 for (i = 0; i <= j; i ++) {
192 #ifndef COMPLEX
193 cc[i] += ss[i];
194 #else
195 cc[i * 2 + 0] += ss[i * 2 + 0];
196 cc[i * 2 + 1] += ss[i * 2 + 1];
197 #endif
198 }
199 ss += nn * COMPSIZE;
200 cc += ldc * COMPSIZE;
201 }
202 #else
203 for (j = 0; j < nn; j ++) {
204 for (i = j; i < nn; i ++) {
205 #ifndef COMPLEX
206 cc[i] += ss[i];
207 #else
208 cc[i * 2 + 0] += ss[i * 2 + 0];
209 cc[i * 2 + 1] += ss[i * 2 + 1];
210 #endif
211 }
212 ss += nn * COMPSIZE;
213 cc += ldc * COMPSIZE;
214 }
215 #endif
216
217 #ifdef LOWER
218 GEMM_KERNEL(m - mm - nn, nn, k,
219 alpha_r,
220 #ifdef COMPLEX
221 alpha_i,
222 #endif
223 a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
224 c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
225 #endif
226
227 }
228
229 return 0;
230 }
231