1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 #ifndef CONJ
43 #define GEMM_KERNEL GEMM_KERNEL_R
44 #define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0
45 #else
46 #define GEMM_KERNEL GEMM_KERNEL_L
47 #define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0
48 #endif
49
CNAME(BLASLONG m,BLASLONG n,BLASLONG k,FLOAT alpha_r,FLOAT alpha_i,FLOAT * a,FLOAT * b,FLOAT * c,BLASLONG ldc,BLASLONG offset,int flag)50 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
51 FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){
52
53 BLASLONG i, j;
54 BLASLONG loop;
55 FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE];
56
57 if (m + offset < 0) {
58 #ifndef LOWER
59 GEMM_KERNEL(m, n, k,
60 alpha_r,
61 #ifdef COMPLEX
62 alpha_i,
63 #endif
64 a, b, c, ldc);
65 #endif
66 return 0;
67 }
68
69 if (n < offset) {
70 #ifdef LOWER
71 GEMM_KERNEL(m, n, k,
72 alpha_r,
73 #ifdef COMPLEX
74 alpha_i,
75 #endif
76 a, b, c, ldc);
77 #endif
78 return 0;
79 }
80
81
82 if (offset > 0) {
83 #ifdef LOWER
84 GEMM_KERNEL(m, offset, k,
85 alpha_r,
86 #ifdef COMPLEX
87 alpha_i,
88 #endif
89 a, b, c, ldc);
90 #endif
91 b += offset * k * COMPSIZE;
92 c += offset * ldc * COMPSIZE;
93 n -= offset;
94 offset = 0;
95
96 if (n <= 0) return 0;
97 }
98
99 if (n > m + offset) {
100 #ifndef LOWER
101 GEMM_KERNEL(m, n - m - offset, k,
102 alpha_r,
103 #ifdef COMPLEX
104 alpha_i,
105 #endif
106 a,
107 b + (m + offset) * k * COMPSIZE,
108 c + (m + offset) * ldc * COMPSIZE, ldc);
109 #endif
110
111 n = m + offset;
112 if (n <= 0) return 0;
113 }
114
115
116 if (offset < 0) {
117 #ifndef LOWER
118 GEMM_KERNEL(-offset, n, k,
119 alpha_r,
120 #ifdef COMPLEX
121 alpha_i,
122 #endif
123 a, b, c, ldc);
124 #endif
125 a -= offset * k * COMPSIZE;
126 c -= offset * COMPSIZE;
127 m += offset;
128 offset = 0;
129
130 if (m <= 0) return 0;
131 }
132
133 if (m > n - offset) {
134 #ifdef LOWER
135 GEMM_KERNEL(m - n + offset, n, k,
136 alpha_r,
137 #ifdef COMPLEX
138 alpha_i,
139 #endif
140 a + (n - offset) * k * COMPSIZE,
141 b,
142 c + (n - offset) * COMPSIZE, ldc);
143 #endif
144 m = n + offset;
145 if (m <= 0) return 0;
146 }
147
148 for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {
149
150 int mm, nn;
151
152 mm = (loop & ~(GEMM_UNROLL_MN - 1));
153 nn = MIN(GEMM_UNROLL_MN, n - loop);
154
155 #ifndef LOWER
156 GEMM_KERNEL(mm, nn, k,
157 alpha_r,
158 #ifdef COMPLEX
159 alpha_i,
160 #endif
161 a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
162 #endif
163
164 if (flag) {
165 GEMM_BETA(nn, nn, 0, ZERO,
166 #ifdef COMPLEX
167 ZERO,
168 #endif
169 NULL, 0, NULL, 0, subbuffer, nn);
170
171 GEMM_KERNEL(nn, nn, k,
172 alpha_r,
173 #ifdef COMPLEX
174 alpha_i,
175 #endif
176 a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);
177
178
179 #ifndef LOWER
180
181 for (j = 0; j < nn; j ++) {
182 for (i = 0; i <= j; i ++) {
183 c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
184 subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
185 if (i != j) {
186 c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
187 subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
188 } else {
189 c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO;
190 }
191 }
192 }
193 #else
194 for (j = 0; j < nn; j ++) {
195 for (i = j; i < nn; i ++) {
196 c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
197 subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
198 if (i != j) {
199 c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
200 subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
201 } else {
202 c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO;
203 }
204 }
205 }
206 #endif
207 }
208
209 #ifdef LOWER
210 GEMM_KERNEL(m - mm - nn, nn, k,
211 alpha_r,
212 #ifdef COMPLEX
213 alpha_i,
214 #endif
215 a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
216 c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
217 #endif
218 }
219
220 return 0;
221 }
222