1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include "common.h"
42
syr_kernel(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * dummy1,FLOAT * buffer,BLASLONG pos)43 static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
44
45 FLOAT *a, *x;
46 BLASLONG lda, incx;
47 BLASLONG i, m_from, m_to;
48 FLOAT alpha_r;
49 #if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
50 FLOAT alpha_i;
51 #endif
52
53
54 x = (FLOAT *)args -> a;
55 a = (FLOAT *)args -> b;
56
57 incx = args -> lda;
58 lda = args -> ldb;
59
60 alpha_r = *((FLOAT *)args -> alpha + 0);
61 #if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
62 alpha_i = *((FLOAT *)args -> alpha + 1);
63 #endif
64
65 m_from = 0;
66 m_to = args -> m;
67
68 if (range_m) {
69 m_from = *(range_m + 0);
70 m_to = *(range_m + 1);
71 }
72
73 if (incx != 1) {
74 #ifndef LOWER
75 COPY_K(m_to, x, incx, buffer, 1);
76 #else
77 COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
78 #endif
79
80 x = buffer;
81 }
82
83 a += m_from * lda * COMPSIZE;
84
85 for (i = m_from; i < m_to; i++){
86 #if !defined(HER) && !defined(HERREV)
87 #ifndef COMPLEX
88 if (x[i * COMPSIZE] != ZERO) {
89 #ifndef LOWER
90 AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0);
91 #else
92 AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a + i, 1, NULL, 0);
93 #endif
94 }
95 #else
96 if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
97 #ifndef LOWER
98 AXPYU_K(i + 1, 0, 0,
99 alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
100 alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
101 x, 1, a, 1, NULL, 0);
102 #else
103 AXPYU_K(args -> m - i, 0, 0,
104 alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
105 alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
106 x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
107 #endif
108 }
109 #endif
110 #else
111 if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
112 #ifndef HERREV
113 #ifndef LOWER
114 AXPYU_K(i + 1, 0, 0,
115 alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1],
116 x, 1, a, 1, NULL, 0);
117 #else
118 AXPYU_K(args -> m - i, 0, 0,
119 alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1],
120 x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
121 #endif
122 #else
123 #ifndef LOWER
124 AXPYC_K(i + 1, 0, 0,
125 alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1],
126 x, 1, a, 1, NULL, 0);
127 #else
128 AXPYC_K(args -> m - i, 0, 0,
129 alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1],
130 x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
131 #endif
132 #endif
133
134 }
135 a[i * COMPSIZE + 1] = ZERO;
136 #endif
137 a += lda * COMPSIZE;
138
139 }
140
141 return 0;
142 }
143
144 #if !defined(COMPLEX) || defined(HER) || defined(HERREV)
CNAME(BLASLONG m,FLOAT alpha,FLOAT * x,BLASLONG incx,FLOAT * a,BLASLONG lda,FLOAT * buffer,int nthreads)145 int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){
146 #else
147 int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){
148 #endif
149
150 blas_arg_t args;
151 blas_queue_t queue[MAX_CPU_NUMBER];
152 BLASLONG range_m[MAX_CPU_NUMBER + 1];
153
154 BLASLONG width, i, num_cpu;
155
156 double dnum;
157 int mask = 7;
158
159 #ifdef SMP
160 #ifndef COMPLEX
161 #ifdef XDOUBLE
162 int mode = BLAS_XDOUBLE | BLAS_REAL;
163 #elif defined(DOUBLE)
164 int mode = BLAS_DOUBLE | BLAS_REAL;
165 #else
166 int mode = BLAS_SINGLE | BLAS_REAL;
167 #endif
168 #else
169 #ifdef XDOUBLE
170 int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
171 #elif defined(DOUBLE)
172 int mode = BLAS_DOUBLE | BLAS_COMPLEX;
173 #else
174 int mode = BLAS_SINGLE | BLAS_COMPLEX;
175 #endif
176 #endif
177 #endif
178
179 args.m = m;
180
181 args.a = (void *)x;
182 args.b = (void *)a;
183
184 args.lda = incx;
185 args.ldb = lda;
186 #if !defined(COMPLEX) || defined(HER) || defined(HERREV)
187 args.alpha = (void *)α
188 #else
189 args.alpha = (void *)alpha;
190 #endif
191
192 dnum = (double)m * (double)m / (double)nthreads;
193 num_cpu = 0;
194
195 #ifndef LOWER
196
197 range_m[MAX_CPU_NUMBER] = m;
198 i = 0;
199
200 while (i < m){
201
202 if (nthreads - num_cpu > 1) {
203
204 double di = (double)(m - i);
205 if (di * di - dnum > 0) {
206 width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
207 } else {
208 width = m - i;
209 }
210
211 if (width < 16) width = 16;
212 if (width > m - i) width = m - i;
213
214 } else {
215 width = m - i;
216 }
217
218 range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
219
220 queue[num_cpu].mode = mode;
221 queue[num_cpu].routine = syr_kernel;
222 queue[num_cpu].args = &args;
223 queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
224 queue[num_cpu].range_n = NULL;
225 queue[num_cpu].sa = NULL;
226 queue[num_cpu].sb = NULL;
227 queue[num_cpu].next = &queue[num_cpu + 1];
228
229 num_cpu ++;
230 i += width;
231 }
232
233 #else
234
235 range_m[0] = 0;
236 i = 0;
237
238 while (i < m){
239
240 if (nthreads - num_cpu > 1) {
241
242 double di = (double)(m - i);
243 if (di * di - dnum > 0) {
244 width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
245 } else {
246 width = m - i;
247 }
248
249 if (width < 16) width = 16;
250 if (width > m - i) width = m - i;
251
252 } else {
253 width = m - i;
254 }
255
256 range_m[num_cpu + 1] = range_m[num_cpu] + width;
257
258 queue[num_cpu].mode = mode;
259 queue[num_cpu].routine = syr_kernel;
260 queue[num_cpu].args = &args;
261 queue[num_cpu].range_m = &range_m[num_cpu];
262 queue[num_cpu].range_n = NULL;
263 queue[num_cpu].sa = NULL;
264 queue[num_cpu].sb = NULL;
265 queue[num_cpu].next = &queue[num_cpu + 1];
266
267 num_cpu ++;
268 i += width;
269 }
270
271 #endif
272
273 if (num_cpu) {
274 queue[0].sa = NULL;
275 queue[0].sb = buffer;
276
277 queue[num_cpu - 1].next = NULL;
278
279 exec_blas(num_cpu, queue);
280 }
281
282 return 0;
283 }
284