1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include "common.h"
42 
syr_kernel(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * dummy1,FLOAT * buffer,BLASLONG pos)43 static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
44 
45   FLOAT *a, *x;
46   BLASLONG lda, incx;
47   BLASLONG i, m_from, m_to;
48   FLOAT alpha_r;
49 #if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
50   FLOAT alpha_i;
51 #endif
52 
53 
54   x = (FLOAT *)args -> a;
55   a = (FLOAT *)args -> b;
56 
57   incx = args -> lda;
58   lda  = args -> ldb;
59 
60   alpha_r = *((FLOAT *)args -> alpha + 0);
61 #if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
62   alpha_i = *((FLOAT *)args -> alpha + 1);
63 #endif
64 
65   m_from = 0;
66   m_to   = args -> m;
67 
68   if (range_m) {
69     m_from = *(range_m + 0);
70     m_to   = *(range_m + 1);
71   }
72 
73   if (incx != 1) {
74 #ifndef LOWER
75     COPY_K(m_to, x, incx, buffer, 1);
76 #else
77     COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
78 #endif
79 
80     x = buffer;
81   }
82 
83   a += m_from * lda * COMPSIZE;
84 
85   for (i = m_from; i < m_to; i++){
86 #if !defined(HER) && !defined(HERREV)
87 #ifndef COMPLEX
88     if (x[i * COMPSIZE] != ZERO) {
89 #ifndef LOWER
90       AXPYU_K(i + 1,         0, 0, alpha_r * x[i], x,     1, a,     1, NULL, 0);
91 #else
92       AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a + i, 1, NULL, 0);
93 #endif
94     }
95 #else
96     if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
97 #ifndef LOWER
98       AXPYU_K(i + 1,         0, 0,
99 	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
100 	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
101 	      x,                1, a,                1, NULL, 0);
102 #else
103       AXPYU_K(args -> m - i, 0, 0,
104 	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
105 	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
106 	      x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
107 #endif
108     }
109 #endif
110 #else
111     if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
112 #ifndef HERREV
113 #ifndef LOWER
114       AXPYU_K(i + 1,         0, 0,
115 	      alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1],
116 	      x,                1, a,                1, NULL, 0);
117 #else
118       AXPYU_K(args -> m - i, 0, 0,
119 	      alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1],
120 	      x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
121 #endif
122 #else
123 #ifndef LOWER
124       AXPYC_K(i + 1,         0, 0,
125 	      alpha_r * x[i * COMPSIZE + 0],  alpha_r * x[i * COMPSIZE + 1],
126 	      x,                1, a,                1, NULL, 0);
127 #else
128       AXPYC_K(args -> m - i, 0, 0,
129 	      alpha_r * x[i * COMPSIZE + 0],  alpha_r * x[i * COMPSIZE + 1],
130 	      x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0);
131 #endif
132 #endif
133 
134     }
135       a[i * COMPSIZE + 1] = ZERO;
136 #endif
137       a += lda * COMPSIZE;
138 
139   }
140 
141   return 0;
142 }
143 
144 #if !defined(COMPLEX) || defined(HER) || defined(HERREV)
CNAME(BLASLONG m,FLOAT alpha,FLOAT * x,BLASLONG incx,FLOAT * a,BLASLONG lda,FLOAT * buffer,int nthreads)145 int CNAME(BLASLONG m, FLOAT  alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){
146 #else
147 int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){
148 #endif
149 
150   blas_arg_t args;
151   blas_queue_t queue[MAX_CPU_NUMBER];
152   BLASLONG range_m[MAX_CPU_NUMBER + 1];
153 
154   BLASLONG width, i, num_cpu;
155 
156   double dnum;
157   int mask = 7;
158 
159 #ifdef SMP
160 #ifndef COMPLEX
161 #ifdef XDOUBLE
162   int mode  =  BLAS_XDOUBLE | BLAS_REAL;
163 #elif defined(DOUBLE)
164   int mode  =  BLAS_DOUBLE  | BLAS_REAL;
165 #else
166   int mode  =  BLAS_SINGLE  | BLAS_REAL;
167 #endif
168 #else
169 #ifdef XDOUBLE
170   int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
171 #elif defined(DOUBLE)
172   int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
173 #else
174   int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
175 #endif
176 #endif
177 #endif
178 
179   args.m = m;
180 
181   args.a = (void *)x;
182   args.b = (void *)a;
183 
184   args.lda = incx;
185   args.ldb = lda;
186 #if !defined(COMPLEX) || defined(HER) || defined(HERREV)
187   args.alpha = (void *)&alpha;
188 #else
189   args.alpha = (void *)alpha;
190 #endif
191 
192   dnum = (double)m * (double)m / (double)nthreads;
193   num_cpu  = 0;
194 
195 #ifndef LOWER
196 
197   range_m[MAX_CPU_NUMBER] = m;
198   i          = 0;
199 
200   while (i < m){
201 
202     if (nthreads - num_cpu > 1) {
203 
204       double di = (double)(m - i);
205       if (di * di - dnum > 0) {
206 	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
207       } else {
208 	width = m - i;
209       }
210 
211       if (width < 16) width = 16;
212       if (width > m - i) width = m - i;
213 
214     } else {
215       width = m - i;
216     }
217 
218     range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
219 
220     queue[num_cpu].mode    = mode;
221     queue[num_cpu].routine = syr_kernel;
222     queue[num_cpu].args    = &args;
223     queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
224     queue[num_cpu].range_n = NULL;
225     queue[num_cpu].sa      = NULL;
226     queue[num_cpu].sb      = NULL;
227     queue[num_cpu].next    = &queue[num_cpu + 1];
228 
229     num_cpu ++;
230     i += width;
231   }
232 
233 #else
234 
235   range_m[0] = 0;
236   i          = 0;
237 
238   while (i < m){
239 
240     if (nthreads - num_cpu > 1) {
241 
242       double di = (double)(m - i);
243       if (di * di - dnum > 0) {
244 	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
245       } else {
246 	width = m - i;
247       }
248 
249       if (width < 16) width = 16;
250       if (width > m - i) width = m - i;
251 
252     } else {
253       width = m - i;
254     }
255 
256     range_m[num_cpu + 1] = range_m[num_cpu] + width;
257 
258     queue[num_cpu].mode    = mode;
259     queue[num_cpu].routine = syr_kernel;
260     queue[num_cpu].args    = &args;
261     queue[num_cpu].range_m = &range_m[num_cpu];
262     queue[num_cpu].range_n = NULL;
263     queue[num_cpu].sa      = NULL;
264     queue[num_cpu].sb      = NULL;
265     queue[num_cpu].next    = &queue[num_cpu + 1];
266 
267     num_cpu ++;
268     i += width;
269   }
270 
271 #endif
272 
273   if (num_cpu) {
274     queue[0].sa = NULL;
275     queue[0].sb = buffer;
276 
277     queue[num_cpu - 1].next = NULL;
278 
279     exec_blas(num_cpu, queue);
280   }
281 
282   return 0;
283 }
284