1 /***************************************************************************
2 Copyright (c) 2019, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27
28 #include "common.h"
29
30 #define NBMAX 1024
31
zgemv_kernel_4x4(BLASLONG n,FLOAT ** ap,FLOAT * x,FLOAT * y)32 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
33 register FLOAT *ap0 = ap[0];
34 register FLOAT *ap1 = ap[1];
35 register FLOAT *ap2 = ap[2];
36 register FLOAT *ap3 = ap[3];
37
38 __asm__("vl %%v16,0(%[x])\n\t"
39 "vl %%v17,16(%[x])\n\t"
40 "vl %%v18,32(%[x])\n\t"
41 "vl %%v19,48(%[x])\n\t"
42 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
43 "vleg %%v20,8(%[x]),0\n\t"
44 "wflcdb %%v20,%%v20\n\t"
45 "vleg %%v20,0(%[x]),1\n\t"
46 "vleg %%v21,24(%[x]),0\n\t"
47 "wflcdb %%v21,%%v21\n\t"
48 "vleg %%v21,16(%[x]),1\n\t"
49 "vleg %%v22,40(%[x]),0\n\t"
50 "wflcdb %%v22,%%v22\n\t"
51 "vleg %%v22,32(%[x]),1\n\t"
52 "vleg %%v23,56(%[x]),0\n\t"
53 "wflcdb %%v23,%%v23\n\t"
54 "vleg %%v23,48(%[x]),1\n\t"
55 #else
56 "vleg %%v20,0(%[x]),1\n\t"
57 "vflcdb %%v20,%%v20\n\t"
58 "vleg %%v20,8(%[x]),0\n\t"
59 "vleg %%v21,16(%[x]),1\n\t"
60 "vflcdb %%v21,%%v21\n\t"
61 "vleg %%v21,24(%[x]),0\n\t"
62 "vleg %%v22,32(%[x]),1\n\t"
63 "vflcdb %%v22,%%v22\n\t"
64 "vleg %%v22,40(%[x]),0\n\t"
65 "vleg %%v23,48(%[x]),1\n\t"
66 "vflcdb %%v23,%%v23\n\t"
67 "vleg %%v23,56(%[x]),0\n\t"
68 #endif
69 "xgr %%r1,%%r1\n\t"
70 "srlg %[n],%[n],1\n\t"
71 "0:\n\t"
72 "pfd 1,1024(%%r1,%[ap0])\n\t"
73 "pfd 1,1024(%%r1,%[ap1])\n\t"
74 "pfd 1,1024(%%r1,%[ap2])\n\t"
75 "pfd 1,1024(%%r1,%[ap3])\n\t"
76 "pfd 2,1024(%%r1,%[y])\n\t"
77 "vl %%v0,0(%%r1,%[y])\n\t"
78 "vl %%v1,16(%%r1,%[y])\n\t"
79 "vlrepg %%v24,0(%%r1,%[ap0])\n\t"
80 "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
81 "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
82 "vlrepg %%v27,8(%%r1,%[ap1])\n\t"
83 "vlrepg %%v28,16(%%r1,%[ap0])\n\t"
84 "vlrepg %%v29,24(%%r1,%[ap0])\n\t"
85 "vlrepg %%v30,16(%%r1,%[ap1])\n\t"
86 "vlrepg %%v31,24(%%r1,%[ap1])\n\t"
87 "vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
88 "vfmadb %%v1,%%v28,%%v16,%%v1\n\t"
89 "vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
90 "vfmadb %%v1,%%v29,%%v20,%%v1\n\t"
91 "vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
92 "vfmadb %%v1,%%v30,%%v17,%%v1\n\t"
93 "vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
94 "vfmadb %%v1,%%v31,%%v21,%%v1\n\t"
95 "vlrepg %%v24,0(%%r1,%[ap2])\n\t"
96 "vlrepg %%v25,8(%%r1,%[ap2])\n\t"
97 "vlrepg %%v26,0(%%r1,%[ap3])\n\t"
98 "vlrepg %%v27,8(%%r1,%[ap3])\n\t"
99 "vlrepg %%v28,16(%%r1,%[ap2])\n\t"
100 "vlrepg %%v29,24(%%r1,%[ap2])\n\t"
101 "vlrepg %%v30,16(%%r1,%[ap3])\n\t"
102 "vlrepg %%v31,24(%%r1,%[ap3])\n\t"
103 "vfmadb %%v0,%%v24,%%v18,%%v0\n\t"
104 "vfmadb %%v1,%%v28,%%v18,%%v1\n\t"
105 "vfmadb %%v0,%%v25,%%v22,%%v0\n\t"
106 "vfmadb %%v1,%%v29,%%v22,%%v1\n\t"
107 "vfmadb %%v0,%%v26,%%v19,%%v0\n\t"
108 "vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
109 "vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
110 "vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
111 "vst %%v0,0(%%r1,%[y])\n\t"
112 "vst %%v1,16(%%r1,%[y])\n\t"
113 "agfi %%r1,32\n\t"
114 "brctg %[n],0b"
115 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
116 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0),
117 "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1),
118 "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2),
119 "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3),
120 "m"(*(const FLOAT (*)[8]) x),[x] "a"(x)
121 : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
122 "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
123 "v31");
124 }
125
zgemv_kernel_4x2(BLASLONG n,FLOAT ** ap,FLOAT * x,FLOAT * y)126 static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
127 register FLOAT *ap0 = ap[0];
128 register FLOAT *ap1 = ap[1];
129
130 __asm__("vl %%v16,0(%[x])\n\t"
131 "vl %%v17,16(%[x])\n\t"
132 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
133 "vleg %%v18,8(%[x]),0\n\t"
134 "wflcdb %%v18,%%v18\n\t"
135 "vleg %%v18,0(%[x]),1\n\t"
136 "vleg %%v19,24(%[x]),0\n\t"
137 "wflcdb %%v19,%%v19\n\t"
138 "vleg %%v19,16(%[x]),1\n\t"
139 #else
140 "vleg %%v18,0(%[x]),1\n\t"
141 "vflcdb %%v18,%%v18\n\t"
142 "vleg %%v18,8(%[x]),0\n\t"
143 "vleg %%v19,16(%[x]),1\n\t"
144 "vflcdb %%v19,%%v19\n\t"
145 "vleg %%v19,24(%[x]),0\n\t"
146 #endif
147 "xgr %%r1,%%r1\n\t"
148 "srlg %[n],%[n],1\n\t"
149 "0:\n\t"
150 "pfd 1,1024(%%r1,%[ap0])\n\t"
151 "pfd 1,1024(%%r1,%[ap1])\n\t"
152 "pfd 2,1024(%%r1,%[y])\n\t"
153 "vl %%v0,0(%%r1,%[y])\n\t"
154 "vl %%v1,16(%%r1,%[y])\n\t"
155 "vlrepg %%v20,0(%%r1,%[ap0])\n\t"
156 "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
157 "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
158 "vlrepg %%v23,8(%%r1,%[ap1])\n\t"
159 "vlrepg %%v24,16(%%r1,%[ap0])\n\t"
160 "vlrepg %%v25,24(%%r1,%[ap0])\n\t"
161 "vlrepg %%v26,16(%%r1,%[ap1])\n\t"
162 "vlrepg %%v27,24(%%r1,%[ap1])\n\t"
163 "vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
164 "vfmadb %%v1,%%v24,%%v16,%%v1\n\t"
165 "vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
166 "vfmadb %%v1,%%v25,%%v18,%%v1\n\t"
167 "vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
168 "vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
169 "vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
170 "vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
171 "vst %%v0,0(%%r1,%[y])\n\t"
172 "vst %%v1,16(%%r1,%[y])\n\t"
173 "agfi %%r1,32\n\t"
174 "brctg %[n],0b"
175 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
176 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0),
177 "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1),
178 "m"(*(const FLOAT (*)[4]) x),[x] "a"(x)
179 : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
180 "v22", "v23", "v24", "v25", "v26", "v27");
181 }
182
zgemv_kernel_4x1(BLASLONG n,FLOAT * ap,FLOAT * x,FLOAT * y)183 static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
184 __asm__("vl %%v16,0(%[x])\n\t"
185 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
186 "vleg %%v17,8(%[x]),0\n\t"
187 "wflcdb %%v17,%%v17\n\t"
188 "vleg %%v17,0(%[x]),1\n\t"
189 #else
190 "vleg %%v17,0(%[x]),1\n\t"
191 "vflcdb %%v17,%%v17\n\t"
192 "vleg %%v17,8(%[x]),0\n\t"
193 #endif
194 "xgr %%r1,%%r1\n\t"
195 "srlg %[n],%[n],1\n\t"
196 "0:\n\t"
197 "pfd 1,1024(%%r1,%[ap])\n\t"
198 "pfd 2,1024(%%r1,%[y])\n\t"
199 "vl %%v0,0(%%r1,%[y])\n\t"
200 "vl %%v1,16(%%r1,%[y])\n\t"
201 "vlrepg %%v18,0(%%r1,%[ap])\n\t"
202 "vlrepg %%v19,8(%%r1,%[ap])\n\t"
203 "vlrepg %%v20,16(%%r1,%[ap])\n\t"
204 "vlrepg %%v21,24(%%r1,%[ap])\n\t"
205 "vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
206 "vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
207 "vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
208 "vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
209 "vst %%v0,0(%%r1,%[y])\n\t"
210 "vst %%v1,16(%%r1,%[y])\n\t"
211 "agfi %%r1,32\n\t"
212 "brctg %[n],0b"
213 : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
214 : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap),
215 "m"(*(const FLOAT (*)[2]) x),[x] "a"(x)
216 : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21");
217 }
218
add_y_4(BLASLONG n,FLOAT * src,FLOAT * dest,FLOAT alpha_r,FLOAT alpha_i)219 static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
220 FLOAT alpha_i) {
221 __asm__(
222 #if !defined(XCONJ)
223 "vlrepg %%v0,%[alpha_r]\n\t"
224 "vleg %%v1,%[alpha_i],0\n\t"
225 "wflcdb %%v1,%%v1\n\t"
226 "vleg %%v1,%[alpha_i],1\n\t"
227 #else
228 "vleg %%v0,%[alpha_r],1\n\t"
229 "vflcdb %%v0,%%v0\n\t"
230 "vleg %%v0,%[alpha_r],0\n\t"
231 "vlrepg %%v1,%[alpha_i]\n\t"
232 #endif
233 "xgr %%r1,%%r1\n\t"
234 "srlg %[n],%[n],2\n\t"
235 "0:\n\t"
236 "pfd 1,1024(%%r1,%[src])\n\t"
237 "pfd 2,1024(%%r1,%[dest])\n\t"
238 "vl %%v16,0(%%r1,%[src])\n\t"
239 "vl %%v17,16(%%r1,%[src])\n\t"
240 "vl %%v18,32(%%r1,%[src])\n\t"
241 "vl %%v19,48(%%r1,%[src])\n\t"
242 "vl %%v20,0(%%r1,%[dest])\n\t"
243 "vl %%v21,16(%%r1,%[dest])\n\t"
244 "vl %%v22,32(%%r1,%[dest])\n\t"
245 "vl %%v23,48(%%r1,%[dest])\n\t"
246 "vpdi %%v24,%%v16,%%v16,4\n\t"
247 "vpdi %%v25,%%v17,%%v17,4\n\t"
248 "vpdi %%v26,%%v18,%%v18,4\n\t"
249 "vpdi %%v27,%%v19,%%v19,4\n\t"
250 "vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
251 "vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
252 "vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
253 "vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
254 "vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
255 "vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
256 "vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
257 "vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
258 "vst %%v28,0(%%r1,%[dest])\n\t"
259 "vst %%v29,16(%%r1,%[dest])\n\t"
260 "vst %%v30,32(%%r1,%[dest])\n\t"
261 "vst %%v31,48(%%r1,%[dest])\n\t"
262 "agfi %%r1,64\n\t"
263 "brctg %[n],0b"
264 : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n)
265 : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),
266 [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
267 : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
268 "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
269 "v31");
270 }
271
add_y(BLASLONG n,FLOAT * src,FLOAT * dest,BLASLONG inc_dest,FLOAT alpha_r,FLOAT alpha_i)272 static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
273 FLOAT alpha_r, FLOAT alpha_i) {
274 BLASLONG i;
275
276 if (inc_dest != 2) {
277
278 FLOAT temp_r;
279 FLOAT temp_i;
280 for (i = 0; i < n; i++) {
281 #if !defined(XCONJ)
282 temp_r = alpha_r * src[0] - alpha_i * src[1];
283 temp_i = alpha_r * src[1] + alpha_i * src[0];
284 #else
285 temp_r = alpha_r * src[0] + alpha_i * src[1];
286 temp_i = -alpha_r * src[1] + alpha_i * src[0];
287 #endif
288
289 *dest += temp_r;
290 *(dest + 1) += temp_i;
291
292 src += 2;
293 dest += inc_dest;
294 }
295 return;
296 }
297
298 add_y_4(n, src, dest, alpha_r, alpha_i);
299 }
300
CNAME(BLASLONG m,BLASLONG n,BLASLONG dummy1,FLOAT alpha_r,FLOAT alpha_i,FLOAT * a,BLASLONG lda,FLOAT * x,BLASLONG inc_x,FLOAT * y,BLASLONG inc_y,FLOAT * buffer)301 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
302 FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
303 BLASLONG inc_y, FLOAT *buffer) {
304 BLASLONG i;
305 FLOAT *a_ptr;
306 FLOAT *x_ptr;
307 FLOAT *y_ptr;
308 FLOAT *ap[4];
309 BLASLONG n1;
310 BLASLONG m1;
311 BLASLONG m2;
312 BLASLONG m3;
313 BLASLONG n2;
314 BLASLONG lda4;
315 FLOAT xbuffer[8], *ybuffer;
316
317 if (m < 1)
318 return (0);
319 if (n < 1)
320 return (0);
321
322 ybuffer = buffer;
323
324 inc_x *= 2;
325 inc_y *= 2;
326 lda *= 2;
327 lda4 = 4 * lda;
328
329 n1 = n / 4;
330 n2 = n % 4;
331
332 m3 = m % 4;
333 m1 = m - (m % 4);
334 m2 = (m % NBMAX) - (m % 4);
335
336 y_ptr = y;
337
338 BLASLONG NB = NBMAX;
339
340 while (NB == NBMAX) {
341
342 m1 -= NB;
343 if (m1 < 0) {
344 if (m2 == 0)
345 break;
346 NB = m2;
347 }
348
349 a_ptr = a;
350 ap[0] = a_ptr;
351 ap[1] = a_ptr + lda;
352 ap[2] = ap[1] + lda;
353 ap[3] = ap[2] + lda;
354 x_ptr = x;
355 //zero_y(NB,ybuffer);
356 memset(ybuffer, 0, NB * 16);
357
358 if (inc_x == 2) {
359
360 for (i = 0; i < n1; i++) {
361 zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
362 ap[0] += lda4;
363 ap[1] += lda4;
364 ap[2] += lda4;
365 ap[3] += lda4;
366 a_ptr += lda4;
367 x_ptr += 8;
368 }
369
370 if (n2 & 2) {
371 zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
372 x_ptr += 4;
373 a_ptr += 2 * lda;
374
375 }
376
377 if (n2 & 1) {
378 zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
379 /* x_ptr += 2;
380 a_ptr += lda; */
381
382 }
383 } else {
384
385 for (i = 0; i < n1; i++) {
386
387 xbuffer[0] = x_ptr[0];
388 xbuffer[1] = x_ptr[1];
389 x_ptr += inc_x;
390 xbuffer[2] = x_ptr[0];
391 xbuffer[3] = x_ptr[1];
392 x_ptr += inc_x;
393 xbuffer[4] = x_ptr[0];
394 xbuffer[5] = x_ptr[1];
395 x_ptr += inc_x;
396 xbuffer[6] = x_ptr[0];
397 xbuffer[7] = x_ptr[1];
398 x_ptr += inc_x;
399
400 zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
401 ap[0] += lda4;
402 ap[1] += lda4;
403 ap[2] += lda4;
404 ap[3] += lda4;
405 a_ptr += lda4;
406 }
407
408 for (i = 0; i < n2; i++) {
409 xbuffer[0] = x_ptr[0];
410 xbuffer[1] = x_ptr[1];
411 x_ptr += inc_x;
412 zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
413 a_ptr += 1 * lda;
414
415 }
416
417 }
418
419 add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
420 a += 2 * NB;
421 y_ptr += NB * inc_y;
422 }
423
424 if (m3 == 0)
425 return (0);
426
427 if (m3 == 1) {
428 a_ptr = a;
429 x_ptr = x;
430 FLOAT temp_r = 0.0;
431 FLOAT temp_i = 0.0;
432
433 if (lda == 2 && inc_x == 2) {
434
435 for (i = 0; i < (n & -2); i += 2) {
436 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
437 temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
438 temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
439 temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
440 temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
441 #else
442 temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
443 temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
444 temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
445 temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
446 #endif
447
448 a_ptr += 4;
449 x_ptr += 4;
450 }
451
452 for (; i < n; i++) {
453 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
454 temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
455 temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
456 #else
457 temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
458 temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
459 #endif
460
461 a_ptr += 2;
462 x_ptr += 2;
463 }
464
465 } else {
466
467 for (i = 0; i < n; i++) {
468 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
469 temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
470 temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
471 #else
472 temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
473 temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
474 #endif
475
476 a_ptr += lda;
477 x_ptr += inc_x;
478 }
479
480 }
481 #if !defined(XCONJ)
482 y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
483 y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
484 #else
485 y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
486 y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
487 #endif
488 return (0);
489 }
490
491 if (m3 == 2) {
492 a_ptr = a;
493 x_ptr = x;
494 FLOAT temp_r0 = 0.0;
495 FLOAT temp_i0 = 0.0;
496 FLOAT temp_r1 = 0.0;
497 FLOAT temp_i1 = 0.0;
498
499 if (lda == 4 && inc_x == 2) {
500
501 for (i = 0; i < (n & -2); i += 2) {
502 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
503
504 temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
505 temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
506 temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
507 temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
508
509 temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
510 temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
511 temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
512 temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
513
514 #else
515 temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
516 temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
517 temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
518 temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
519
520 temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
521 temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
522 temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
523 temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
524
525 #endif
526
527 a_ptr += 8;
528 x_ptr += 4;
529 }
530
531 for (; i < n; i++) {
532 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
533 temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
534 temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
535 temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
536 temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
537 #else
538 temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
539 temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
540 temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
541 temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
542 #endif
543
544 a_ptr += 4;
545 x_ptr += 2;
546 }
547
548 } else {
549
550 for (i = 0; i < n; i++) {
551 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
552 temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
553 temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
554 temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
555 temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
556 #else
557 temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
558 temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
559 temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
560 temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
561 #endif
562
563 a_ptr += lda;
564 x_ptr += inc_x;
565 }
566
567 }
568 #if !defined(XCONJ)
569 y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
570 y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
571 y_ptr += inc_y;
572 y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
573 y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
574 #else
575 y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
576 y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
577 y_ptr += inc_y;
578 y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
579 y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
580 #endif
581 return (0);
582 }
583
584 if (m3 == 3) {
585 a_ptr = a;
586 x_ptr = x;
587 FLOAT temp_r0 = 0.0;
588 FLOAT temp_i0 = 0.0;
589 FLOAT temp_r1 = 0.0;
590 FLOAT temp_i1 = 0.0;
591 FLOAT temp_r2 = 0.0;
592 FLOAT temp_i2 = 0.0;
593
594 if (lda == 6 && inc_x == 2) {
595
596 for (i = 0; i < n; i++) {
597 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
598 temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
599 temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
600 temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
601 temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
602 temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
603 temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
604 #else
605 temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
606 temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
607 temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
608 temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
609 temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
610 temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
611 #endif
612
613 a_ptr += 6;
614 x_ptr += 2;
615 }
616
617 } else {
618
619 for (i = 0; i < n; i++) {
620 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
621 temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
622 temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
623 temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
624 temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
625 temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
626 temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
627 #else
628 temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
629 temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
630 temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
631 temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
632 temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
633 temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
634 #endif
635
636 a_ptr += lda;
637 x_ptr += inc_x;
638 }
639
640 }
641 #if !defined(XCONJ)
642 y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
643 y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
644 y_ptr += inc_y;
645 y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
646 y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
647 y_ptr += inc_y;
648 y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
649 y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
650 #else
651 y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
652 y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
653 y_ptr += inc_y;
654 y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
655 y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
656 y_ptr += inc_y;
657 y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
658 y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
659 #endif
660 return (0);
661 }
662
663 return (0);
664 }
665