/***************************************************************************
Copyright (c) 2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include "common.h"

#if defined(__VEC__) || defined(__ALTIVEC__)

#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#define HAVE_KERNEL_ADDY 1

#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <altivec.h>
#endif
#endif

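/* NBMAX is the row-block size: CNAME processes at most NBMAX rows of A at a
   time, accumulating the partial result for the block in a local y buffer
   before scaling it by alpha and adding it into the (possibly strided)
   output vector in add_y. */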
#define NBMAX 4096

#ifdef HAVE_KERNEL_4x4_VEC_ASM

#elif defined(HAVE_KERNEL_4x4_VEC)

static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    FLOAT *a0, *a1, *a2, *a3;
    a0 = ap;
    a1 = ap + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;

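    /* Each __vector double holds one double-complex value as {re, im}.
       The complex product is built from two halves:

           y += a * vx_r + swap(a) * vx_i

       where swap() exchanges the two doubles of a vector (vec_xxpermdi with
       selector 2) and vx_r/vx_i hold the (possibly sign-flipped) broadcast
       real and imaginary parts of x.  As an illustration only, the scalar
       equivalent for the non-conjugated case is:

           y_re += a_re * x_re - a_im * x_im;
           y_im += a_re * x_im + a_im * x_re;
    */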
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};
    register __vector double vx1_r = {x[2], x[2]};
    register __vector double vx1_i = {-x[3], x[3]};
    register __vector double vx2_r = {x[4], x[4]};
    register __vector double vx2_i = {-x[5], x[5]};
    register __vector double vx3_r = {x[6], x[6]};
    register __vector double vx3_i = {-x[7], x[7]};

#else
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
    register __vector double vx1_r = {x[2], -x[2]};
    register __vector double vx1_i = {x[3], x[3]};
    register __vector double vx2_r = {x[4], -x[4]};
    register __vector double vx2_i = {x[5], x[5]};
    register __vector double vx3_r = {x[6], -x[6]};
    register __vector double vx3_i = {x[7], x[7]};
#endif

    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;
    register __vector double *vptr_a1 = (__vector double *) a1;
    register __vector double *vptr_a2 = (__vector double *) a2;
    register __vector double *vptr_a3 = (__vector double *) a3;

    register __vector double vy_0;
    register __vector double va0;
    register __vector double va1;
    register __vector double va2;
    register __vector double va3;
    register __vector double vy_1;
    register __vector double va0_1;
    register __vector double va1_1;
    register __vector double va2_1;
    register __vector double va3_1;
    register __vector double vy_2;
    register __vector double va0_2;
    register __vector double va1_2;
    register __vector double va2_2;
    register __vector double va3_2;
    register __vector double vy_3;
    register __vector double va0_3;
    register __vector double va1_3;
    register __vector double va2_3;
    register __vector double va3_3;

    BLASLONG i = 0;
    while (i < n) {

        vy_0 = vy[i];
        va0 = vptr_a0[i];
        va1 = vptr_a1[i];
        va2 = vptr_a2[i];
        va3 = vptr_a3[i];

        vy_1 = vy[i + 1];
        va0_1 = vptr_a0[i + 1];
        va1_1 = vptr_a1[i + 1];
        va2_1 = vptr_a2[i + 1];
        va3_1 = vptr_a3[i + 1];

        vy_2 = vy[i + 2];
        va0_2 = vptr_a0[i + 2];
        va1_2 = vptr_a1[i + 2];
        va2_2 = vptr_a2[i + 2];
        va3_2 = vptr_a3[i + 2];

        vy_3 = vy[i + 3];
        va0_3 = vptr_a0[i + 3];
        va1_3 = vptr_a1[i + 3];
        va2_3 = vptr_a2[i + 3];
        va3_3 = vptr_a3[i + 3];

        vy_0 += va0 * vx0_r;
        vy_1 += va0_1 * vx0_r;
        vy_2 += va0_2 * vx0_r;
        vy_3 += va0_3 * vx0_r;

        vy_0 += va1 * vx1_r;
        vy_1 += va1_1 * vx1_r;
        vy_2 += va1_2 * vx1_r;
        vy_3 += va1_3 * vx1_r;

        va0 = vec_xxpermdi(va0, va0, 2);
        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);

        vy_0 += va2 * vx2_r;
        vy_1 += va2_1 * vx2_r;
        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
        vy_2 += va2_2 * vx2_r;
        vy_3 += va2_3 * vx2_r;

        va1 = vec_xxpermdi(va1, va1, 2);
        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);

        vy_0 += va3 * vx3_r;
        vy_1 += va3_1 * vx3_r;

        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);

        vy_2 += va3_2 * vx3_r;
        vy_3 += va3_3 * vx3_r;

        va2 = vec_xxpermdi(va2, va2, 2);
        va2_1 = vec_xxpermdi(va2_1, va2_1, 2);

        vy_0 += va0 * vx0_i;
        vy_1 += va0_1 * vx0_i;

        va2_2 = vec_xxpermdi(va2_2, va2_2, 2);
        va2_3 = vec_xxpermdi(va2_3, va2_3, 2);

        vy_2 += va0_2 * vx0_i;
        vy_3 += va0_3 * vx0_i;

        va3 = vec_xxpermdi(va3, va3, 2);
        va3_1 = vec_xxpermdi(va3_1, va3_1, 2);

        vy_0 += va1 * vx1_i;
        vy_1 += va1_1 * vx1_i;

        va3_2 = vec_xxpermdi(va3_2, va3_2, 2);
        va3_3 = vec_xxpermdi(va3_3, va3_3, 2);

        vy_2 += va1_2 * vx1_i;
        vy_3 += va1_3 * vx1_i;

        vy_0 += va2 * vx2_i;
        vy_1 += va2_1 * vx2_i;
        vy_2 += va2_2 * vx2_i;
        vy_3 += va2_3 * vx2_i;

        vy_0 += va3 * vx3_i;
        vy_1 += va3_1 * vx3_i;
        vy_2 += va3_2 * vx3_i;
        vy_3 += va3_3 * vx3_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;

        i += 4;
    }

}
#else

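/* Plain-C fallback used when no VSX kernel is available: each iteration
   updates one double-complex element of y from the four columns. */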
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    FLOAT *a0, *a1, *a2, *a3;
    a0 = ap;
    a1 = ap + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;

    for (i = 0; i < 2 * n; i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
        y[i] += a2[i] * x[4] - a2[i + 1] * x[5];
        y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4];
        y[i] += a3[i] * x[6] - a3[i + 1] * x[7];
        y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6];
#else
        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
        y[i] += a2[i] * x[4] + a2[i + 1] * x[5];
        y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4];
        y[i] += a3[i] * x[6] + a3[i + 1] * x[7];
        y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6];
#endif
    }
}

#endif

#ifdef HAVE_KERNEL_4x2_VEC

static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    FLOAT *a0, *a1;
    a0 = ap;
    a1 = ap + lda;

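    /* Two-column variant of the same split real/imaginary scheme used in
       zgemv_kernel_4x4 above. */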
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};
    register __vector double vx1_r = {x[2], x[2]};
    register __vector double vx1_i = {-x[3], x[3]};

#else
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
    register __vector double vx1_r = {x[2], -x[2]};
    register __vector double vx1_i = {x[3], x[3]};
#endif

    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;
    register __vector double *vptr_a1 = (__vector double *) a1;

    for (i = 0; i < n; i += 4) {

        register __vector double vy_0 = vy[i];
        register __vector double vy_1 = vy[i + 1];
        register __vector double vy_2 = vy[i + 2];
        register __vector double vy_3 = vy[i + 3];

        register __vector double va0 = vptr_a0[i];
        register __vector double va0_1 = vptr_a0[i + 1];
        register __vector double va0_2 = vptr_a0[i + 2];
        register __vector double va0_3 = vptr_a0[i + 3];

        register __vector double va1 = vptr_a1[i];
        register __vector double va1_1 = vptr_a1[i + 1];
        register __vector double va1_2 = vptr_a1[i + 2];
        register __vector double va1_3 = vptr_a1[i + 3];

        vy_0 += va0 * vx0_r;
        vy_1 += va0_1 * vx0_r;
        vy_2 += va0_2 * vx0_r;
        vy_3 += va0_3 * vx0_r;

        va0 = vec_xxpermdi(va0, va0, 2);
        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);

        vy_0 += va1 * vx1_r;
        vy_1 += va1_1 * vx1_r;
        vy_2 += va1_2 * vx1_r;
        vy_3 += va1_3 * vx1_r;

        va1 = vec_xxpermdi(va1, va1, 2);
        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);
        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);

        vy_0 += va0 * vx0_i;
        vy_1 += va0_1 * vx0_i;
        vy_2 += va0_2 * vx0_i;
        vy_3 += va0_3 * vx0_i;

        vy_0 += va1 * vx1_i;
        vy_1 += va1_1 * vx1_i;
        vy_2 += va1_2 * vx1_i;
        vy_3 += va1_3 * vx1_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;

    }
}
#else

static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    FLOAT *a0, *a1;
    a0 = ap;
    a1 = ap + lda;

    for (i = 0; i < 2 * n; i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
#else
        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
#endif
    }
}

#endif

#ifdef HAVE_KERNEL_4x1_VEC

static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    FLOAT *a0;
    a0 = ap;

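    /* Single-column variant; here the swapped copy of each A element is kept
       in a separate register (va0x*) so both halves of the complex product
       can be combined in one expression. */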
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};

#else
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
#endif

    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;

    for (i = 0; i < n; i += 4) {

        register __vector double vy_0 = vy[i];
        register __vector double vy_1 = vy[i + 1];
        register __vector double vy_2 = vy[i + 2];
        register __vector double vy_3 = vy[i + 3];

        register __vector double va0 = vptr_a0[i];
        register __vector double va0_1 = vptr_a0[i + 1];
        register __vector double va0_2 = vptr_a0[i + 2];
        register __vector double va0_3 = vptr_a0[i + 3];

        register __vector double va0x = vec_xxpermdi(va0, va0, 2);
        register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
        register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
        register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
        vy_0 += va0 * vx0_r + va0x * vx0_i;
        vy_1 += va0_1 * vx0_r + va0x_1 * vx0_i;
        vy_2 += va0_2 * vx0_r + va0x_2 * vx0_i;
        vy_3 += va0_3 * vx0_r + va0x_3 * vx0_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;

    }
}

#else

static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    FLOAT *a0;
    a0 = ap;

    for (i = 0; i < 2 * n; i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
#else
        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
#endif
    }
}

#endif

#ifdef HAVE_KERNEL_ADDY

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;

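    /* add_y applies the final scaling: dest += alpha * src, element-wise over
       n double-complex values, using the same two-half complex multiply as
       the gemv kernels with alpha broadcast into valpha_r/valpha_i. */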
#if !defined(XCONJ)

    register __vector double valpha_r = {alpha_r, alpha_r};
    register __vector double valpha_i = {-alpha_i, alpha_i};

#else
    register __vector double valpha_r = {alpha_r, -alpha_r};
    register __vector double valpha_i = {alpha_i, alpha_i};
#endif

    register __vector double *vptr_src = (__vector double *) src;
    if (inc_dest != 2) {
        register __vector double *vptr_y = (__vector double *) dest;
        // inc_dest is already scaled by 2 (it counts FLOATs, two per complex
        // element), so it can be added directly to the FLOAT pointer
        register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
        register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
        register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);
        BLASLONG dest_t = 0;
        // each pointer advances 4 complex elements (4 * inc_dest FLOATs) per
        // iteration, i.e. inc_dest << 1 in __vector double units
        BLASLONG add_dest = inc_dest << 1;
        for (i = 0; i < n; i += 4) {

            register __vector double vy_0 = vptr_y[dest_t];
            register __vector double vy_1 = vptr_y1[dest_t];
            register __vector double vy_2 = vptr_y2[dest_t];
            register __vector double vy_3 = vptr_y3[dest_t];

            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            vy_0 += vsrc * valpha_r;
            vy_1 += vsrc_1 * valpha_r;
            vy_2 += vsrc_2 * valpha_r;
            vy_3 += vsrc_3 * valpha_r;

            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc * valpha_i;
            vy_1 += vsrc_1 * valpha_i;
            vy_2 += vsrc_2 * valpha_i;
            vy_3 += vsrc_3 * valpha_i;

            vptr_y[dest_t] = vy_0;
            vptr_y1[dest_t] = vy_1;
            vptr_y2[dest_t] = vy_2;
            vptr_y3[dest_t] = vy_3;

            dest_t += add_dest;

        }

        return;
    } else {
        register __vector double *vptr_y = (__vector double *) dest;
        for (i = 0; i < n; i += 4) {

            register __vector double vy_0 = vptr_y[i];
            register __vector double vy_1 = vptr_y[i + 1];
            register __vector double vy_2 = vptr_y[i + 2];
            register __vector double vy_3 = vptr_y[i + 3];

            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            vy_0 += vsrc * valpha_r;
            vy_1 += vsrc_1 * valpha_r;
            vy_2 += vsrc_2 * valpha_r;
            vy_3 += vsrc_3 * valpha_r;

            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc * valpha_i;
            vy_1 += vsrc_1 * valpha_i;
            vy_2 += vsrc_2 * valpha_i;
            vy_3 += vsrc_3 * valpha_i;

            vptr_y[i] = vy_0;
            vptr_y[i + 1] = vy_1;
            vptr_y[i + 2] = vy_2;
            vptr_y[i + 3] = vy_3;

        }

        return;
    }
}

#else

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;

    if (inc_dest != 2) {

        FLOAT temp_r;
        FLOAT temp_i;
        for (i = 0; i < n; i++) {
#if !defined(XCONJ)
            temp_r = alpha_r * src[0] - alpha_i * src[1];
            temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
            temp_r = alpha_r * src[0] + alpha_i * src[1];
            temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

            *dest += temp_r;
            *(dest + 1) += temp_i;

            src += 2;
            dest += inc_dest;
        }
        return;
    }

    FLOAT temp_r0;
    FLOAT temp_i0;
    FLOAT temp_r1;
    FLOAT temp_i1;
    FLOAT temp_r2;
    FLOAT temp_i2;
    FLOAT temp_r3;
    FLOAT temp_i3;
    for (i = 0; i < n; i += 4) {
#if !defined(XCONJ)
        temp_r0 = alpha_r * src[0] - alpha_i * src[1];
        temp_i0 = alpha_r * src[1] + alpha_i * src[0];
        temp_r1 = alpha_r * src[2] - alpha_i * src[3];
        temp_i1 = alpha_r * src[3] + alpha_i * src[2];
        temp_r2 = alpha_r * src[4] - alpha_i * src[5];
        temp_i2 = alpha_r * src[5] + alpha_i * src[4];
        temp_r3 = alpha_r * src[6] - alpha_i * src[7];
        temp_i3 = alpha_r * src[7] + alpha_i * src[6];
#else
        temp_r0 = alpha_r * src[0] + alpha_i * src[1];
        temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
        temp_r1 = alpha_r * src[2] + alpha_i * src[3];
        temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
        temp_r2 = alpha_r * src[4] + alpha_i * src[5];
        temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
        temp_r3 = alpha_r * src[6] + alpha_i * src[7];
        temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
#endif

        dest[0] += temp_r0;
        dest[1] += temp_i0;
        dest[2] += temp_r1;
        dest[3] += temp_i1;
        dest[4] += temp_r2;
        dest[5] += temp_i2;
        dest[6] += temp_r3;
        dest[7] += temp_i3;

        src += 8;
        dest += 8;
    }
    return;
}
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    FLOAT xbuffer[8] __attribute__((aligned(16)));
    FLOAT *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    n1 = n / 4;
    n2 = n % 4;

    m3 = m % 4;
    m1 = m - (m % 4);
    m2 = (m % NBMAX) - (m % 4);
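    /* Blocking layout: n1 groups of 4 columns plus n2 leftover columns;
       m1 rows handled by the vector kernels in blocks of at most NBMAX,
       with m2 the size of the final partial block and m3 (< 4) trailing
       rows handled by the scalar tail code at the end of this function. */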

    y_ptr = y;

    BLASLONG NB = NBMAX;

    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;

        x_ptr = x;
        // clear the accumulation buffer: NB double-complex elements, 16 bytes each
        memset(ybuffer, 0, NB * 16);
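        /* ybuffer accumulates the unscaled product of this row block with x;
           alpha is applied once per block in add_y below. */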

        if (inc_x == 2) {

            for (i = 0; i < n1; i++) {
                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);

                a_ptr += lda << 2;
                x_ptr += 8;
            }

            if (n2 & 2) {
                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }

            if (n2 & 1) {
                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;
            }
        } else {

            for (i = 0; i < n1; i++) {

                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                xbuffer[3] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[4] = x_ptr[0];
                xbuffer[5] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[6] = x_ptr[0];
                xbuffer[7] = x_ptr[1];
                x_ptr += inc_x;

                zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);

                a_ptr += lda << 2;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;
            }

        }

        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);
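    /* Tail handling: the remaining m3 (1, 2 or 3) rows are computed as
       scalar complex dot products, with a fast path when A and x are
       contiguous. */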

    if (m3 == 1) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r = 0.0;
        FLOAT temp_i = 0.0;

        if (lda == 2 && inc_x == 2) {

            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif

                a_ptr += 4;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += 2;
                x_ptr += 2;
            }

        } else {

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
        return (0);
    }

    if (m3 == 2) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;

        if (lda == 4 && inc_x == 2) {

            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif

                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += 4;
                x_ptr += 2;
            }

        } else {

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
        return (0);
    }

    if (m3 == 3) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;
        FLOAT temp_r2 = 0.0;
        FLOAT temp_i2 = 0.0;

        if (lda == 6 && inc_x == 2) {

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += 6;
                x_ptr += 2;
            }

        } else {

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
        return (0);
    }

    return (0);
}