1 /***************************************************************************
2 Copyright (c) 2018, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *****************************************************************************/
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include "common.h"
31 
32 #if defined(__VEC__) || defined(__ALTIVEC__)
33 
34 #define HAVE_KERNEL_4x4_VEC 1
35 #define HAVE_KERNEL_4x2_VEC 1
36 #define HAVE_KERNEL_4x1_VEC 1
37 #define HAVE_KERNEL_ADDY 1
38 
39 #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
40 #include <altivec.h>
41 #endif
42 #endif
43 
44 //
45 #define NBMAX 4096
46 
47 #ifdef HAVE_KERNEL_4x4_VEC_ASM
48 
49 #elif HAVE_KERNEL_4x4_VEC
50 
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    /* VSX kernel: y += A[:,0:4] * x[0:4] for n complex double entries.
     *
     * n   : number of complex elements of y to update; the loop consumes
     *       4 per iteration, so callers pass a multiple of 4
     * lda : column stride of ap in FLOAT (real) units (caller already
     *       multiplied it by 2)
     * ap  : first of four consecutive matrix columns
     * x   : four complex scalars (8 FLOATs), gathered contiguously
     * y   : contiguous accumulation buffer (interleaved re,im)
     *
     * Complex-multiply strategy: each x[j] is pre-splatted into
     *   vx_r = {re, re} and vx_i = {-im, im}   (signs flipped for the
     * conjugated variants), so one complex axpy term becomes two vector
     * multiply-adds, with vec_xxpermdi(va, va, 2) swapping the (re,im)
     * halves of the matrix element between them.
     */
    FLOAT *a0, *a1, *a2, *a3;
    a0 = ap;
    a1 = ap + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    /* plain (or doubly-conjugated) product: signs give (ar*xr - ai*xi, ar*xi + ai*xr) */
    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};
    register __vector double vx1_r = {x[2], x[2]};
    register __vector double vx1_i = {-x[3], x[3]};
    register __vector double vx2_r = {x[4], x[4]};
    register __vector double vx2_i = {-x[5], x[5]};
    register __vector double vx3_r = {x[6], x[6]};
    register __vector double vx3_i = {-x[7], x[7]};

#else
    /* conjugated variant: the sign flip is folded into the real splat instead */
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
    register __vector double vx1_r = {x[2], -x[2]};
    register __vector double vx1_i = {x[3], x[3]};
    register __vector double vx2_r = {x[4], -x[4]};
    register __vector double vx2_i = {x[5], x[5]};
    register __vector double vx3_r = {x[6], -x[6]};
    register __vector double vx3_i = {x[7], x[7]};
#endif

    /* one __vector double holds exactly one complex element (re,im) */
    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;
    register __vector double *vptr_a1 = (__vector double *) a1;
    register __vector double *vptr_a2 = (__vector double *) a2;
    register __vector double *vptr_a3 = (__vector double *) a3;


    register __vector double vy_0;
    register __vector double va0;
    register __vector double va1;
    register __vector double va2;
    register __vector double va3;
    register __vector double vy_1;
    register __vector double va0_1;
    register __vector double va1_1;
    register __vector double va2_1;
    register __vector double va3_1;
    register __vector double vy_2;
    register __vector double va0_2;
    register __vector double va1_2;
    register __vector double va2_2;
    register __vector double va3_2;
    register __vector double vy_3;
    register __vector double va0_3;
    register __vector double va1_3;
    register __vector double va2_3;
    register __vector double va3_3;

     BLASLONG  i = 0;
    /* 4 complex rows per iteration; loads, permutes and FMAs are
     * interleaved by hand to hide latency — keep the statement order. */
    while (i < n) {

        /* load rows i..i+3 of y and of all four columns */
        vy_0 = vy[i];
        va0 = vptr_a0[i];
        va1 = vptr_a1[i];
        va2 = vptr_a2[i];
        va3 = vptr_a3[i];

        vy_1 = vy[i + 1];
        va0_1 = vptr_a0[i + 1];
        va1_1 = vptr_a1[i + 1];
        va2_1 = vptr_a2[i + 1];
        va3_1 = vptr_a3[i + 1];

        vy_2 = vy[i + 2];
        va0_2 = vptr_a0[i + 2];
        va1_2 = vptr_a1[i + 2];
        va2_2 = vptr_a2[i + 2];
        va3_2 = vptr_a3[i + 2];

        vy_3 = vy[i + 3];
        va0_3 = vptr_a0[i + 3];
        va1_3 = vptr_a1[i + 3];
        va2_3 = vptr_a2[i + 3];
        va3_3 = vptr_a3[i + 3];

        /* real-splat contributions of column 0 */
        vy_0 += va0*vx0_r;
        vy_1 += va0_1*vx0_r;
        vy_2 += va0_2*vx0_r;
        vy_3 += va0_3*vx0_r;


        /* real-splat contributions of column 1 */
        vy_0 += va1*vx1_r;
        vy_1 += va1_1*vx1_r;
        vy_2 += va1_2*vx1_r;
        vy_3 += va1_3*vx1_r;

        /* swap (re,im) halves of column-0 elements for the imag-splat pass */
        va0 = vec_xxpermdi(va0, va0, 2);
        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);


        vy_0 += va2*vx2_r;
        vy_1 += va2_1*vx2_r;
        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
        vy_2 += va2_2*vx2_r;
        vy_3 += va2_3*vx2_r;

        va1 = vec_xxpermdi(va1, va1, 2);
        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);


        vy_0 += va3*vx3_r;
        vy_1 += va3_1*vx3_r;

        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);

        vy_2 += va3_2*vx3_r;
        vy_3 += va3_3*vx3_r;

        va2 = vec_xxpermdi(va2, va2, 2);
        va2_1 = vec_xxpermdi(va2_1, va2_1, 2);


        /* imag-splat contributions (operands are now swapped) */
        vy_0 += va0*vx0_i;
        vy_1 += va0_1*vx0_i;

        va2_2 = vec_xxpermdi(va2_2, va2_2, 2);
        va2_3 = vec_xxpermdi(va2_3, va2_3, 2);

        vy_2 += va0_2*vx0_i;
        vy_3 += va0_3*vx0_i;

        va3 = vec_xxpermdi(va3, va3, 2);
        va3_1 = vec_xxpermdi(va3_1, va3_1, 2);


        vy_0 += va1*vx1_i;
        vy_1 += va1_1*vx1_i;

        va3_2 = vec_xxpermdi(va3_2, va3_2, 2);
        va3_3 = vec_xxpermdi(va3_3, va3_3, 2);

        vy_2 += va1_2*vx1_i;
        vy_3 += va1_3*vx1_i;

        vy_0 += va2*vx2_i;
        vy_1 += va2_1*vx2_i;
        vy_2 += va2_2*vx2_i;
        vy_3 += va2_3*vx2_i;

        vy_0 += va3*vx3_i;
        vy_1 += va3_1*vx3_i;
        vy_2 += va3_2*vx3_i;
        vy_3 += va3_3*vx3_i;

        /* store the four updated complex elements */
        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;


        i += 4;


    }

}
219 #else
220 
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* Portable scalar fallback: y += a0*x[0] + a1*x[1] + a2*x[2] + a3*x[3]
     * over n complex entries. Data is interleaved (re,im); lda is the
     * column stride in FLOAT units. */
    FLOAT *c0 = ap;
    FLOAT *c1 = ap + lda;
    FLOAT *c2 = c1 + lda;
    FLOAT *c3 = c2 + lda;
    BLASLONG k;

    for (k = 0; k < n; k++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* standard complex multiply-accumulate, one term per column */
        y[0] += c0[0] * x[0] - c0[1] * x[1];
        y[1] += c0[0] * x[1] + c0[1] * x[0];
        y[0] += c1[0] * x[2] - c1[1] * x[3];
        y[1] += c1[0] * x[3] + c1[1] * x[2];
        y[0] += c2[0] * x[4] - c2[1] * x[5];
        y[1] += c2[0] * x[5] + c2[1] * x[4];
        y[0] += c3[0] * x[6] - c3[1] * x[7];
        y[1] += c3[0] * x[7] + c3[1] * x[6];
#else
        /* conjugated variant */
        y[0] += c0[0] * x[0] + c0[1] * x[1];
        y[1] += c0[0] * x[1] - c0[1] * x[0];
        y[0] += c1[0] * x[2] + c1[1] * x[3];
        y[1] += c1[0] * x[3] - c1[1] * x[2];
        y[0] += c2[0] * x[4] + c2[1] * x[5];
        y[1] += c2[0] * x[5] - c2[1] * x[4];
        y[0] += c3[0] * x[6] + c3[1] * x[7];
        y[1] += c3[0] * x[7] - c3[1] * x[6];
#endif
        c0 += 2;
        c1 += 2;
        c2 += 2;
        c3 += 2;
        y += 2;
    }
}
251 
252 #endif
253 
254 #ifdef  HAVE_KERNEL_4x2_VEC
255 
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* VSX kernel: y += A[:,0:2] * x[0:2] for n complex double entries.
     * Same scheme as the 4x4 kernel: x is pre-splatted into real/imag
     * vectors (with conjugation signs folded in), the real pass runs on
     * the element as loaded, the imaginary pass after vec_xxpermdi swaps
     * the (re,im) halves. n is a multiple of 4 (4 complex rows/iter). */
    BLASLONG i;
    FLOAT *a0, *a1;
    a0 = ap;
    a1 = ap + lda;


#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};
    register __vector double vx1_r = {x[2], x[2]};
    register __vector double vx1_i = {-x[3], x[3]};

#else
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
    register __vector double vx1_r = {x[2], -x[2]};
    register __vector double vx1_i = {x[3], x[3]};
#endif


    /* one __vector double per complex element */
    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;
    register __vector double *vptr_a1 = (__vector double *) a1;

    for (i = 0; i < n; i += 4) {

        register __vector double vy_0 = vy[i];
        register __vector double vy_1 = vy[i + 1];
        register __vector double vy_2 = vy[i + 2];
        register __vector double vy_3 = vy[i + 3];

        register __vector double va0 = vptr_a0[i];
        register __vector double va0_1 = vptr_a0[i + 1];
        register __vector double va0_2 = vptr_a0[i + 2];
        register __vector double va0_3 = vptr_a0[i + 3];

        register __vector double va1 = vptr_a1[i];
        register __vector double va1_1 = vptr_a1[i + 1];
        register __vector double va1_2 = vptr_a1[i + 2];
        register __vector double va1_3 = vptr_a1[i + 3];

        /* real-splat pass, column 0 */
        vy_0 += va0*vx0_r;
        vy_1 += va0_1*vx0_r;
        vy_2 += va0_2*vx0_r;
        vy_3 += va0_3*vx0_r;

        /* swap (re,im) halves for the imaginary pass */
        va0 = vec_xxpermdi(va0, va0, 2);
        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);

        /* real-splat pass, column 1 */
        vy_0 += va1*vx1_r;
        vy_1 += va1_1*vx1_r;
        vy_2 += va1_2*vx1_r;
        vy_3 += va1_3*vx1_r;

        va1 = vec_xxpermdi(va1, va1, 2);
        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);
        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);

        /* imaginary-splat passes */
        vy_0 += va0*vx0_i;
        vy_1 += va0_1*vx0_i;
        vy_2 += va0_2*vx0_i;
        vy_3 += va0_3*vx0_i;

        vy_0 += va1*vx1_i;
        vy_1 += va1_1*vx1_i;
        vy_2 += va1_2*vx1_i;
        vy_3 += va1_3*vx1_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;

    }
}
336 #else
337 
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* Portable scalar fallback: y += a0*x[0] + a1*x[1] over n complex
     * entries (interleaved re,im). lda is the column stride in FLOATs. */
    FLOAT *c0 = ap;
    FLOAT *c1 = ap + lda;
    BLASLONG k;

    for (k = 0; k < n; k++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[0] += c0[0] * x[0] - c0[1] * x[1];
        y[1] += c0[0] * x[1] + c0[1] * x[0];
        y[0] += c1[0] * x[2] - c1[1] * x[3];
        y[1] += c1[0] * x[3] + c1[1] * x[2];
#else
        y[0] += c0[0] * x[0] + c0[1] * x[1];
        y[1] += c0[0] * x[1] - c0[1] * x[0];
        y[0] += c1[0] * x[2] + c1[1] * x[3];
        y[1] += c1[0] * x[3] - c1[1] * x[2];
#endif
        c0 += 2;
        c1 += 2;
        y += 2;
    }
}
358 
359 #endif
360 
361 #ifdef  HAVE_KERNEL_4x1_VEC
362 
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* VSX kernel: y += A[:,0] * x[0] for n complex double entries.
     * Single-column variant of the scheme used by the 4x4/4x2 kernels;
     * here the swapped copy of each element is kept in a separate
     * register so real and imaginary passes fuse into one expression.
     * n is a multiple of 4 (4 complex rows per iteration). */
    BLASLONG i;
    FLOAT *a0;
    a0 = ap;


#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register __vector double vx0_r = {x[0], x[0]};
    register __vector double vx0_i = {-x[1], x[1]};

#else
    register __vector double vx0_r = {x[0], -x[0]};
    register __vector double vx0_i = {x[1], x[1]};
#endif


    /* one __vector double per complex element */
    register __vector double *vy = (__vector double *) y;
    register __vector double *vptr_a0 = (__vector double *) a0;

    for (i = 0; i < n; i += 4) {

        register __vector double vy_0 = vy[i];
        register __vector double vy_1 = vy[i + 1];
        register __vector double vy_2 = vy[i + 2];
        register __vector double vy_3 = vy[i + 3];

        register __vector double va0 = vptr_a0[i];
        register __vector double va0_1 = vptr_a0[i + 1];
        register __vector double va0_2 = vptr_a0[i + 2];
        register __vector double va0_3 = vptr_a0[i + 3];

        /* va0x* = (im,re) swapped copies for the imaginary-splat term */
        register __vector double va0x = vec_xxpermdi(va0, va0, 2);
        register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
        register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
        register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
        vy_0 += va0*vx0_r + va0x*vx0_i;
        vy_1 += va0_1*vx0_r +  va0x_1*vx0_i;
        vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
        vy_3 += va0_3*vx0_r + va0x_3*vx0_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;

    }
}
411 
412 #else
413 
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* Portable scalar fallback: y += a0 * x[0] over n complex entries
     * (interleaved re,im). */
    FLOAT *col = ap;
    BLASLONG k;

    for (k = 0; k < n; k++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[0] += col[0] * x[0] - col[1] * x[1];
        y[1] += col[0] * x[1] + col[1] * x[0];
#else
        y[0] += col[0] * x[0] + col[1] * x[1];
        y[1] += col[0] * x[1] - col[1] * x[0];
#endif
        col += 2;
        y += 2;
    }
}
430 
431 #endif
432 
433 #ifdef HAVE_KERNEL_ADDY
434 
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    /* dest += alpha * src for n complex double entries (VSX).
     * src is the contiguous work buffer; dest may be strided.
     * inc_dest is in FLOAT (real) units, i.e. already 2*inc_y;
     * n is a multiple of 4 (the loops process 4 complex entries/pass).
     * Same splat-and-swap complex multiply as the gemv kernels, with
     * XCONJ selecting the conjugated sign placement for alpha. */
    BLASLONG i;


#if   !defined(XCONJ)

    register __vector double valpha_r = {alpha_r, alpha_r};
    register __vector double valpha_i = {-alpha_i, alpha_i};

#else
    register __vector double valpha_r = {alpha_r, -alpha_r};
    register __vector double valpha_i = {alpha_i, alpha_i};
#endif

    register __vector double *vptr_src = (__vector double *) src;
    if (inc_dest != 2) {
        /* strided destination: keep four pointers, each advanced by
         * 4*inc_dest FLOATs (= add_dest vectors) per pass */
        register __vector double *vptr_y = (__vector double *) dest;
        //note that inc_dest is already 2x. so we should add it to double*
        register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
        register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
        register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);
        BLASLONG dest_t = 0;
        BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4  we just multiply 2 times
        for (i = 0; i < n; i += 4) {

            register __vector double vy_0 = vptr_y[dest_t];
            register __vector double vy_1 = vptr_y1[dest_t];
            register __vector double vy_2 = vptr_y2[dest_t];
            register __vector double vy_3 = vptr_y3[dest_t];

            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            /* real-splat pass */
            vy_0 += vsrc*valpha_r;
            vy_1 += vsrc_1*valpha_r;
            vy_2 += vsrc_2*valpha_r;
            vy_3 += vsrc_3*valpha_r;

            /* swap (re,im) halves for the imaginary pass */
            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc*valpha_i;
            vy_1 += vsrc_1*valpha_i;
            vy_2 += vsrc_2*valpha_i;
            vy_3 += vsrc_3*valpha_i;

            vptr_y[dest_t] = vy_0;
            vptr_y1[dest_t ] = vy_1;
            vptr_y2[dest_t] = vy_2;
            vptr_y3[dest_t] = vy_3;

            dest_t += add_dest;

        }

        return;
    } else {
        /* contiguous destination (inc_dest == 2): plain indexed walk */
        register __vector double *vptr_y = (__vector double *) dest;
        for (i = 0; i < n; i += 4) {

            register __vector double vy_0 = vptr_y[i];
            register __vector double vy_1 = vptr_y[i + 1];
            register __vector double vy_2 = vptr_y[i + 2];
            register __vector double vy_3 = vptr_y[i + 3];

            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            vy_0 += vsrc*valpha_r;
            vy_1 += vsrc_1*valpha_r;
            vy_2 += vsrc_2*valpha_r;
            vy_3 += vsrc_3*valpha_r;

            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc*valpha_i;
            vy_1 += vsrc_1*valpha_i;
            vy_2 += vsrc_2*valpha_i;
            vy_3 += vsrc_3*valpha_i;

            vptr_y[i] = vy_0;
            vptr_y[i + 1 ] = vy_1;
            vptr_y[i + 2] = vy_2;
            vptr_y[i + 3] = vy_3;

        }

        return;
    }
    return;
}
535 
536 #else
537 
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    /* Scalar fallback: dest += alpha * src for n complex entries.
     * src is contiguous; inc_dest is the destination stride in FLOAT
     * (real) units. XCONJ selects the conjugated alpha product. */
    BLASLONG i;

    if (inc_dest != 2) {
        /* strided destination: one complex element at a time */
        FLOAT re;
        FLOAT im;
        for (i = 0; i < n; i++) {
#if !defined(XCONJ)
            re = alpha_r * src[0] - alpha_i * src[1];
            im = alpha_r * src[1] + alpha_i * src[0];
#else
            re = alpha_r * src[0] + alpha_i * src[1];
            im = -alpha_r * src[1] + alpha_i * src[0];
#endif

            dest[0] += re;
            dest[1] += im;

            src += 2;
            dest += inc_dest;
        }
        return;
    }

    /* contiguous destination: four complex entries per outer pass
     * (n is a multiple of 4 for the callers in this file) */
    for (i = 0; i < n; i += 4) {
        BLASLONG j;
        for (j = 0; j < 8; j += 2) {
#if !defined(XCONJ)
            dest[j] += alpha_r * src[j] - alpha_i * src[j + 1];
            dest[j + 1] += alpha_r * src[j + 1] + alpha_i * src[j];
#else
            dest[j] += alpha_r * src[j] + alpha_i * src[j + 1];
            dest[j + 1] += -alpha_r * src[j + 1] + alpha_i * src[j];
#endif
        }
        src += 8;
        dest += 8;
    }
    return;
}
606 #endif
607 
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    /* Double-complex GEMV, no-transpose: y += alpha * A * x.
     *
     * m, n            : matrix dimensions in complex elements
     * dummy1          : unused (kept for the common GEMV kernel signature)
     * alpha_r/alpha_i : complex scaling factor
     * a, lda          : column-major matrix and its leading dimension
     *                   (in complex elements on entry; scaled to FLOAT
     *                   units below)
     * x, inc_x        : input vector and stride (complex elements)
     * y, inc_y        : output vector and stride (complex elements)
     * buffer          : caller-provided scratch of at least NBMAX complex
     *                   elements, used to accumulate unscaled partial sums
     *
     * Strategy: process row panels of up to NBMAX rows. Within a panel,
     * columns are consumed 4 / 2 / 1 at a time by the kernels above into
     * a zeroed buffer, and add_y() then applies alpha and accumulates
     * into y. Leftover rows (m % 4 = 1..3) are handled by dedicated
     * scalar dot-product tails at the end.
     */
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    FLOAT xbuffer[8] __attribute__((aligned(16)));
    FLOAT *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    /* switch from complex-element to FLOAT (real) units */
    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    /* column blocking: n1 groups of 4 columns, n2 leftover columns */
    n1 = n / 4;
    n2 = n % 4;

    /* row blocking: m1 rows in full panels, m2 = size of the final
     * partial panel (still a multiple of 4), m3 = leftover rows (0..3) */
    m3 = m % 4;
    m1 = m - (m % 4);
    m2 = (m % NBMAX) - (m % 4);

    y_ptr = y;

    BLASLONG NB = NBMAX;

    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;            /* last, shorter panel; loop exits after it */
        }

        a_ptr = a;

        x_ptr = x;
        //zero_y(NB,ybuffer);
        /* 16 bytes per complex element — assumes FLOAT is double
         * (this is the z-precision kernel) */
        memset(ybuffer, 0, NB * 16);

        if (inc_x == 2) {
            /* contiguous x: feed it to the kernels directly */

            for (i = 0; i < n1; i++) {
                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);

                a_ptr += lda << 2;      /* advance 4 columns */
                x_ptr += 8;
            }

            if (n2 & 2) {
                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;

            }

            if (n2 & 1) {
                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;

            }
        } else {
            /* strided x: gather the needed elements into xbuffer first */

            for (i = 0; i < n1; i++) {

                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                xbuffer[3] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[4] = x_ptr[0];
                xbuffer[5] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[6] = x_ptr[0];
                xbuffer[7] = x_ptr[1];
                x_ptr += inc_x;

                zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);

                a_ptr += lda << 2;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;

            }

        }

        /* apply alpha and accumulate the panel result into y */
        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a += 2 * NB;            /* next row panel */
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);

    /* ---- tail rows: m3 = 1..3 leftover rows handled as dot products ---- */

    if (m3 == 1) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r = 0.0;
        FLOAT temp_i = 0.0;

        if (lda == 2 && inc_x == 2) {
            /* fully contiguous: 2-way unrolled dot product */

            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif

                a_ptr += 4;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += 2;
                x_ptr += 2;
            }

        } else {
            /* generic strides */

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
        return (0);
    }

    if (m3 == 2) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;

        if (lda == 4 && inc_x == 2) {
            /* contiguous 2-row matrix: 2-way unrolled twin dot product */

            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif

                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += 4;
                x_ptr += 2;
            }

        } else {
            /* generic strides */

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
        return (0);
    }

    if (m3 == 3) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;
        FLOAT temp_r2 = 0.0;
        FLOAT temp_i2 = 0.0;

        if (lda == 6 && inc_x == 2) {
            /* contiguous 3-row matrix */

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += 6;
                x_ptr += 2;
            }

        } else {
            /* generic strides */

            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
        return (0);
    }

    return (0);
}
954 
955