1 /***************************************************************************
2 Copyright (c) 2014, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include "common.h"
31
32
33 #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
34 #include "zgemv_n_microk_haswell-4.c"
35 #elif defined(SANDYBRIDGE)
36 #include "zgemv_n_microk_sandy-4.c"
37 #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
38 #include "zgemv_n_microk_bulldozer-4.c"
39 #endif
40
41 #define NBMAX 1024
42
43 #ifndef HAVE_KERNEL_4x4
44
/*
 * Portable C kernel (used when HAVE_KERNEL_4x4 is not defined): adds the
 * contribution of four matrix columns to the accumulator y.
 *
 *   n  - number of complex rows processed in each column and in y
 *   ap - ap[0..3] are the four column pointers; each column stores
 *        interleaved (real, imaginary) FLOAT pairs
 *   x  - eight FLOATs read as four (real, imaginary) multipliers, one per
 *        column
 *   y  - n interleaved (real, imaginary) pairs, updated in place
 *
 * When exactly one of CONJ / XCONJ is defined, each term added is
 * conj(a) * x; otherwise it is the plain complex product a * x.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	const FLOAT *const cols[4] = { ap[0], ap[1], ap[2], ap[3] };
	BLASLONG row;
	int c;

	for (row = 0; row < n; row++) {
		const BLASLONG re = 2 * row;
		const BLASLONG im = re + 1;

		/* Columns are applied in order 0..3, real part before imaginary. */
		for (c = 0; c < 4; c++) {
			const FLOAT *col = cols[c];
			const FLOAT *xc = x + 2 * c;

#if defined(CONJ) != defined(XCONJ)
			y[re] += col[re] * xc[0] + col[im] * xc[1];
			y[im] += col[re] * xc[1] - col[im] * xc[0];
#else
			y[re] += col[re] * xc[0] - col[im] * xc[1];
			y[im] += col[re] * xc[1] + col[im] * xc[0];
#endif
		}
	}
}
77
78 #endif
79
80
81
82 #ifndef HAVE_KERNEL_4x2
83
/*
 * Portable C kernel (used when HAVE_KERNEL_4x2 is not defined): adds the
 * contribution of two matrix columns to the accumulator y.
 *
 *   n  - number of complex rows processed in each column and in y
 *   ap - ap[0] and ap[1] are the two column pointers; each column stores
 *        interleaved (real, imaginary) FLOAT pairs
 *   x  - four FLOATs read as two (real, imaginary) multipliers, one per
 *        column
 *   y  - n interleaved (real, imaginary) pairs, updated in place
 *
 * When exactly one of CONJ / XCONJ is defined, each term added is
 * conj(a) * x; otherwise it is the plain complex product a * x.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	const FLOAT *const cols[2] = { ap[0], ap[1] };
	BLASLONG row;
	int c;

	for (row = 0; row < n; row++) {
		const BLASLONG re = 2 * row;
		const BLASLONG im = re + 1;

		/* Column 0 is applied before column 1, real part before imaginary. */
		for (c = 0; c < 2; c++) {
			const FLOAT *col = cols[c];
			const FLOAT *xc = x + 2 * c;

#if defined(CONJ) != defined(XCONJ)
			y[re] += col[re] * xc[0] + col[im] * xc[1];
			y[im] += col[re] * xc[1] - col[im] * xc[0];
#else
			y[re] += col[re] * xc[0] - col[im] * xc[1];
			y[im] += col[re] * xc[1] + col[im] * xc[0];
#endif
		}
	}
}
106
107 #endif
108
109
110
111
112 #ifndef HAVE_KERNEL_4x1
113
114
/*
 * Portable C kernel (used when HAVE_KERNEL_4x1 is not defined): adds the
 * contribution of a single matrix column to the accumulator y.
 *
 *   n  - number of complex rows processed in the column and in y
 *   ap - the column, stored as interleaved (real, imaginary) FLOAT pairs
 *   x  - x[0] / x[1] are the real / imaginary part of the multiplier
 *   y  - n interleaved (real, imaginary) pairs, updated in place
 *
 * When exactly one of CONJ / XCONJ is defined, each term added is
 * conj(a) * x; otherwise it is the plain complex product a * x.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
	const FLOAT *col = ap;
	FLOAT *out = y;
	BLASLONG remaining;

	/* Walk the column and the accumulator one complex element at a time. */
	for (remaining = n; remaining > 0; remaining--) {
#if defined(CONJ) != defined(XCONJ)
		out[0] += col[0] * x[0] + col[1] * x[1];
		out[1] += col[0] * x[1] - col[1] * x[0];
#else
		out[0] += col[0] * x[0] - col[1] * x[1];
		out[1] += col[0] * x[1] + col[1] * x[0];
#endif
		col += 2;
		out += 2;
	}
}
133
134
135 #endif
136
137
138 #ifndef HAVE_KERNEL_ADDY
139
140 static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
141
/*
 * Scale the n complex values in src by alpha and add them to dest.
 *
 *   n        - number of complex elements
 *   src      - n contiguous interleaved (real, imaginary) FLOAT pairs
 *   dest     - destination vector, updated in place
 *   inc_dest - distance between consecutive destination elements, counted
 *              in FLOATs (2 means dest is contiguous)
 *   alpha_r, alpha_i - real / imaginary part of the scale factor
 *
 * Without XCONJ each element receives alpha * src; with XCONJ it receives
 * alpha * conj(src).
 *
 * The contiguous path is unrolled by four elements.  A scalar tail loop
 * handles the last n % 4 elements so that the function never touches memory
 * beyond n elements, whatever the value of n.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
{
	BLASLONG i;

	if (inc_dest != 2) {
		/* Strided destination: one complex element per iteration. */
		FLOAT temp_r;
		FLOAT temp_i;

		for (i = 0; i < n; i++) {
#if !defined(XCONJ)
			temp_r = alpha_r * src[0] - alpha_i * src[1];
			temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
			temp_r = alpha_r * src[0] + alpha_i * src[1];
			temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

			*dest += temp_r;
			*(dest + 1) += temp_i;

			src += 2;
			dest += inc_dest;
		}
		return;
	}

	/* Contiguous destination: whole groups of four complex elements. */
	for (i = 0; i + 4 <= n; i += 4) {
		FLOAT temp_r0, temp_i0;
		FLOAT temp_r1, temp_i1;
		FLOAT temp_r2, temp_i2;
		FLOAT temp_r3, temp_i3;

#if !defined(XCONJ)
		temp_r0 = alpha_r * src[0] - alpha_i * src[1];
		temp_i0 = alpha_r * src[1] + alpha_i * src[0];
		temp_r1 = alpha_r * src[2] - alpha_i * src[3];
		temp_i1 = alpha_r * src[3] + alpha_i * src[2];
		temp_r2 = alpha_r * src[4] - alpha_i * src[5];
		temp_i2 = alpha_r * src[5] + alpha_i * src[4];
		temp_r3 = alpha_r * src[6] - alpha_i * src[7];
		temp_i3 = alpha_r * src[7] + alpha_i * src[6];
#else
		temp_r0 = alpha_r * src[0] + alpha_i * src[1];
		temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
		temp_r1 = alpha_r * src[2] + alpha_i * src[3];
		temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
		temp_r2 = alpha_r * src[4] + alpha_i * src[5];
		temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
		temp_r3 = alpha_r * src[6] + alpha_i * src[7];
		temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
#endif

		dest[0] += temp_r0;
		dest[1] += temp_i0;
		dest[2] += temp_r1;
		dest[3] += temp_i1;
		dest[4] += temp_r2;
		dest[5] += temp_i2;
		dest[6] += temp_r3;
		dest[7] += temp_i3;

		src += 8;
		dest += 8;
	}

	/* Tail: the remaining n % 4 elements, one at a time. */
	for (; i < n; i++) {
		FLOAT temp_r;
		FLOAT temp_i;

#if !defined(XCONJ)
		temp_r = alpha_r * src[0] - alpha_i * src[1];
		temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
		temp_r = alpha_r * src[0] + alpha_i * src[1];
		temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

		dest[0] += temp_r;
		dest[1] += temp_i;

		src += 2;
		dest += 2;
	}
}
215
216 #endif
217
/*
 * Handle the last `rows` (1..3) rows of the matrix, i.e. the rows left over
 * once the row count has been rounded down to a multiple of four.
 *
 *   rows   - number of rows to process (at most 3)
 *   n      - number of matrix columns
 *   a      - first of the remaining rows in column 0
 *   lda    - distance between consecutive columns, counted in FLOATs
 *   x      - first element of x
 *   inc_x  - distance between consecutive x elements, counted in FLOATs
 *   y      - first of the remaining y elements
 *   inc_y  - distance between consecutive y elements, counted in FLOATs
 *
 * Each row keeps its own accumulator, which sums the column contributions in
 * column order; the scaled result is then added to y.
 */
static void zgemv_n_tail_rows(BLASLONG rows, BLASLONG n, FLOAT alpha_r, FLOAT alpha_i,
                              FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x,
                              FLOAT *y, BLASLONG inc_y)
{
	FLOAT temp_r[3] = { 0.0, 0.0, 0.0 };
	FLOAT temp_i[3] = { 0.0, 0.0, 0.0 };
	BLASLONG j;
	BLASLONG r;

	for (j = 0; j < n; j++) {
		for (r = 0; r < rows; r++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r[r] += a[2 * r] * x[0] - a[2 * r + 1] * x[1];
			temp_i[r] += a[2 * r] * x[1] + a[2 * r + 1] * x[0];
#else
			temp_r[r] += a[2 * r] * x[0] + a[2 * r + 1] * x[1];
			temp_i[r] += a[2 * r] * x[1] - a[2 * r + 1] * x[0];
#endif
		}
		a += lda;
		x += inc_x;
	}

	for (r = 0; r < rows; r++) {
#if !defined(XCONJ)
		y[0] += alpha_r * temp_r[r] - alpha_i * temp_i[r];
		y[1] += alpha_r * temp_i[r] + alpha_i * temp_r[r];
#else
		y[0] += alpha_r * temp_r[r] + alpha_i * temp_i[r];
		y[1] -= alpha_r * temp_i[r] - alpha_i * temp_r[r];
#endif
		y += inc_y;
	}
}

/*
 * Complex matrix-vector update y += alpha * op(A) * op(x) for a matrix A
 * with m rows and n columns whose columns are lda complex elements apart.
 * Which operands are conjugated depends on the CONJ / XCONJ build macros.
 *
 *   m, n             - matrix dimensions; nothing is done if either is < 1
 *   dummy1           - unused
 *   alpha_r, alpha_i - real / imaginary part of the scale factor
 *   a, lda           - matrix data (interleaved real/imaginary FLOATs) and
 *                      column stride in complex elements
 *   x, inc_x         - input vector and its stride in complex elements
 *   y, inc_y         - vector updated in place and its stride in complex
 *                      elements
 *   buffer           - scratch space; up to NBMAX complex elements are used
 *
 * Always returns 0.
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of four).
 * For each block the products are accumulated into the scratch buffer four
 * columns at a time, then add_y() scales the block by alpha and adds it to
 * y.  The final m % 4 rows are handled by zgemv_n_tail_rows().
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	FLOAT *ap[4];
	FLOAT xbuffer[8];
	FLOAT *ybuffer = buffer;
	FLOAT *y_ptr = y;
	BLASLONG i;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	/* From here on all strides are counted in FLOATs, not complex elements. */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;

	const BLASLONG lda4 = 4 * lda;
	const BLASLONG n1   = n / 4;   /* number of full groups of four columns */
	const BLASLONG n2   = n % 4;   /* leftover columns */
	const BLASLONG m3   = m % 4;   /* leftover rows, handled after the blocks */

	BLASLONG rows_left = m - m3;   /* multiple of four */

	while ( rows_left > 0 )
	{
		const BLASLONG NB = ( rows_left < NBMAX ) ? rows_left : NBMAX;
		FLOAT *a_ptr = a;
		FLOAT *x_ptr = x;

		ap[0] = a_ptr;
		ap[1] = ap[0] + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		/* Clear NB complex accumulators (two FLOATs each). */
		memset(ybuffer, 0, 2 * NB * sizeof(FLOAT));

		if ( inc_x == 2 )
		{
			/* Contiguous x: the kernels can read x in place. */
			for ( i = 0; i < n1; i++ )
			{
				zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
				x_ptr += 8;
			}

			if ( n2 & 2 )
			{
				zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
				x_ptr += 4;
				a_ptr += 2 * lda;
			}

			if ( n2 & 1 )
			{
				/* a_ptr has tracked ap[0] and now points at the last column. */
				zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
			}
		}
		else
		{
			/* Strided x: gather the needed entries into xbuffer first. */
			for ( i = 0; i < n1; i++ )
			{
				xbuffer[0] = x_ptr[0];
				xbuffer[1] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				xbuffer[3] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[4] = x_ptr[0];
				xbuffer[5] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[6] = x_ptr[0];
				xbuffer[7] = x_ptr[1];
				x_ptr += inc_x;

				zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for ( i = 0; i < n2; i++ )
			{
				xbuffer[0] = x_ptr[0];
				xbuffer[1] = x_ptr[1];
				x_ptr += inc_x;
				zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
				a_ptr += lda;
			}
		}

		add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);

		/* Advance to the next block of rows. */
		a         += 2 * NB;
		y_ptr     += NB * inc_y;
		rows_left -= NB;
	}

	if ( m3 != 0 )
	{
		zgemv_n_tail_rows(m3, n, alpha_r, alpha_i, a, lda, x, inc_x, y_ptr, inc_y);
	}

	return(0);
}
625
626
627