1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "blis1.h"
12 
13 /*
14    Effective computation:
15 
16      a   = a + beta * u + gamma * z;
17      rho = conj(a) * x;
18      w   = w + kappa * a;
19 */
20 
/*
   Single-precision real variant of the fused axpyv2+dot+axpy kernel.
   Not implemented in this build: aborts unconditionally if called.
*/
void bl1_saxpyv2bdotaxpy( int       n,
                          float*    beta,
                          float*    u, int inc_u,
                          float*    gamma,
                          float*    z, int inc_z,
                          float*    a, int inc_a,
                          float*    x, int inc_x,
                          float*    kappa,
                          float*    rho,
                          float*    w, int inc_w )
{
	bl1_abort();
}
34 
35 
bl1_daxpyv2bdotaxpy(int n,double * beta,double * u,int inc_u,double * gamma,double * z,int inc_z,double * a,int inc_a,double * x,int inc_x,double * kappa,double * rho,double * w,int inc_w)36 void bl1_daxpyv2bdotaxpy( int       n,
37                           double*   beta,
38                           double*   u, int inc_u,
39                           double*   gamma,
40                           double*   z, int inc_z,
41                           double*   a, int inc_a,
42                           double*   x, int inc_x,
43                           double*   kappa,
44                           double*   rho,
45                           double*   w, int inc_w )
46 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
47 {
48 	double*   restrict upsilon1;
49 	double*   restrict zeta1;
50 	double*   restrict alpha1;
51 	double*   restrict chi1;
52 	double*   restrict omega1;
53 	double             rho_c;
54 	int                i;
55 	v2df_t             b1v, g1v, k1v;
56 	v2df_t             rhov;
57 	v2df_t             u1v, z1v, a1v;
58 	v2df_t             u2v, z2v, a2v;
59 	v2df_t             x1v, w1v;
60 	v2df_t             x2v, w2v;
61 
62 	int       n_pre;
63 	int       n_run;
64 	int       n_left;
65 
66 	n_pre = 0;
67 	if ( ( unsigned long ) a % 16 != 0 )
68 	{
69 		if ( ( unsigned long ) u % 16 == 0 ||
70 		     ( unsigned long ) z % 16 == 0 ||
71 		     ( unsigned long ) x % 16 == 0 ||
72 		     ( unsigned long ) w % 16 == 0 ) bl1_abort();
73 
74 		n_pre = 1;
75 	}
76 
77 	n_run       = ( n - n_pre ) / 4;
78 	n_left      = ( n - n_pre ) % 4;
79 
80 	upsilon1 = u;
81 	zeta1    = z;
82 	alpha1   = a;
83 	chi1     = x;
84 	omega1   = w;
85 
86 
87 	rho_c   = 0.0;
88 
89 	if ( n_pre == 1 )
90 	{
91 		double   beta_c     = *beta;
92 		double   gamma_c    = *gamma;
93 		double   kappa_c    = *kappa;
94 
95 		double   upsilon1_c = *upsilon1;
96 		double   zeta1_c    = *zeta1;
97 		double   alpha1_c   = *alpha1;
98 		double   chi1_c     = *chi1;
99 		double   omega1_c   = *omega1;
100 
101 		alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
102 		rho_c += alpha1_c * chi1_c;
103 		omega1_c += kappa_c * alpha1_c;
104 
105 		*alpha1 = alpha1_c;
106 		*omega1 = omega1_c;
107 
108 		upsilon1 += inc_u;
109 		zeta1    += inc_z;
110 		alpha1   += inc_a;
111 		chi1     += inc_x;
112 		omega1   += inc_w;
113 	}
114 
115 	b1v.v = _mm_loaddup_pd( ( double* )beta );
116 	g1v.v = _mm_loaddup_pd( ( double* )gamma );
117 	k1v.v = _mm_loaddup_pd( ( double* )kappa );
118 
119 	rhov.v = _mm_setzero_pd();
120 
121 	for ( i = 0; i < n_run; ++i )
122 	{
123 		u1v.v = _mm_load_pd( ( double* )upsilon1 );
124 		z1v.v = _mm_load_pd( ( double* )zeta1 );
125 		a1v.v = _mm_load_pd( ( double* )alpha1 );
126 
127 		a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128 
129 		u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130 		z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131 		a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132 
133 		a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134 
135 		x1v.v = _mm_load_pd( ( double* )chi1 );
136 		x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137 
138 		w1v.v = _mm_load_pd( ( double* )omega1 );
139 		w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140 
141 		rhov.v += a1v.v * x1v.v;
142 		rhov.v += a2v.v * x2v.v;
143 
144 		w1v.v += k1v.v * a1v.v;
145 		w2v.v += k1v.v * a2v.v;
146 
147 		_mm_store_pd( ( double* )alpha1, a1v.v );
148 		_mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149 
150 		_mm_store_pd( ( double* )omega1, w1v.v );
151 		_mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152 
153 
154 		upsilon1 += 4;
155 		zeta1    += 4;
156 		alpha1   += 4;
157 		chi1     += 4;
158 		omega1   += 4;
159 	}
160 
161 	rho_c += rhov.d[0] + rhov.d[1];
162 
163 	if ( n_left > 0 )
164 	{
165 		double beta_c  = *beta;
166 		double gamma_c = *gamma;
167 		double kappa_c = *kappa;
168 
169 		for ( i = 0; i < n_left; ++i )
170 		{
171 			double   upsilon1_c = *upsilon1;
172 			double   zeta1_c    = *zeta1;
173 			double   alpha1_c   = *alpha1;
174 			double   chi1_c     = *chi1;
175 			double   omega1_c   = *omega1;
176 
177 			alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
178 			rho_c += alpha1_c * chi1_c;
179 			omega1_c += kappa_c * alpha1_c;
180 
181 			*alpha1 = alpha1_c;
182 			*omega1 = omega1_c;
183 
184 			upsilon1 += inc_u;
185 			zeta1    += inc_z;
186 			alpha1   += inc_a;
187 			chi1     += inc_x;
188 			omega1   += inc_w;
189 		}
190 	}
191 
192 	*rho = rho_c;
193 }
194 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
195 {
196 	double*   restrict upsilon1;
197 	double*   restrict zeta1;
198 	double*   restrict alpha1;
199 	double*   restrict chi1;
200 	double*   restrict omega1;
201 	double             beta_c;
202 	double             gamma_c;
203 	double             kappa_c;
204 	double             rho_c;
205 	int                i;
206 
207 	int       n_pre;
208 	int       n_run;
209 	int       n_left;
210 
211 	n_pre = 0;
212 	//if ( ( unsigned long ) a % 16 != 0 )
213 	//{
214 	//	if ( ( unsigned long ) u % 16 == 0 ||
215 	//	     ( unsigned long ) z % 16 == 0 ||
216 	//	     ( unsigned long ) x % 16 == 0 ||
217 	//	     ( unsigned long ) w % 16 == 0 ) bl1_abort();
218 	//
219 	//	n_pre = 1;
220 	//}
221 
222 	n_run       = ( n - n_pre ) / 2;
223 	n_left      = ( n - n_pre ) % 2;
224 
225 	upsilon1 = u;
226 	zeta1    = z;
227 	alpha1   = a;
228 	chi1     = x;
229 	omega1   = w;
230 
231 	beta_c  = *beta;
232 	gamma_c = *gamma;
233 	kappa_c = *kappa;
234 
235 	rho_c = 0.0;
236 
237 	if ( n_pre == 1 )
238 	{
239 		double   upsilon1_c = *upsilon1;
240 		double   zeta1_c    = *zeta1;
241 		double   alpha1_c   = *alpha1;
242 		double   chi1_c     = *chi1;
243 		double   omega1_c   = *omega1;
244 
245 		alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
246 		rho_c += alpha1_c * chi1_c;
247 		omega1_c += kappa_c * alpha1_c;
248 
249 		*alpha1 = alpha1_c;
250 		*omega1 = omega1_c;
251 
252 		upsilon1 += inc_u;
253 		zeta1    += inc_z;
254 		alpha1   += inc_a;
255 		chi1     += inc_x;
256 		omega1   += inc_w;
257 	}
258 
259 	for ( i = 0; i < n_run; ++i )
260 	{
261 		double   upsilon1_c = *upsilon1;
262 		double   upsilon2_c = *(upsilon1 + 1);
263 		double   zeta1_c    = *zeta1;
264 		double   zeta2_c    = *(zeta1 + 1);
265 		double   alpha1_c   = *alpha1;
266 		double   alpha2_c   = *(alpha1 + 1);
267 		double   chi1_c     = *chi1;
268 		double   chi2_c     = *(chi1 + 1);
269 		double   omega1_c   = *omega1;
270 		double   omega2_c   = *(omega1 + 1);
271 
272 		// alpha1 += beta * upsilon1 + gamma * zeta1;
273 		alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
274 		alpha2_c += beta_c * upsilon2_c + gamma_c * zeta2_c;
275 
276 		// rho += conj(alpha1) * chi1 +
277 		//        conj(alpha2) * chi2;
278 		rho_c += alpha1_c * chi1_c + alpha2_c * chi2_c;
279 
280 		// omega1 += kappa * alpha1;
281 		omega1_c += kappa_c * alpha1_c;
282 		omega2_c += kappa_c * alpha2_c;
283 
284 		*alpha1       = alpha1_c;
285 		*(alpha1 + 1) = alpha2_c;
286 		*omega1       = omega1_c;
287 		*(omega1 + 1) = omega2_c;
288 
289 		upsilon1 += 2*inc_u;
290 		zeta1    += 2*inc_z;
291 		alpha1   += 2*inc_a;
292 		chi1     += 2*inc_x;
293 		omega1   += 2*inc_w;
294 	}
295 
296 	if ( n_left > 0 )
297 	{
298 
299 		for ( i = 0; i < n_left; ++i )
300 		{
301 			double   upsilon1_c = *upsilon1;
302 			double   zeta1_c    = *zeta1;
303 			double   alpha1_c   = *alpha1;
304 			double   chi1_c     = *chi1;
305 			double   omega1_c   = *omega1;
306 
307 			alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
308 			rho_c += alpha1_c * chi1_c;
309 			omega1_c += kappa_c * alpha1_c;
310 
311 			*alpha1 = alpha1_c;
312 			*omega1 = omega1_c;
313 
314 			upsilon1 += inc_u;
315 			zeta1    += inc_z;
316 			alpha1   += inc_a;
317 			chi1     += inc_x;
318 			omega1   += inc_w;
319 		}
320 	}
321 
322 	*rho = rho_c;
323 }
324 #endif
325 
326 
/*
   Single-precision complex variant of the fused axpyv2+dot+axpy kernel.
   Not implemented in this build: aborts unconditionally if called.
*/
void bl1_caxpyv2bdotaxpy( int       n,
                          scomplex* beta,
                          scomplex* u, int inc_u,
                          scomplex* gamma,
                          scomplex* z, int inc_z,
                          scomplex* a, int inc_a,
                          scomplex* x, int inc_x,
                          scomplex* kappa,
                          scomplex* rho,
                          scomplex* w, int inc_w )
{
	bl1_abort();
}
340 
341 
/*
   bl1_zaxpyv2bdotaxpy(): double-precision complex fused kernel
   computing, in a single pass over the vectors,

     a   = a + beta * u + gamma * z;
     rho = conj(a) * x;
     w   = w + kappa * a;

   beta, gamma, kappa point to scalars. u, z, a, x, w are vectors of
   length n with strides inc_u, inc_z, inc_a, inc_x, inc_w. a and w are
   updated in place; the dot product is written to *rho.
*/
void bl1_zaxpyv2bdotaxpy( int       n,
                          dcomplex* beta,
                          dcomplex* u, int inc_u,
                          dcomplex* gamma,
                          dcomplex* z, int inc_z,
                          dcomplex* a, int inc_a,
                          dcomplex* x, int inc_x,
                          dcomplex* kappa,
                          dcomplex* rho,
                          dcomplex* w, int inc_w )
#if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
{
	dcomplex* restrict upsilon1;
	dcomplex* restrict zeta1;
	dcomplex* restrict alpha1;
	dcomplex* restrict chi1;
	dcomplex* restrict omega1;
	int                i;

	v2df_t    rho1v;
	v2df_t    a11v, a12v;
	v2df_t    x1v, x1rv;
	v2df_t    w1v;
	v2df_t    acbc, bdad;
	v2df_t    adac, bcbd;

	v2df_t    a1v, a1rv;
	v2df_t    u1v, u1rv;
	v2df_t    z1v, z1rv;
	v2df_t    beta11v, gamma11v, kappa11v;
	v2df_t    beta12v, gamma12v, kappa12v;

	upsilon1 = u;
	zeta1    = z;
	alpha1   = a;
	chi1     = x;
	omega1   = w;

	// Each SSE register holds exactly one dcomplex as [real, imag],
	// loaded with a single aligned load, so this path only supports
	// unit strides.
	// NOTE(review): _mm_load_pd/_mm_store_pd also require every
	// dcomplex pointer (including rho) to be 16-byte aligned; that is
	// not checked here — presumably guaranteed by the allocators in the
	// callers. Confirm before reusing this kernel elsewhere.
	if ( inc_u != 1 ||
	     inc_z != 1 ||
	     inc_a != 1 ||
	     inc_x != 1 ||
	     inc_w != 1 ) bl1_abort();


	// Broadcast the real and imaginary parts of each scalar into both
	// lanes of separate registers, for the addsub-based complex
	// multiplies in the loop.
	beta11v.v  = _mm_loaddup_pd( ( double* )&(beta->real) );
	beta12v.v  = _mm_loaddup_pd( ( double* )&(beta->imag) );
	gamma11v.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
	gamma12v.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
	kappa11v.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
	kappa12v.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );

	rho1v.v = _mm_setzero_pd();

	for ( i = 0; i < n; ++i )
	{
		//alpha_c = *alpha1;
		a1v.v  = _mm_load_pd( ( double* )alpha1 );

		// Complex multiply-accumulate a += beta * u via SSE3 addsub:
		// with s = [s.re, s.re], sr = [s.im, s.im], v = [v.re, v.im],
		// vr = [v.im, v.re], addsub(s*v, sr*vr) yields
		// [s.re*v.re - s.im*v.im, s.re*v.im + s.im*v.re].
		//alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
		//alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;
		u1v.v  = _mm_load_pd( ( double* )upsilon1 );
		u1rv.v = _mm_shuffle_pd( u1v.v, u1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = beta11v.v * u1v.v;
		bdad.v = beta12v.v * u1rv.v;
		a1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// Same pattern: a += gamma * z.
		//alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
		//alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;
		z1v.v  = _mm_load_pd( ( double* )zeta1 );
		z1rv.v = _mm_shuffle_pd( z1v.v, z1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = gamma11v.v * z1v.v;
		bdad.v = gamma12v.v * z1rv.v;
		a1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// Write back the updated a before it feeds the dot product and
		// the w update below.
		//*alpha1 = alpha1_c;
		_mm_store_pd( ( double* )alpha1, a1v.v );

		// rho += conj(a) * x. The addsub arrangement here produces the
		// result with lanes reversed — rho1v accumulates as
		// [rho.imag, rho.real] — which is undone by the shuffle after
		// the loop.
		//rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
		//rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
		x1v.v  = _mm_load_pd( ( double* )chi1 );
		x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
		a11v.v = a1v.v;
		a12v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (1,1) );
		a11v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (0,0) );
		adac.v = a11v.v * x1rv.v;
		bcbd.v = a12v.v * x1v.v;
		rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );

		//omega_c = *omega1;
		w1v.v  = _mm_load_pd( ( double* )omega1 );

		// w += kappa * a (same addsub complex-multiply pattern).
		//omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
		//omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
		a1rv.v = _mm_shuffle_pd( a1v.v, a1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = kappa11v.v * a1v.v;
		bdad.v = kappa12v.v * a1rv.v;
		w1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// *omega1 = omega1_c;
		_mm_store_pd( ( double* )omega1, w1v.v );


		upsilon1 += 1;
		zeta1    += 1;
		alpha1   += 1;
		chi1     += 1;
		omega1   += 1;
	}

	// rho1v was accumulated as [imag, real]; swap lanes back into
	// [real, imag] order before storing.
	rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );

	//rho->real = rho_c.real;
	//rho->imag = rho_c.imag;
	_mm_store_pd( ( double* )rho, rho1v.v );
}
#elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
{
	dcomplex* restrict upsilon1;
	dcomplex* restrict zeta1;
	dcomplex* restrict alpha1;
	dcomplex* restrict chi1;
	dcomplex* restrict omega1;
	dcomplex           beta_c;
	dcomplex           gamma_c;
	dcomplex           kappa_c;
	dcomplex           rho_c;
	int                i;

	// Scalar reference implementation; supports arbitrary strides.
	upsilon1 = u;
	zeta1    = z;
	alpha1   = a;
	chi1     = x;
	omega1   = w;

	rho_c.real = 0.0;
	rho_c.imag = 0.0;

	beta_c  = *beta;
	gamma_c = *gamma;
	kappa_c = *kappa;

	for ( i = 0; i < n; ++i )
	{
		dcomplex upsilon1_c = *upsilon1;
		dcomplex zeta1_c    = *zeta1;
		dcomplex alpha1_c   = *alpha1;
		dcomplex chi1_c     = *chi1;
		dcomplex omega1_c   = *omega1;

		// alpha1 += beta * upsilon1;
		alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
		alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;

		// alpha1 += gamma * zeta1;
		alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
		alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;

		// rho += conj(alpha1) * chi1;  (note the negated imag terms)
		rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
		rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;

		// omega1 += kappa * alpha1;
		omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
		omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;

		*alpha1 = alpha1_c;
		*omega1 = omega1_c;

		upsilon1 += inc_u;
		zeta1    += inc_z;
		alpha1   += inc_a;
		chi1     += inc_x;
		omega1   += inc_w;
	}

	rho->real = rho_c.real;
	rho->imag = rho_c.imag;
}
#endif
527 
528