1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "FLAME.h"
12 
13 /*
14    Effective computation:
15 
16      rho = conj(x) * u;
17      y   = y - alpha * x;
18      z   = z - beta  * x;
19 */
20 
bl1_sdotaxmyv2(int n,float * alpha,float * beta,float * x,int inc_x,float * u,int inc_u,float * rho,float * y,int inc_y,float * z,int inc_z)21 void bl1_sdotaxmyv2( int       n,
22                      float*    alpha,
23                      float*    beta,
24                      float*    x, int inc_x,
25                      float*    u, int inc_u,
26                      float*    rho,
27                      float*    y, int inc_y,
28                      float*    z, int inc_z )
29 {
30 	bl1_abort();
31 }
32 
33 
bl1_ddotaxmyv2(int n,double * alpha,double * beta,double * x,int inc_x,double * u,int inc_u,double * rho,double * y,int inc_y,double * z,int inc_z)34 void bl1_ddotaxmyv2( int       n,
35                      double*   alpha,
36                      double*   beta,
37                      double*   x, int inc_x,
38                      double*   u, int inc_u,
39                      double*   rho,
40                      double*   y, int inc_y,
41                      double*   z, int inc_z )
42 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
43 {
44 	double*   restrict chi1;
45 	double*   restrict upsilon1;
46 	double*   restrict psi1;
47 	double*   restrict zeta1;
48 	double    rho_c;
49 	int       i;
50 
51 	int       n_pre;
52 	int       n_run;
53 	int       n_left;
54 
55 	v2df_t    a1v, b1v;
56 	v2df_t    rho1v;
57 	v2df_t    x1v, u1v, y1v, z1v;
58 
59 	if ( inc_x != 1 ||
60 	     inc_u != 1 ||
61 	     inc_y != 1 ||
62 	     inc_z != 1 ) bl1_abort();
63 
64 	n_pre = 0;
65 	if ( ( unsigned long ) z % 16 != 0 )
66 	{
67 		if ( ( unsigned long ) x % 16 == 0 ||
68 		     ( unsigned long ) u % 16 == 0 ||
69 		     ( unsigned long ) y % 16 == 0 ) bl1_abort();
70 
71 		n_pre = 1;
72 	}
73 
74 	n_run       = ( n - n_pre ) / 2;
75 	n_left      = ( n - n_pre ) % 2;
76 
77 	chi1     = x;
78 	upsilon1 = u;
79 	psi1     = y;
80 	zeta1    = z;
81 
82 	rho_c   = 0.0;
83 
84 	if ( n_pre == 1 )
85 	{
86 		double   alpha_c = *alpha;
87 		double   beta_c  = *beta;
88 		double   chi1_c    = *chi1;
89 		double   upsilon_c = *upsilon1;
90 
91 		rho_c  += chi1_c * upsilon_c;
92 		*psi1  -= alpha_c * chi1_c;
93 		*zeta1 -= beta_c  * chi1_c;
94 
95 		chi1     += inc_x;
96 		upsilon1 += inc_u;
97 		psi1     += inc_y;
98 		zeta1    += inc_z;
99 	}
100 
101 	a1v.v = _mm_loaddup_pd( ( double* )alpha );
102 	b1v.v = _mm_loaddup_pd( ( double* )beta );
103 
104 	rho1v.v = _mm_setzero_pd();
105 
106 	for ( i = 0; i < n_run; ++i )
107 	{
108 		x1v.v = _mm_load_pd( ( double* )chi1 );
109 		u1v.v = _mm_load_pd( ( double* )upsilon1 );
110 		y1v.v = _mm_load_pd( ( double* )psi1 );
111 		z1v.v = _mm_load_pd( ( double* )zeta1 );
112 
113 		rho1v.v += x1v.v * u1v.v;
114 		y1v.v   -= a1v.v * x1v.v;
115 		z1v.v   -= b1v.v * x1v.v;
116 
117 		_mm_store_pd( ( double* )psi1,  y1v.v );
118 		_mm_store_pd( ( double* )zeta1, z1v.v );
119 
120 		chi1     += 2;
121 		upsilon1 += 2;
122 		psi1     += 2;
123 		zeta1    += 2;
124 	}
125 
126 	rho_c += rho1v.d[0] + rho1v.d[1];
127 
128 	if ( n_left > 0 )
129 	{
130 		double   alpha_c = *alpha;
131 		double   beta_c  = *beta;
132 
133 		for( i = 0; i < n_left; ++i )
134 		{
135 			double   chi1_c    = *chi1;
136 			double   upsilon_c = *upsilon1;
137 
138 			rho_c  += chi1_c * upsilon_c;
139 			*psi1  -= alpha_c * chi1_c;
140 			*zeta1 -= beta_c  * chi1_c;
141 
142 			chi1     += inc_x;
143 			upsilon1 += inc_u;
144 			psi1     += inc_y;
145 			zeta1    += inc_z;
146 		}
147 	}
148 
149 	*rho = rho_c;
150 }
151 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
152 {
153 	double*   restrict chi1;
154 	double*   restrict upsilon1;
155 	double*   restrict psi1;
156 	double*   restrict zeta1;
157 	double    alpha_c;
158 	double    beta_c;
159 	double    rho_c;
160 	int       i;
161 
162 	int       n_pre;
163 	int       n_run;
164 	int       n_left;
165 
166 	if ( inc_x != 1 ||
167 	     inc_u != 1 ||
168 	     inc_y != 1 ||
169 	     inc_z != 1 ) bl1_abort();
170 
171 	n_pre = 0;
172 	//if ( ( unsigned long ) z % 16 != 0 )
173 	//{
174 	//	if ( ( unsigned long ) x % 16 == 0 ||
175 	//	     ( unsigned long ) u % 16 == 0 ||
176 	//	     ( unsigned long ) y % 16 == 0 ) bl1_abort();
177 	//
178 	//	n_pre = 1;
179 	//}
180 
181 	n_run       = ( n - n_pre ) / 2;
182 	n_left      = ( n - n_pre ) % 2;
183 
184 	chi1     = x;
185 	upsilon1 = u;
186 	psi1     = y;
187 	zeta1    = z;
188 
189 	alpha_c = *alpha;
190 	beta_c  = *beta;
191 
192 	rho_c   = 0.0;
193 
194 	if ( n_pre == 1 )
195 	{
196 		double   chi1_c    = *chi1;
197 		double   upsilon_c = *upsilon1;
198 
199 		rho_c  += chi1_c * upsilon_c;
200 		*psi1  -= alpha_c * chi1_c;
201 		*zeta1 -= beta_c  * chi1_c;
202 
203 		chi1     += inc_x;
204 		upsilon1 += inc_u;
205 		psi1     += inc_y;
206 		zeta1    += inc_z;
207 	}
208 
209 	for ( i = 0; i < n_run; ++i )
210 	{
211 		double   chi1_c     = *chi1;
212 		double   chi2_c     = *(chi1 + 1);
213 		double   upsilon1_c = *upsilon1;
214 		double   upsilon2_c = *(upsilon1 + 1);
215 		double   psi1_c     = *psi1;
216 		double   psi2_c     = *(psi1 + 1);
217 		double   zeta1_c    = *zeta1;
218 		double   zeta2_c    = *(zeta1 + 1);
219 
220 		rho_c   += chi1_c * upsilon1_c;
221 		rho_c   += chi2_c * upsilon2_c;
222 
223 		psi1_c  -= alpha_c * chi1_c;
224 		psi2_c  -= alpha_c * chi2_c;
225 
226 		zeta1_c -= beta_c  * chi1_c;
227 		zeta2_c -= beta_c  * chi2_c;
228 
229 		*psi1        = psi1_c;
230 		*(psi1 + 1)  = psi2_c;
231 		*zeta1       = zeta1_c;
232 		*(zeta1 + 1) = zeta2_c;
233 
234 		chi1     += 2*inc_x;
235 		upsilon1 += 2*inc_u;
236 		psi1     += 2*inc_y;
237 		zeta1    += 2*inc_z;
238 	}
239 
240 	if ( n_left > 0 )
241 	{
242 		for( i = 0; i < n_left; ++i )
243 		{
244 			double   chi1_c    = *chi1;
245 			double   upsilon_c = *upsilon1;
246 
247 			rho_c  += chi1_c * upsilon_c;
248 			*psi1  -= alpha_c * chi1_c;
249 			*zeta1 -= beta_c  * chi1_c;
250 
251 			chi1     += inc_x;
252 			upsilon1 += inc_u;
253 			psi1     += inc_y;
254 			zeta1    += inc_z;
255 		}
256 	}
257 
258 	*rho = rho_c;
259 }
260 #endif
261 
262 
bl1_cdotaxmyv2(int n,scomplex * alpha,scomplex * beta,scomplex * x,int inc_x,scomplex * u,int inc_u,scomplex * rho,scomplex * y,int inc_y,scomplex * z,int inc_z)263 void bl1_cdotaxmyv2( int       n,
264                      scomplex* alpha,
265                      scomplex* beta,
266                      scomplex* x, int inc_x,
267                      scomplex* u, int inc_u,
268                      scomplex* rho,
269                      scomplex* y, int inc_y,
270                      scomplex* z, int inc_z )
271 {
272 	bl1_abort();
273 }
274 
275 
bl1_zdotaxmyv2(int n,dcomplex * alpha,dcomplex * beta,dcomplex * x,int inc_x,dcomplex * u,int inc_u,dcomplex * rho,dcomplex * y,int inc_y,dcomplex * z,int inc_z)276 void bl1_zdotaxmyv2( int       n,
277                      dcomplex* alpha,
278                      dcomplex* beta,
279                      dcomplex* x, int inc_x,
280                      dcomplex* u, int inc_u,
281                      dcomplex* rho,
282                      dcomplex* y, int inc_y,
283                      dcomplex* z, int inc_z )
284 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
285 {
286 	dcomplex* restrict chi1;
287 	dcomplex* restrict upsilon1;
288 	dcomplex* restrict psi1;
289 	dcomplex* restrict zeta1;
290 	int       i;
291 
292 	v2df_t alpha11v, alpha12v;
293 	v2df_t beta11v,  beta12v;
294 	v2df_t rho1v;
295 	v2df_t x1v, x1rv;
296 	v2df_t y1v;
297 	v2df_t z1v;
298 	v2df_t u11v, u12v;
299 	v2df_t acad, bdbc;
300 	v2df_t bcac, adbd;
301 
302 	if ( inc_x != 1 ||
303 	     inc_u != 1 ||
304 	     inc_y != 1 ||
305 	     inc_z != 1 ) bl1_abort();
306 
307 	chi1     = x;
308 	upsilon1 = u;
309 	psi1     = y;
310 	zeta1    = z;
311 
312 	//rho_c.real = 0.0;
313 	//rho_c.imag = 0.0;
314 	rho1v.v = _mm_setzero_pd();
315 
316 	//alpha_c = *alpha;
317 	//beta_c  = *beta;
318 	alpha11v.v  = _mm_loaddup_pd( ( double* )&(alpha->real) );
319 	alpha12v.v  = _mm_loaddup_pd( ( double* )&(alpha->imag) );
320 	beta11v.v   = _mm_loaddup_pd( ( double* )&(beta->real) );
321 	beta12v.v   = _mm_loaddup_pd( ( double* )&(beta->imag) );
322 
323 	for ( i = 0; i < n; ++i )
324 	{
325 		//dcomplex chi1_c     = *chi1;
326 		x1v.v = _mm_load_pd( ( double* )chi1 );
327 
328 		//psi1->real  -= alpha_c.real * chi1_c.real - alpha_c.imag * chi1_c.imag;
329 		//psi1->imag  -= alpha_c.real * chi1_c.imag + alpha_c.imag * chi1_c.real;
330 		x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
331 		acad.v = alpha11v.v * x1v.v;
332 		bdbc.v = alpha12v.v * x1rv.v;
333 		y1v.v = _mm_load_pd( ( double* )psi1 );
334 		y1v.v = y1v.v - _mm_addsub_pd( acad.v, bdbc.v );
335 		_mm_store_pd( ( double* )psi1, y1v.v );
336 
337 		//zeta1->real -= beta_c.real  * chi1_c.real - beta_c.imag  * chi1_c.imag;
338 		//zeta1->imag -= beta_c.real  * chi1_c.imag + beta_c.imag  * chi1_c.real;
339 		x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
340 		acad.v = beta11v.v * x1v.v;
341 		bdbc.v = beta12v.v * x1rv.v;
342 		z1v.v = _mm_load_pd( ( double* )zeta1 );
343 		z1v.v = z1v.v - _mm_addsub_pd( acad.v, bdbc.v );
344 		_mm_store_pd( ( double* )zeta1, z1v.v );
345 
346 		//rho_c.real = chi1_c.real * upsilon1_c.real - -chi1_c.imag * upsilon1_c.imag;
347 		//rho_c.imag = chi1_c.real * upsilon1_c.imag + -chi1_c.imag * upsilon1_c.real;
348 		x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
349 		u11v.v = _mm_loaddup_pd( ( double* )&(upsilon1->real) );
350 		u12v.v = _mm_loaddup_pd( ( double* )&(upsilon1->imag) );
351 		bcac.v = x1rv.v * u11v.v;
352 		adbd.v = x1v.v  * u12v.v;
353 		rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
354 
355 		chi1     += 1;
356 		upsilon1 += 1;
357 		psi1     += 1;
358 		zeta1    += 1;
359 	}
360 
361 	rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
362 
363 	rho1v.d[1] = -rho1v.d[1];
364 
365 	_mm_store_pd( ( double* )rho, rho1v.v );
366 }
367 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
368 {
369 	dcomplex* restrict chi1;
370 	dcomplex* restrict upsilon1;
371 	dcomplex* restrict psi1;
372 	dcomplex* restrict zeta1;
373 	dcomplex  alpha_c;
374 	dcomplex  beta_c;
375 	dcomplex  rho_c;
376 	int       i;
377 
378 	if ( inc_x != 1 ||
379 	     inc_u != 1 ||
380 	     inc_y != 1 ||
381 	     inc_z != 1 ) bl1_abort();
382 
383 	chi1     = x;
384 	upsilon1 = u;
385 	psi1     = y;
386 	zeta1    = z;
387 
388 	rho_c.real = 0.0;
389 	rho_c.imag = 0.0;
390 
391 	alpha_c = *alpha;
392 	beta_c  = *beta;
393 
394 	for ( i = 0; i < n; ++i )
395 	{
396 		dcomplex chi1_c     = *chi1;
397 		dcomplex upsilon1_c = *upsilon1;
398 
399 		// rho += conj(chi1) * upsilon1;
400 		rho_c.real += chi1_c.real * upsilon1_c.real - -chi1_c.imag * upsilon1_c.imag;
401 		rho_c.imag += chi1_c.real * upsilon1_c.imag + -chi1_c.imag * upsilon1_c.real;
402 
403 		// psi1  = psi1  - alpha * chi1;
404 		psi1->real  -= alpha_c.real * chi1_c.real - alpha_c.imag * chi1_c.imag;
405 		psi1->imag  -= alpha_c.real * chi1_c.imag + alpha_c.imag * chi1_c.real;
406 
407 		// zeta1 = zeta1 - beta  * chi1;
408 		zeta1->real -= beta_c.real  * chi1_c.real - beta_c.imag  * chi1_c.imag;
409 		zeta1->imag -= beta_c.real  * chi1_c.imag + beta_c.imag  * chi1_c.real;
410 
411 		chi1     += inc_x;
412 		upsilon1 += inc_u;
413 		psi1     += inc_y;
414 		zeta1    += inc_z;
415 	}
416 
417 	*rho = rho_c;
418 }
419 #endif
420 
421