1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "blis1.h"
12
13 /*
14 Effective computation:
15
16 a = a + beta * u + gamma * z;
17 rho = conj(a) * x;
18 w = w + kappa * a;
19 */
20
/*
   bl1_saxpyv2bdotaxpy(): fused single-precision real kernel performing,
   in one pass over its operands:

     a   = a + beta * u + gamma * z;   (fused two-term axpy into a)
     rho = conj(a) * x;                (dot product; conj() is a no-op for reals)
     w   = w + kappa * a;              (axpy of the updated a into w)

   Scalar reference implementation mirroring the double-precision (d)
   variant's non-intrinsic path; this routine was previously an
   unimplemented stub that aborted.

   n                   length of all vectors
   beta, gamma, kappa  scalar multipliers (read-only)
   u, z, x             input vectors with strides inc_u, inc_z, inc_x
   a, w                vectors updated in place, strides inc_a, inc_w
   rho                 output scalar receiving the dot product
*/
void bl1_saxpyv2bdotaxpy( int n,
                          float* beta,
                          float* u, int inc_u,
                          float* gamma,
                          float* z, int inc_z,
                          float* a, int inc_a,
                          float* x, int inc_x,
                          float* kappa,
                          float* rho,
                          float* w, int inc_w )
{
	float* restrict upsilon1 = u;
	float* restrict zeta1    = z;
	float* restrict alpha1   = a;
	float* restrict chi1     = x;
	float* restrict omega1   = w;
	float  beta_c  = *beta;
	float  gamma_c = *gamma;
	float  kappa_c = *kappa;
	float  rho_c   = 0.0F;
	int    i;

	for ( i = 0; i < n; ++i )
	{
		float upsilon1_c = *upsilon1;
		float zeta1_c    = *zeta1;
		float alpha1_c   = *alpha1;
		float chi1_c     = *chi1;
		float omega1_c   = *omega1;

		// alpha1 += beta * upsilon1 + gamma * zeta1;
		alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;

		// rho += conj(alpha1) * chi1; conjugation is a no-op for reals.
		// Note: uses the freshly updated alpha1, per the fused contract.
		rho_c += alpha1_c * chi1_c;

		// omega1 += kappa * alpha1; also uses the updated alpha1.
		omega1_c += kappa_c * alpha1_c;

		*alpha1 = alpha1_c;
		*omega1 = omega1_c;

		upsilon1 += inc_u;
		zeta1    += inc_z;
		alpha1   += inc_a;
		chi1     += inc_x;
		omega1   += inc_w;
	}

	*rho = rho_c;
}
34
35
bl1_daxpyv2bdotaxpy(int n,double * beta,double * u,int inc_u,double * gamma,double * z,int inc_z,double * a,int inc_a,double * x,int inc_x,double * kappa,double * rho,double * w,int inc_w)36 void bl1_daxpyv2bdotaxpy( int n,
37 double* beta,
38 double* u, int inc_u,
39 double* gamma,
40 double* z, int inc_z,
41 double* a, int inc_a,
42 double* x, int inc_x,
43 double* kappa,
44 double* rho,
45 double* w, int inc_w )
46 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
47 {
48 double* restrict upsilon1;
49 double* restrict zeta1;
50 double* restrict alpha1;
51 double* restrict chi1;
52 double* restrict omega1;
53 double rho_c;
54 int i;
55 v2df_t b1v, g1v, k1v;
56 v2df_t rhov;
57 v2df_t u1v, z1v, a1v;
58 v2df_t u2v, z2v, a2v;
59 v2df_t x1v, w1v;
60 v2df_t x2v, w2v;
61
62 int n_pre;
63 int n_run;
64 int n_left;
65
66 n_pre = 0;
67 if ( ( unsigned long ) a % 16 != 0 )
68 {
69 if ( ( unsigned long ) u % 16 == 0 ||
70 ( unsigned long ) z % 16 == 0 ||
71 ( unsigned long ) x % 16 == 0 ||
72 ( unsigned long ) w % 16 == 0 ) bl1_abort();
73
74 n_pre = 1;
75 }
76
77 n_run = ( n - n_pre ) / 4;
78 n_left = ( n - n_pre ) % 4;
79
80 upsilon1 = u;
81 zeta1 = z;
82 alpha1 = a;
83 chi1 = x;
84 omega1 = w;
85
86
87 rho_c = 0.0;
88
89 if ( n_pre == 1 )
90 {
91 double beta_c = *beta;
92 double gamma_c = *gamma;
93 double kappa_c = *kappa;
94
95 double upsilon1_c = *upsilon1;
96 double zeta1_c = *zeta1;
97 double alpha1_c = *alpha1;
98 double chi1_c = *chi1;
99 double omega1_c = *omega1;
100
101 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
102 rho_c += alpha1_c * chi1_c;
103 omega1_c += kappa_c * alpha1_c;
104
105 *alpha1 = alpha1_c;
106 *omega1 = omega1_c;
107
108 upsilon1 += inc_u;
109 zeta1 += inc_z;
110 alpha1 += inc_a;
111 chi1 += inc_x;
112 omega1 += inc_w;
113 }
114
115 b1v.v = _mm_loaddup_pd( ( double* )beta );
116 g1v.v = _mm_loaddup_pd( ( double* )gamma );
117 k1v.v = _mm_loaddup_pd( ( double* )kappa );
118
119 rhov.v = _mm_setzero_pd();
120
121 for ( i = 0; i < n_run; ++i )
122 {
123 u1v.v = _mm_load_pd( ( double* )upsilon1 );
124 z1v.v = _mm_load_pd( ( double* )zeta1 );
125 a1v.v = _mm_load_pd( ( double* )alpha1 );
126
127 a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128
129 u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130 z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131 a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132
133 a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134
135 x1v.v = _mm_load_pd( ( double* )chi1 );
136 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137
138 w1v.v = _mm_load_pd( ( double* )omega1 );
139 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140
141 rhov.v += a1v.v * x1v.v;
142 rhov.v += a2v.v * x2v.v;
143
144 w1v.v += k1v.v * a1v.v;
145 w2v.v += k1v.v * a2v.v;
146
147 _mm_store_pd( ( double* )alpha1, a1v.v );
148 _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149
150 _mm_store_pd( ( double* )omega1, w1v.v );
151 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152
153
154 upsilon1 += 4;
155 zeta1 += 4;
156 alpha1 += 4;
157 chi1 += 4;
158 omega1 += 4;
159 }
160
161 rho_c += rhov.d[0] + rhov.d[1];
162
163 if ( n_left > 0 )
164 {
165 double beta_c = *beta;
166 double gamma_c = *gamma;
167 double kappa_c = *kappa;
168
169 for ( i = 0; i < n_left; ++i )
170 {
171 double upsilon1_c = *upsilon1;
172 double zeta1_c = *zeta1;
173 double alpha1_c = *alpha1;
174 double chi1_c = *chi1;
175 double omega1_c = *omega1;
176
177 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
178 rho_c += alpha1_c * chi1_c;
179 omega1_c += kappa_c * alpha1_c;
180
181 *alpha1 = alpha1_c;
182 *omega1 = omega1_c;
183
184 upsilon1 += inc_u;
185 zeta1 += inc_z;
186 alpha1 += inc_a;
187 chi1 += inc_x;
188 omega1 += inc_w;
189 }
190 }
191
192 *rho = rho_c;
193 }
194 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
195 {
196 double* restrict upsilon1;
197 double* restrict zeta1;
198 double* restrict alpha1;
199 double* restrict chi1;
200 double* restrict omega1;
201 double beta_c;
202 double gamma_c;
203 double kappa_c;
204 double rho_c;
205 int i;
206
207 int n_pre;
208 int n_run;
209 int n_left;
210
211 n_pre = 0;
212 //if ( ( unsigned long ) a % 16 != 0 )
213 //{
214 // if ( ( unsigned long ) u % 16 == 0 ||
215 // ( unsigned long ) z % 16 == 0 ||
216 // ( unsigned long ) x % 16 == 0 ||
217 // ( unsigned long ) w % 16 == 0 ) bl1_abort();
218 //
219 // n_pre = 1;
220 //}
221
222 n_run = ( n - n_pre ) / 2;
223 n_left = ( n - n_pre ) % 2;
224
225 upsilon1 = u;
226 zeta1 = z;
227 alpha1 = a;
228 chi1 = x;
229 omega1 = w;
230
231 beta_c = *beta;
232 gamma_c = *gamma;
233 kappa_c = *kappa;
234
235 rho_c = 0.0;
236
237 if ( n_pre == 1 )
238 {
239 double upsilon1_c = *upsilon1;
240 double zeta1_c = *zeta1;
241 double alpha1_c = *alpha1;
242 double chi1_c = *chi1;
243 double omega1_c = *omega1;
244
245 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
246 rho_c += alpha1_c * chi1_c;
247 omega1_c += kappa_c * alpha1_c;
248
249 *alpha1 = alpha1_c;
250 *omega1 = omega1_c;
251
252 upsilon1 += inc_u;
253 zeta1 += inc_z;
254 alpha1 += inc_a;
255 chi1 += inc_x;
256 omega1 += inc_w;
257 }
258
259 for ( i = 0; i < n_run; ++i )
260 {
261 double upsilon1_c = *upsilon1;
262 double upsilon2_c = *(upsilon1 + 1);
263 double zeta1_c = *zeta1;
264 double zeta2_c = *(zeta1 + 1);
265 double alpha1_c = *alpha1;
266 double alpha2_c = *(alpha1 + 1);
267 double chi1_c = *chi1;
268 double chi2_c = *(chi1 + 1);
269 double omega1_c = *omega1;
270 double omega2_c = *(omega1 + 1);
271
272 // alpha1 += beta * upsilon1 + gamma * zeta1;
273 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
274 alpha2_c += beta_c * upsilon2_c + gamma_c * zeta2_c;
275
276 // rho += conj(alpha1) * chi1 +
277 // conj(alpha2) * chi2;
278 rho_c += alpha1_c * chi1_c + alpha2_c * chi2_c;
279
280 // omega1 += kappa * alpha1;
281 omega1_c += kappa_c * alpha1_c;
282 omega2_c += kappa_c * alpha2_c;
283
284 *alpha1 = alpha1_c;
285 *(alpha1 + 1) = alpha2_c;
286 *omega1 = omega1_c;
287 *(omega1 + 1) = omega2_c;
288
289 upsilon1 += 2*inc_u;
290 zeta1 += 2*inc_z;
291 alpha1 += 2*inc_a;
292 chi1 += 2*inc_x;
293 omega1 += 2*inc_w;
294 }
295
296 if ( n_left > 0 )
297 {
298
299 for ( i = 0; i < n_left; ++i )
300 {
301 double upsilon1_c = *upsilon1;
302 double zeta1_c = *zeta1;
303 double alpha1_c = *alpha1;
304 double chi1_c = *chi1;
305 double omega1_c = *omega1;
306
307 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
308 rho_c += alpha1_c * chi1_c;
309 omega1_c += kappa_c * alpha1_c;
310
311 *alpha1 = alpha1_c;
312 *omega1 = omega1_c;
313
314 upsilon1 += inc_u;
315 zeta1 += inc_z;
316 alpha1 += inc_a;
317 chi1 += inc_x;
318 omega1 += inc_w;
319 }
320 }
321
322 *rho = rho_c;
323 }
324 #endif
325
326
/*
   bl1_caxpyv2bdotaxpy(): fused single-precision complex kernel performing,
   in one pass over its operands:

     a   = a + beta * u + gamma * z;   (fused two-term axpy into a)
     rho = conj(a) * x;                (dot product with a conjugated)
     w   = w + kappa * a;              (axpy of the updated a into w)

   Scalar reference implementation mirroring the double-precision complex
   (z) variant's non-intrinsic path; this routine was previously an
   unimplemented stub that aborted.

   n                   length of all vectors
   beta, gamma, kappa  scalar multipliers (read-only)
   u, z, x             input vectors with strides inc_u, inc_z, inc_x
   a, w                vectors updated in place, strides inc_a, inc_w
   rho                 output scalar receiving the dot product
*/
void bl1_caxpyv2bdotaxpy( int n,
                          scomplex* beta,
                          scomplex* u, int inc_u,
                          scomplex* gamma,
                          scomplex* z, int inc_z,
                          scomplex* a, int inc_a,
                          scomplex* x, int inc_x,
                          scomplex* kappa,
                          scomplex* rho,
                          scomplex* w, int inc_w )
{
	scomplex* restrict upsilon1 = u;
	scomplex* restrict zeta1    = z;
	scomplex* restrict alpha1   = a;
	scomplex* restrict chi1     = x;
	scomplex* restrict omega1   = w;
	scomplex  beta_c  = *beta;
	scomplex  gamma_c = *gamma;
	scomplex  kappa_c = *kappa;
	scomplex  rho_c;
	int       i;

	rho_c.real = 0.0F;
	rho_c.imag = 0.0F;

	for ( i = 0; i < n; ++i )
	{
		scomplex upsilon1_c = *upsilon1;
		scomplex zeta1_c    = *zeta1;
		scomplex alpha1_c   = *alpha1;
		scomplex chi1_c     = *chi1;
		scomplex omega1_c   = *omega1;

		// alpha1 += beta * upsilon1;
		alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
		alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;

		// alpha1 += gamma * zeta1;
		alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
		alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;

		// rho += conj(alpha1) * chi1; the negated imag realizes the conjugate.
		rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
		rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;

		// omega1 += kappa * alpha1; uses the freshly updated alpha1.
		omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
		omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;

		*alpha1 = alpha1_c;
		*omega1 = omega1_c;

		upsilon1 += inc_u;
		zeta1    += inc_z;
		alpha1   += inc_a;
		chi1     += inc_x;
		omega1   += inc_w;
	}

	rho->real = rho_c.real;
	rho->imag = rho_c.imag;
}
340
341
/*
   bl1_zaxpyv2bdotaxpy(): fused double-precision complex kernel performing,
   in one pass over its operands:

     a   = a + beta * u + gamma * z;   (fused two-term axpy into a)
     rho = conj(a) * x;                (dot product with a conjugated)
     w   = w + kappa * a;              (axpy of the updated a into w)

   n                   length of all vectors
   beta, gamma, kappa  scalar multipliers (read-only)
   u, z, x             input vectors with strides inc_u, inc_z, inc_x
   a, w                vectors updated in place, strides inc_a, inc_w
   rho                 output scalar receiving the dot product
*/
void bl1_zaxpyv2bdotaxpy( int n,
                          dcomplex* beta,
                          dcomplex* u, int inc_u,
                          dcomplex* gamma,
                          dcomplex* z, int inc_z,
                          dcomplex* a, int inc_a,
                          dcomplex* x, int inc_x,
                          dcomplex* kappa,
                          dcomplex* rho,
                          dcomplex* w, int inc_w )
#if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
{
	dcomplex* restrict upsilon1;
	dcomplex* restrict zeta1;
	dcomplex* restrict alpha1;
	dcomplex* restrict chi1;
	dcomplex* restrict omega1;
	int i;

	//v2df_t beta1v, beta1rv;
	//v2df_t gamma1v, gamma1rv;
	//v2df_t kappa1v, kappa1rv;
	v2df_t rho1v;
	//v2df_t u11v, u12v;
	//v2df_t z11v, z12v;
	v2df_t a11v, a12v;
	v2df_t x1v, x1rv;
	v2df_t w1v;
	v2df_t acbc, bdad;
	v2df_t adac, bcbd;

	v2df_t a1v, a1rv;
	v2df_t u1v, u1rv;
	v2df_t z1v, z1rv;
	v2df_t beta11v, gamma11v, kappa11v;
	v2df_t beta12v, gamma12v, kappa12v;

	upsilon1 = u;
	zeta1    = z;
	alpha1   = a;
	chi1     = x;
	omega1   = w;

	// This SSE path loads each dcomplex as one 128-bit value (real in the
	// low lane, imag in the high lane) and steps all pointers by 1, so it
	// only supports unit strides; abort otherwise.
	if ( inc_u != 1 ||
	     inc_z != 1 ||
	     inc_a != 1 ||
	     inc_x != 1 ||
	     inc_w != 1 ) bl1_abort();

	// Broadcast the real and imaginary parts of each scalar into both
	// lanes of separate registers, as needed by the complex multiplies.
	beta11v.v  = _mm_loaddup_pd( ( double* )&(beta->real) );
	beta12v.v  = _mm_loaddup_pd( ( double* )&(beta->imag) );
	gamma11v.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
	gamma12v.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
	kappa11v.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
	kappa12v.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );

	rho1v.v = _mm_setzero_pd();

	for ( i = 0; i < n; ++i )
	{
		//alpha_c = *alpha1;
		a1v.v = _mm_load_pd( ( double* )alpha1 );

		// Complex multiply beta * upsilon1 via shuffle + addsub:
		// u1rv swaps u's lanes to (imag, real); addsub then yields
		// (b.r*u.r - b.i*u.i, b.r*u.i + b.i*u.r), i.e. the product.
		//alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
		//alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;
		u1v.v  = _mm_load_pd( ( double* )upsilon1 );
		u1rv.v = _mm_shuffle_pd( u1v.v, u1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = beta11v.v * u1v.v;
		bdad.v = beta12v.v * u1rv.v;
		a1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// Same pattern for gamma * zeta1.
		//alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
		//alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;
		z1v.v  = _mm_load_pd( ( double* )zeta1 );
		z1rv.v = _mm_shuffle_pd( z1v.v, z1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = gamma11v.v * z1v.v;
		bdad.v = gamma12v.v * z1rv.v;
		a1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// Write back the updated a; the subsequent steps use this value.
		//*alpha1 = alpha1_c;
		_mm_store_pd( ( double* )alpha1, a1v.v );

		// Accumulate conj(alpha1) * chi1 into rho1v. With a's real/imag
		// broadcast into a11v/a12v and x's lanes swapped in x1rv, the
		// addsub produces (a.r*x.i - a.i*x.r, a.r*x.r + a.i*x.i) — the
		// (imag, real) lanes of conj(a)*x in SWAPPED order, fixed by the
		// shuffle after the loop.
		//rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
		//rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
		x1v.v  = _mm_load_pd( ( double* )chi1 );
		x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
		a11v.v = a1v.v;
		a12v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (1,1) );
		a11v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (0,0) );
		adac.v = a11v.v * x1rv.v;
		bcbd.v = a12v.v * x1v.v;
		rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );

		//omega_c = *omega1;
		w1v.v = _mm_load_pd( ( double* )omega1 );

		// Complex multiply kappa * alpha1 (same shuffle + addsub pattern)
		// and accumulate into w.
		//omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
		//omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
		a1rv.v = _mm_shuffle_pd( a1v.v, a1v.v, _MM_SHUFFLE2 (0,1) );
		acbc.v = kappa11v.v * a1v.v;
		bdad.v = kappa12v.v * a1rv.v;
		w1v.v += _mm_addsub_pd( acbc.v, bdad.v );

		// *omega1 = omega1_c;
		_mm_store_pd( ( double* )omega1, w1v.v );


		upsilon1 += 1;
		zeta1    += 1;
		alpha1   += 1;
		chi1     += 1;
		omega1   += 1;
	}

	// Swap rho1v's lanes from (imag, real) back to (real, imag) before
	// storing the final dot product.
	rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );

	//rho->real = rho_c.real;
	//rho->imag = rho_c.imag;
	_mm_store_pd( ( double* )rho, rho1v.v );
}
#elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
{
	// Scalar reference path; supports arbitrary strides.
	dcomplex* restrict upsilon1;
	dcomplex* restrict zeta1;
	dcomplex* restrict alpha1;
	dcomplex* restrict chi1;
	dcomplex* restrict omega1;
	dcomplex beta_c;
	dcomplex gamma_c;
	dcomplex kappa_c;
	dcomplex rho_c;
	int i;

	upsilon1 = u;
	zeta1    = z;
	alpha1   = a;
	chi1     = x;
	omega1   = w;

	rho_c.real = 0.0;
	rho_c.imag = 0.0;

	beta_c  = *beta;
	gamma_c = *gamma;
	kappa_c = *kappa;

	for ( i = 0; i < n; ++i )
	{
		dcomplex upsilon1_c = *upsilon1;
		dcomplex zeta1_c    = *zeta1;
		dcomplex alpha1_c   = *alpha1;
		dcomplex chi1_c     = *chi1;
		dcomplex omega1_c   = *omega1;

		// alpha1 += beta * upsilon1;
		alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
		alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;

		// alpha1 += gamma * zeta1;
		alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
		alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;

		// rho += conj(alpha1) * chi1; the negated imag realizes the conjugate.
		rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
		rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;

		// omega1 += kappa * alpha1; uses the freshly updated alpha1.
		omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
		omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;

		*alpha1 = alpha1_c;
		*omega1 = omega1_c;

		upsilon1 += inc_u;
		zeta1    += inc_z;
		alpha1   += inc_a;
		chi1     += inc_x;
		omega1   += inc_w;
	}

	rho->real = rho_c.real;
	rho->imag = rho_c.imag;
}
#endif
527
528