1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
11 #include "FLAME.h"
12
13 /*
14 Effective computation:
15
16 rho = conj(x) * u;
17 y = y - alpha * x;
18 z = z - beta * x;
19 */
20
bl1_sdotaxmyv2(int n,float * alpha,float * beta,float * x,int inc_x,float * u,int inc_u,float * rho,float * y,int inc_y,float * z,int inc_z)21 void bl1_sdotaxmyv2( int n,
22 float* alpha,
23 float* beta,
24 float* x, int inc_x,
25 float* u, int inc_u,
26 float* rho,
27 float* y, int inc_y,
28 float* z, int inc_z )
29 {
30 bl1_abort();
31 }
32
33
bl1_ddotaxmyv2(int n,double * alpha,double * beta,double * x,int inc_x,double * u,int inc_u,double * rho,double * y,int inc_y,double * z,int inc_z)34 void bl1_ddotaxmyv2( int n,
35 double* alpha,
36 double* beta,
37 double* x, int inc_x,
38 double* u, int inc_u,
39 double* rho,
40 double* y, int inc_y,
41 double* z, int inc_z )
42 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
43 {
44 double* restrict chi1;
45 double* restrict upsilon1;
46 double* restrict psi1;
47 double* restrict zeta1;
48 double rho_c;
49 int i;
50
51 int n_pre;
52 int n_run;
53 int n_left;
54
55 v2df_t a1v, b1v;
56 v2df_t rho1v;
57 v2df_t x1v, u1v, y1v, z1v;
58
59 if ( inc_x != 1 ||
60 inc_u != 1 ||
61 inc_y != 1 ||
62 inc_z != 1 ) bl1_abort();
63
64 n_pre = 0;
65 if ( ( unsigned long ) z % 16 != 0 )
66 {
67 if ( ( unsigned long ) x % 16 == 0 ||
68 ( unsigned long ) u % 16 == 0 ||
69 ( unsigned long ) y % 16 == 0 ) bl1_abort();
70
71 n_pre = 1;
72 }
73
74 n_run = ( n - n_pre ) / 2;
75 n_left = ( n - n_pre ) % 2;
76
77 chi1 = x;
78 upsilon1 = u;
79 psi1 = y;
80 zeta1 = z;
81
82 rho_c = 0.0;
83
84 if ( n_pre == 1 )
85 {
86 double alpha_c = *alpha;
87 double beta_c = *beta;
88 double chi1_c = *chi1;
89 double upsilon_c = *upsilon1;
90
91 rho_c += chi1_c * upsilon_c;
92 *psi1 -= alpha_c * chi1_c;
93 *zeta1 -= beta_c * chi1_c;
94
95 chi1 += inc_x;
96 upsilon1 += inc_u;
97 psi1 += inc_y;
98 zeta1 += inc_z;
99 }
100
101 a1v.v = _mm_loaddup_pd( ( double* )alpha );
102 b1v.v = _mm_loaddup_pd( ( double* )beta );
103
104 rho1v.v = _mm_setzero_pd();
105
106 for ( i = 0; i < n_run; ++i )
107 {
108 x1v.v = _mm_load_pd( ( double* )chi1 );
109 u1v.v = _mm_load_pd( ( double* )upsilon1 );
110 y1v.v = _mm_load_pd( ( double* )psi1 );
111 z1v.v = _mm_load_pd( ( double* )zeta1 );
112
113 rho1v.v += x1v.v * u1v.v;
114 y1v.v -= a1v.v * x1v.v;
115 z1v.v -= b1v.v * x1v.v;
116
117 _mm_store_pd( ( double* )psi1, y1v.v );
118 _mm_store_pd( ( double* )zeta1, z1v.v );
119
120 chi1 += 2;
121 upsilon1 += 2;
122 psi1 += 2;
123 zeta1 += 2;
124 }
125
126 rho_c += rho1v.d[0] + rho1v.d[1];
127
128 if ( n_left > 0 )
129 {
130 double alpha_c = *alpha;
131 double beta_c = *beta;
132
133 for( i = 0; i < n_left; ++i )
134 {
135 double chi1_c = *chi1;
136 double upsilon_c = *upsilon1;
137
138 rho_c += chi1_c * upsilon_c;
139 *psi1 -= alpha_c * chi1_c;
140 *zeta1 -= beta_c * chi1_c;
141
142 chi1 += inc_x;
143 upsilon1 += inc_u;
144 psi1 += inc_y;
145 zeta1 += inc_z;
146 }
147 }
148
149 *rho = rho_c;
150 }
151 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
152 {
153 double* restrict chi1;
154 double* restrict upsilon1;
155 double* restrict psi1;
156 double* restrict zeta1;
157 double alpha_c;
158 double beta_c;
159 double rho_c;
160 int i;
161
162 int n_pre;
163 int n_run;
164 int n_left;
165
166 if ( inc_x != 1 ||
167 inc_u != 1 ||
168 inc_y != 1 ||
169 inc_z != 1 ) bl1_abort();
170
171 n_pre = 0;
172 //if ( ( unsigned long ) z % 16 != 0 )
173 //{
174 // if ( ( unsigned long ) x % 16 == 0 ||
175 // ( unsigned long ) u % 16 == 0 ||
176 // ( unsigned long ) y % 16 == 0 ) bl1_abort();
177 //
178 // n_pre = 1;
179 //}
180
181 n_run = ( n - n_pre ) / 2;
182 n_left = ( n - n_pre ) % 2;
183
184 chi1 = x;
185 upsilon1 = u;
186 psi1 = y;
187 zeta1 = z;
188
189 alpha_c = *alpha;
190 beta_c = *beta;
191
192 rho_c = 0.0;
193
194 if ( n_pre == 1 )
195 {
196 double chi1_c = *chi1;
197 double upsilon_c = *upsilon1;
198
199 rho_c += chi1_c * upsilon_c;
200 *psi1 -= alpha_c * chi1_c;
201 *zeta1 -= beta_c * chi1_c;
202
203 chi1 += inc_x;
204 upsilon1 += inc_u;
205 psi1 += inc_y;
206 zeta1 += inc_z;
207 }
208
209 for ( i = 0; i < n_run; ++i )
210 {
211 double chi1_c = *chi1;
212 double chi2_c = *(chi1 + 1);
213 double upsilon1_c = *upsilon1;
214 double upsilon2_c = *(upsilon1 + 1);
215 double psi1_c = *psi1;
216 double psi2_c = *(psi1 + 1);
217 double zeta1_c = *zeta1;
218 double zeta2_c = *(zeta1 + 1);
219
220 rho_c += chi1_c * upsilon1_c;
221 rho_c += chi2_c * upsilon2_c;
222
223 psi1_c -= alpha_c * chi1_c;
224 psi2_c -= alpha_c * chi2_c;
225
226 zeta1_c -= beta_c * chi1_c;
227 zeta2_c -= beta_c * chi2_c;
228
229 *psi1 = psi1_c;
230 *(psi1 + 1) = psi2_c;
231 *zeta1 = zeta1_c;
232 *(zeta1 + 1) = zeta2_c;
233
234 chi1 += 2*inc_x;
235 upsilon1 += 2*inc_u;
236 psi1 += 2*inc_y;
237 zeta1 += 2*inc_z;
238 }
239
240 if ( n_left > 0 )
241 {
242 for( i = 0; i < n_left; ++i )
243 {
244 double chi1_c = *chi1;
245 double upsilon_c = *upsilon1;
246
247 rho_c += chi1_c * upsilon_c;
248 *psi1 -= alpha_c * chi1_c;
249 *zeta1 -= beta_c * chi1_c;
250
251 chi1 += inc_x;
252 upsilon1 += inc_u;
253 psi1 += inc_y;
254 zeta1 += inc_z;
255 }
256 }
257
258 *rho = rho_c;
259 }
260 #endif
261
262
bl1_cdotaxmyv2(int n,scomplex * alpha,scomplex * beta,scomplex * x,int inc_x,scomplex * u,int inc_u,scomplex * rho,scomplex * y,int inc_y,scomplex * z,int inc_z)263 void bl1_cdotaxmyv2( int n,
264 scomplex* alpha,
265 scomplex* beta,
266 scomplex* x, int inc_x,
267 scomplex* u, int inc_u,
268 scomplex* rho,
269 scomplex* y, int inc_y,
270 scomplex* z, int inc_z )
271 {
272 bl1_abort();
273 }
274
275
bl1_zdotaxmyv2(int n,dcomplex * alpha,dcomplex * beta,dcomplex * x,int inc_x,dcomplex * u,int inc_u,dcomplex * rho,dcomplex * y,int inc_y,dcomplex * z,int inc_z)276 void bl1_zdotaxmyv2( int n,
277 dcomplex* alpha,
278 dcomplex* beta,
279 dcomplex* x, int inc_x,
280 dcomplex* u, int inc_u,
281 dcomplex* rho,
282 dcomplex* y, int inc_y,
283 dcomplex* z, int inc_z )
284 #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
285 {
286 dcomplex* restrict chi1;
287 dcomplex* restrict upsilon1;
288 dcomplex* restrict psi1;
289 dcomplex* restrict zeta1;
290 int i;
291
292 v2df_t alpha11v, alpha12v;
293 v2df_t beta11v, beta12v;
294 v2df_t rho1v;
295 v2df_t x1v, x1rv;
296 v2df_t y1v;
297 v2df_t z1v;
298 v2df_t u11v, u12v;
299 v2df_t acad, bdbc;
300 v2df_t bcac, adbd;
301
302 if ( inc_x != 1 ||
303 inc_u != 1 ||
304 inc_y != 1 ||
305 inc_z != 1 ) bl1_abort();
306
307 chi1 = x;
308 upsilon1 = u;
309 psi1 = y;
310 zeta1 = z;
311
312 //rho_c.real = 0.0;
313 //rho_c.imag = 0.0;
314 rho1v.v = _mm_setzero_pd();
315
316 //alpha_c = *alpha;
317 //beta_c = *beta;
318 alpha11v.v = _mm_loaddup_pd( ( double* )&(alpha->real) );
319 alpha12v.v = _mm_loaddup_pd( ( double* )&(alpha->imag) );
320 beta11v.v = _mm_loaddup_pd( ( double* )&(beta->real) );
321 beta12v.v = _mm_loaddup_pd( ( double* )&(beta->imag) );
322
323 for ( i = 0; i < n; ++i )
324 {
325 //dcomplex chi1_c = *chi1;
326 x1v.v = _mm_load_pd( ( double* )chi1 );
327
328 //psi1->real -= alpha_c.real * chi1_c.real - alpha_c.imag * chi1_c.imag;
329 //psi1->imag -= alpha_c.real * chi1_c.imag + alpha_c.imag * chi1_c.real;
330 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
331 acad.v = alpha11v.v * x1v.v;
332 bdbc.v = alpha12v.v * x1rv.v;
333 y1v.v = _mm_load_pd( ( double* )psi1 );
334 y1v.v = y1v.v - _mm_addsub_pd( acad.v, bdbc.v );
335 _mm_store_pd( ( double* )psi1, y1v.v );
336
337 //zeta1->real -= beta_c.real * chi1_c.real - beta_c.imag * chi1_c.imag;
338 //zeta1->imag -= beta_c.real * chi1_c.imag + beta_c.imag * chi1_c.real;
339 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
340 acad.v = beta11v.v * x1v.v;
341 bdbc.v = beta12v.v * x1rv.v;
342 z1v.v = _mm_load_pd( ( double* )zeta1 );
343 z1v.v = z1v.v - _mm_addsub_pd( acad.v, bdbc.v );
344 _mm_store_pd( ( double* )zeta1, z1v.v );
345
346 //rho_c.real = chi1_c.real * upsilon1_c.real - -chi1_c.imag * upsilon1_c.imag;
347 //rho_c.imag = chi1_c.real * upsilon1_c.imag + -chi1_c.imag * upsilon1_c.real;
348 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
349 u11v.v = _mm_loaddup_pd( ( double* )&(upsilon1->real) );
350 u12v.v = _mm_loaddup_pd( ( double* )&(upsilon1->imag) );
351 bcac.v = x1rv.v * u11v.v;
352 adbd.v = x1v.v * u12v.v;
353 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
354
355 chi1 += 1;
356 upsilon1 += 1;
357 psi1 += 1;
358 zeta1 += 1;
359 }
360
361 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
362
363 rho1v.d[1] = -rho1v.d[1];
364
365 _mm_store_pd( ( double* )rho, rho1v.v );
366 }
367 #elif BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_NO_INTRINSICS
368 {
369 dcomplex* restrict chi1;
370 dcomplex* restrict upsilon1;
371 dcomplex* restrict psi1;
372 dcomplex* restrict zeta1;
373 dcomplex alpha_c;
374 dcomplex beta_c;
375 dcomplex rho_c;
376 int i;
377
378 if ( inc_x != 1 ||
379 inc_u != 1 ||
380 inc_y != 1 ||
381 inc_z != 1 ) bl1_abort();
382
383 chi1 = x;
384 upsilon1 = u;
385 psi1 = y;
386 zeta1 = z;
387
388 rho_c.real = 0.0;
389 rho_c.imag = 0.0;
390
391 alpha_c = *alpha;
392 beta_c = *beta;
393
394 for ( i = 0; i < n; ++i )
395 {
396 dcomplex chi1_c = *chi1;
397 dcomplex upsilon1_c = *upsilon1;
398
399 // rho += conj(chi1) * upsilon1;
400 rho_c.real += chi1_c.real * upsilon1_c.real - -chi1_c.imag * upsilon1_c.imag;
401 rho_c.imag += chi1_c.real * upsilon1_c.imag + -chi1_c.imag * upsilon1_c.real;
402
403 // psi1 = psi1 - alpha * chi1;
404 psi1->real -= alpha_c.real * chi1_c.real - alpha_c.imag * chi1_c.imag;
405 psi1->imag -= alpha_c.real * chi1_c.imag + alpha_c.imag * chi1_c.real;
406
407 // zeta1 = zeta1 - beta * chi1;
408 zeta1->real -= beta_c.real * chi1_c.real - beta_c.imag * chi1_c.imag;
409 zeta1->imag -= beta_c.real * chi1_c.imag + beta_c.imag * chi1_c.real;
410
411 chi1 += inc_x;
412 upsilon1 += inc_u;
413 psi1 += inc_y;
414 zeta1 += inc_z;
415 }
416
417 *rho = rho_c;
418 }
419 #endif
420
421