/*

PhyML: a program that computes maximum likelihood phylogenies from
DNA or AA homologous sequences.

Copyright (C) Stephane Guindon. Oct 2003 onward.

All parts of the source except where indicated are distributed under
the GNU public licence. See http://www.opensource.org for details.

*/

#include "assert.h"
#include "sse.h"
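
/* Vector-width conventions used throughout this file: sz = BYTE_ALIGN / 8 is
   the number of phydbl (double) lanes held in one __m128d, so the SSE code
   below requires sz == 2 (i.e. BYTE_ALIGN == 16) and processes state vectors
   of length ns in ns / sz two-lane blocks. */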

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

#if defined(__SSE3__)

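/* Fill tree->dot_prod: for every site pattern and rate class, project the
   left-hand partial likelihoods (scaled by the stationary frequencies pi)
   and the right-hand partial likelihoods onto the eigenvector basis, and
   store their per-state element-wise products. These terms are consumed by
   the SSE_Lk_*_Eigen_Lr functions below. */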
void SSE_Update_Eigen_Lr(t_edge *b, t_tree *tree)
{
  unsigned int site,catg;
  unsigned int i,j;

  unsigned const int npattern = tree->n_pattern;
  unsigned const int ncatg = tree->mod->ras->n_catg;
  unsigned const int ns = tree->mod->ns;
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned const int ncatgns = ncatg*ns;

  const phydbl *p_lk_left,*p_lk_rght,*pi;
  phydbl *dot_prod,*p_lk_left_pi;
  phydbl *l_ev,*r_ev;

  __m128d *_l_ev,*_r_ev,*_prod_left,*_prod_rght;

#ifndef WIN32
  if(posix_memalign((void **)&p_lk_left_pi,BYTE_ALIGN,(size_t) ns * sizeof(phydbl))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&l_ev,BYTE_ALIGN,(size_t) ns * ns * sizeof(phydbl))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_l_ev,BYTE_ALIGN,(size_t) ns * ns / sz * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_r_ev,BYTE_ALIGN,(size_t) ns * ns / sz * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_prod_left,BYTE_ALIGN,(size_t) ns / sz * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_prod_rght,BYTE_ALIGN,(size_t) ns / sz * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
#else
  p_lk_left_pi = _aligned_malloc(ns * sizeof(phydbl),BYTE_ALIGN);
  l_ev = _aligned_malloc(ns * ns * sizeof(phydbl),BYTE_ALIGN);
  _l_ev = _aligned_malloc(ns * ns / sz * sizeof(__m128d),BYTE_ALIGN);
  _r_ev = _aligned_malloc(ns * ns / sz * sizeof(__m128d),BYTE_ALIGN);
  _prod_left = _aligned_malloc(ns / sz * sizeof(__m128d),BYTE_ALIGN);
  _prod_rght = _aligned_malloc(ns / sz * sizeof(__m128d),BYTE_ALIGN);
#endif

  assert(sz == 2);
  assert(tree->update_eigen_lr == YES);

  r_ev = tree->mod->eigen->r_e_vect;

  /* Copy transpose of matrix of left eigen vectors */
  for(i=0;i<ns;++i)
    for(j=0;j<ns;++j)
      l_ev[i*ns+j] = tree->mod->eigen->l_e_vect[j*ns+i];
  /* Load into SSE registers */
  for(i=0;i<ns;++i)
    {
      for(j=0;j<nblocks;++j)
        {
          _r_ev[i*nblocks+j] = _mm_load_pd(r_ev + j*sz);
          _l_ev[i*nblocks+j] = _mm_load_pd(l_ev + j*sz);
        }
      r_ev += ns;
      l_ev += ns;
    }

  l_ev -= ns*ns;
  r_ev -= ns*ns;

  p_lk_left = b->left->tax ? b->p_lk_tip_l : b->p_lk_left;
  p_lk_rght = b->rght->tax ? b->p_lk_tip_r : b->p_lk_rght;
  pi = tree->mod->e_frq->pi->v;
  dot_prod = tree->dot_prod;

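  /* For each site pattern with non-negligible weight, accumulate the
     per-class, per-state products; zero-weight patterns are skipped, with
     the pointers simply advanced past them. */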
  for(site=0;site<npattern;++site)
    {
      if(tree->data->wght[site] > SMALL)
        {
          for(catg=0;catg<ncatg;++catg)
            {
              for(i=0;i<ns;++i) p_lk_left_pi[i] = p_lk_left[i] * pi[i];

              SSE_Matrix_Vect_Prod(_r_ev,p_lk_left_pi,ns,_prod_left);
              SSE_Matrix_Vect_Prod(_l_ev,p_lk_rght,ns,_prod_rght);

              for(i=0;i<nblocks;++i) _mm_store_pd(dot_prod + i*sz,_mm_mul_pd(_prod_left[i],_prod_rght[i]));

              dot_prod += ns;
              if(b->left->tax == NO) p_lk_left += ns;
              if(b->rght->tax == NO) p_lk_rght += ns;
            }

          if(b->left->tax == YES) p_lk_left += ns;
          if(b->rght->tax == YES) p_lk_rght += ns;
        }
      else
        {
          if(b->left->tax == YES) p_lk_left += ns;
          else p_lk_left += ncatgns;

          if(b->rght->tax == YES) p_lk_rght += ns;
          else p_lk_rght += ncatgns;

          dot_prod += ncatgns;
        }
    }

  Free(l_ev);
  Free(_l_ev);
  Free(_r_ev);
  Free(_prod_left);
  Free(_prod_rght);
  Free(p_lk_left_pi);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

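/* Likelihood of one site under one rate class: dot product of the
   precomputed per-state terms with, when expl is non-NULL, the
   exponentiated eigenvalue terms supplied by the caller. */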
phydbl SSE_Lk_Core_One_Class_Eigen_Lr(phydbl *dot_prod, phydbl *expl, int ns)
{
  phydbl lk;
  unsigned int l;
  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned int nblocks = ns/sz;
  __m128d _prod[nblocks],_x;

  for(l=0;l<nblocks;++l) _prod[l] = _mm_load_pd(dot_prod + l*sz);
  if(expl != NULL) for(l=0;l<nblocks;++l) _prod[l] = _mm_mul_pd(_prod[l],_mm_load_pd(expl + l*sz));
  _x = _mm_setzero_pd();
  for(l=0;l<nblocks;++l) _x = _mm_add_pd(_x,_prod[l]);
  _x = _mm_hadd_pd(_x,_x);
  _mm_store_sd(&lk,_x);

  return lk;
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

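/* Same accumulation as above, except that expl interleaves two values per
   state (loaded as one __m128d pair): the low lane of the accumulator then
   yields the likelihood and the high lane the corresponding derivative
   term, given how the caller fills expl. */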
void SSE_Lk_dLk_Core_One_Class_Eigen_Lr(phydbl *dot_prod, phydbl *expl, unsigned int ns, phydbl *lk, phydbl *dlk)
{
  unsigned int i;
  __m128d _x,_y,_z;

  _z = _mm_setzero_pd();

  for(i=0;i<ns;++i)
    {
      _x = _mm_set1_pd(dot_prod[i]);
      _y = _mm_load_pd(expl + 2*i);

      _z = _mm_add_pd(_z,_mm_mul_pd(_x,_y));
    }

  *lk = ((double *)&_z)[0];
  *dlk = ((double *)&_z)[1];
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

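/* One-class site likelihood without the eigen decomposition:
   lk = (pi . p_lk_rght)' x Pij x p_lk_left. When the right-hand node is an
   unambiguous tip (ambiguity_check == NO), its partial vector is a unit
   vector, so the triple product collapses to pi[state] times the dot
   product of row 'state' of Pij with p_lk_left. */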
phydbl SSE_Lk_Core_One_Class_No_Eigen_Lr(phydbl *p_lk_left, phydbl *p_lk_rght, phydbl *Pij, phydbl *tPij, phydbl *pi, int ns, int ambiguity_check, int state)
{
  phydbl lk,dum;
  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned int nblocks = ns/sz;
  unsigned int i,j;
  __m128d _plk;
  __m128d _plk_l[nblocks],_plk_r[nblocks];

  /* [ Pi . Lkr ]' x Pij x Lkl */

  if(ambiguity_check == NO) // tip case.
    {
      for(i=0;i<nblocks;++i)
        {
          _plk_l[i] = _mm_load_pd(p_lk_left);
          _plk_r[i] = _mm_load_pd(Pij + state*ns);
          p_lk_left += sz;
          Pij += sz;
        }

      for(i=0;i<nblocks;++i)
        {
          _plk_r[i] = _mm_mul_pd(_plk_r[i],_mm_set1_pd(pi[state]));
          _plk_r[i] = _mm_mul_pd(_plk_r[i],_plk_l[i]);
        }

      lk = 0.0;
      for(i=0;i<nblocks;++i)
        {
          _plk = _mm_hadd_pd(_plk_r[i],_plk_r[i]);
          _mm_store_sd(&dum,_plk);
          lk += dum;
        }
      return lk;
    }
  else
    {
      __m128d _pij[nblocks],_pijplk[nblocks];

      for(i=0;i<nblocks;++i)
        {
          _plk_r[i] = _mm_mul_pd(_mm_load_pd(p_lk_rght),_mm_load_pd(pi));
          p_lk_rght += sz;
          pi += sz;
        }

      for(i=0;i<nblocks;++i) _pijplk[i] = _mm_setzero_pd();

      for(i=0;i<ns;++i)
        {
          for(j=0;j<nblocks;++j)
            {
              _pij[j] = _mm_load_pd(tPij);
              tPij += sz;

              _pijplk[j] = _mm_add_pd(_pijplk[j],_mm_mul_pd(_pij[j],_mm_set1_pd(p_lk_left[i])));
            }
        }

      lk = 0.0;
      for(i=0;i<nblocks;++i)
        {
          _plk = _mm_mul_pd(_pijplk[i],_plk_r[i]);
          _plk = _mm_hadd_pd(_plk,_plk);
          _mm_store_sd(&dum,_plk);
          lk += dum;
        }
      return lk;
    }
  return UNLIKELY;
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

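/* Update the partial likelihood vector of node d on edge b from the vectors
   of d's two descendants. Each (site, class) block dispatches to one of
   three kernels depending on whether each descendant is an unambiguous tip
   (Ex) or requires a full matrix-vector product (In). */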
void SSE_Update_Partial_Lk(t_tree *tree, t_edge *b, t_node *d)
{
  /*
             |
             |<- b
             |
             d
            / \
           /   \
          /     \
      n_v1       n_v2
  */
  t_node *n_v1, *n_v2;
  phydbl *plk0,*plk1,*plk2;
  phydbl *Pij1,*Pij2;
  phydbl *tPij1,*tPij2;
  int *sum_scale, *sum_scale_v1, *sum_scale_v2;
  int sum_scale_v1_val, sum_scale_v2_val;
  unsigned int i,j,k;
  unsigned int catg,site;
  short int state_v1,state_v2;
  short int ambiguity_check_v1,ambiguity_check_v2;
  phydbl largest_p_lk;
  int *p_lk_loc;

  const unsigned int npattern = tree->n_pattern;
  const unsigned int ns = tree->mod->ns;
  const unsigned int ncatg = tree->mod->ras->n_catg;

  const unsigned int ncatgns = ncatg * ns;
  const unsigned int nsns = ns * ns;

  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned int nblocks = ns/sz;

  __m128d *_tPij1,*_tPij2,*_pmat1plk1,*_pmat2plk2,*_plk0,*_init_tPij1,*_init_tPij2;

#ifndef WIN32
  if(posix_memalign((void **)&_tPij1,BYTE_ALIGN,(size_t)(ncatg * nsns / sz) * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_tPij2,BYTE_ALIGN,(size_t)(ncatg * nsns / sz) * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_pmat1plk1,BYTE_ALIGN,(size_t)(ns / sz) * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_pmat2plk2,BYTE_ALIGN,(size_t)(ns / sz) * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_plk0,BYTE_ALIGN,(size_t)(ns / sz) * sizeof(__m128d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
#else
  _tPij1 = _aligned_malloc(ncatg * nsns / sz * sizeof(__m128d),BYTE_ALIGN);
  _tPij2 = _aligned_malloc(ncatg * nsns / sz * sizeof(__m128d),BYTE_ALIGN);
  _pmat1plk1 = _aligned_malloc(ns / sz * sizeof(__m128d),BYTE_ALIGN);
  _pmat2plk2 = _aligned_malloc(ns / sz * sizeof(__m128d),BYTE_ALIGN);
  _plk0 = _aligned_malloc(ns / sz * sizeof(__m128d),BYTE_ALIGN);
#endif

  _init_tPij1 = _tPij1;
  _init_tPij2 = _tPij2;

  sum_scale_v1_val = 0;
  sum_scale_v2_val = 0;
  n_v1 = n_v2 = NULL;
  plk0 = plk1 = plk2 = NULL;
  Pij1 = Pij2 = NULL;
  tPij1 = tPij2 = NULL;
  sum_scale_v1 = sum_scale_v2 = NULL;
  p_lk_loc = NULL;
  state_v1 = state_v2 = -1;

  Set_All_Partial_Lk(&n_v1,&n_v2,
                     &plk0,&sum_scale,&p_lk_loc,
                     &Pij1,&tPij1,&plk1,&sum_scale_v1,
                     &Pij2,&tPij2,&plk2,&sum_scale_v2,
                     d,b,tree);

  // Copy transpose of transition matrices into SSE registers
  for(i=0;i<ncatg;++i)
    {
      for(j=0;j<ns;++j)
        {
          for(k=0;k<nblocks;++k)
            {
              _tPij1[k] = _mm_load_pd(tPij1);
              _tPij2[k] = _mm_load_pd(tPij2);
              tPij1 += sz;
              tPij2 += sz;
            }
          _tPij1 += nblocks;
          _tPij2 += nblocks;
        }
    }
  _tPij1 = _init_tPij1;
  _tPij2 = _init_tPij2;

  if(tree->mod->augmented == YES)
    {
      PhyML_Printf("\n== SSE version of the Update_Partial_Lk function does not");
      PhyML_Printf("\n== allow augmented data.");
      assert(FALSE);
    }

  /* For every site in the alignment */
  for(site=0;site<npattern;++site)
    {
      if(tree->data->wght[site] > SMALL)
        {
          state_v1 = state_v2 = -1;
          ambiguity_check_v1 = ambiguity_check_v2 = YES;

          if(n_v1 && n_v1->tax)
            {
              ambiguity_check_v1 = n_v1->c_seq->is_ambigu[site];
              if(ambiguity_check_v1 == NO) state_v1 = n_v1->c_seq->d_state[site];
            }

          if(n_v2 && n_v2->tax)
            {
              ambiguity_check_v2 = n_v2->c_seq->is_ambigu[site];
              if(ambiguity_check_v2 == NO) state_v2 = n_v2->c_seq->d_state[site];
            }

          _tPij1 = _init_tPij1;
          _tPij2 = _init_tPij2;

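          /* Exex: both descendants are unambiguous tips; Exin: one of each;
             Inin: both need a full matrix-vector product. */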
          for(catg=0;catg<ncatg;++catg)
            {
              if(ambiguity_check_v1 == NO && ambiguity_check_v2 == NO)
                {
                  SSE_Partial_Lk_Exex(_tPij1,state_v1,
                                      _tPij2,state_v2,
                                      ns,_plk0);
                }
              else if(ambiguity_check_v1 == YES && ambiguity_check_v2 == NO)
                {
                  SSE_Partial_Lk_Exin(_tPij2,state_v2,
                                      _tPij1,plk1,_pmat1plk1,
                                      ns,_plk0);
                }
              else if(ambiguity_check_v1 == NO && ambiguity_check_v2 == YES)
                {
                  SSE_Partial_Lk_Exin(_tPij1,state_v1,
                                      _tPij2,plk2,_pmat2plk2,
                                      ns,_plk0);
                }
              else
                {
                  SSE_Partial_Lk_Inin(_tPij1,plk1,_pmat1plk1,
                                      _tPij2,plk2,_pmat2plk2,
                                      ns,_plk0);
                }

              for(k=0;k<nblocks;++k) _mm_store_pd(plk0+sz*k,_plk0[k]);

              _tPij1 += nsns / sz;
              _tPij2 += nsns / sz;
              plk0 += ns;
              plk1 += (n_v1->tax) ? 0 : ns;
              plk2 += (n_v2->tax) ? 0 : ns;
            }

          plk1 += (n_v1->tax) ? ns : 0;
          plk2 += (n_v2->tax) ? ns : 0;

          if(tree->scaling_method == SCALE_FAST)
            {
              sum_scale_v1_val = (sum_scale_v1)?(sum_scale_v1[site]):(0);
              sum_scale_v2_val = (sum_scale_v2)?(sum_scale_v2[site]):(0);
              sum_scale[site] = sum_scale_v1_val + sum_scale_v2_val;

              if(sum_scale[site] >= 1024)
                {
                  /* Debugging diagnostics (disabled): */
                  /* plk0 -= ncatgns; */
                  /* plk1 -= (n_v1->tax) ? ns : ncatgns; */
                  /* plk2 -= (n_v2->tax) ? ns : ncatgns; */
                  /* PhyML_Fprintf(stderr,"\n. PARTIAL site: %d plk0: %p [%g %g %g %g] plk1: %p [%g %g %g %g] plk2: %p [%g %g %g %g]", */
                  /*               site, */
                  /*               plk0,plk0[0],plk0[1],plk0[2],plk0[3], */
                  /*               plk1,plk1[0],plk1[1],plk1[2],plk1[3], */
                  /*               plk2,plk2[0],plk2[1],plk2[2],plk2[3]); */
                  /* PhyML_Fprintf(stderr,"\n. PARTIAL site: %d d: %d n_v1: %d n_v2: %d",site,d->num,n_v1->num,n_v2->num); */
                  /* PhyML_Fprintf(stderr,"\n. PARTIAL site: %d sum n: %d sum n_v1: %d sum n_v2: %d",site,sum_scale[site],sum_scale_v1_val,sum_scale_v2_val); */

                  /* plk0 += ncatgns; */
                  /* plk1 += (n_v1->tax) ? ns : ncatgns; */
                  /* plk2 += (n_v2->tax) ? ns : ncatgns; */
                  /* Exit("\n"); */
                }

              plk0 -= ncatgns;

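              /* Rescale: if the largest partial likelihood at this site has
                 dropped below INV_TWO_TO_THE_LARGE, multiply the whole block
                 by TWO_TO_THE_LARGE and record the shift in the per-site
                 scaling exponent. */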
              largest_p_lk = -BIG;
              for(i=0;i<ncatgns;++i)
                if(plk0[i] > largest_p_lk)
                  largest_p_lk = plk0[i];

              if(largest_p_lk < INV_TWO_TO_THE_LARGE &&
                 tree->mod->augmented == NO &&
                 tree->apply_lk_scaling == YES)
                {
                  for(i=0;i<ncatgns;++i) plk0[i] *= TWO_TO_THE_LARGE;
                  sum_scale[site] += LARGE;
                }

              plk0 += ncatgns;
            }
        }
      else
        {
          plk0 += ncatgns;
          plk1 += (n_v1->tax) ? ns : ncatgns;
          plk2 += (n_v2->tax) ? ns : ncatgns;
        }
    }

  Free(_init_tPij1);
  Free(_init_tPij2);
  Free(_pmat1plk1);
  Free(_pmat2plk2);
  Free(_plk0);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

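/* Both descendants are unambiguous tips: their partial vectors are unit
   vectors, so the new partials reduce to the element-wise product of row
   'state1' of the first transposed transition matrix with row 'state2' of
   the second (i.e. columns state1 and state2 of the original matrices). */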
void SSE_Partial_Lk_Exex(const __m128d *_tPij1, const int state1, const __m128d *_tPij2, const int state2, const int ns, __m128d *plk0)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i;

  _tPij1 = _tPij1 + state1 * nblocks;
  _tPij2 = _tPij2 + state2 * nblocks;
  for(i=0;i<nblocks;++i) plk0[i] = _mm_mul_pd(_tPij1[i],_tPij2[i]);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

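/* One descendant is an unambiguous tip in state 'state1', the other is
   internal (or an ambiguous tip): multiply column 'state1' of the first
   transition matrix, element-wise, by the product of the second matrix
   with that descendant's partial vector. */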
void SSE_Partial_Lk_Exin(const __m128d *_tPij1, const int state1, const __m128d *_tPij2, const phydbl *_plk2, __m128d *_pmat2plk2, const int ns, __m128d *_plk0)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i;

  _tPij1 = _tPij1 + state1 * nblocks;
  SSE_Matrix_Vect_Prod(_tPij2,_plk2,ns,_pmat2plk2);

  for(i=0;i<nblocks;++i) _plk0[i] = _mm_mul_pd(_tPij1[i],_pmat2plk2[i]);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

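/* Both descendants are internal nodes (or ambiguous tips): two full
   matrix-vector products followed by an element-wise product. */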
void SSE_Partial_Lk_Inin(const __m128d *_tPij1, const phydbl *plk1, __m128d *_pmat1plk1, const __m128d *_tPij2, const phydbl *plk2, __m128d *_pmat2plk2, const int ns, __m128d *_plk0)
{
  unsigned int i;
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;

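  /* Shortcut: if both partial vectors are identically 1.0 (uninformative),
     the result is 1.0 in every state, since each row of a transition
     probability matrix sums to 1; the matrix products can be skipped. */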
  for(i=0;i<ns;++i) if(plk1[i] > 1.0 || plk1[i] < 1.0 || plk2[i] > 1.0 || plk2[i] < 1.0) break;

  if(i != ns)
    {
      SSE_Matrix_Vect_Prod(_tPij1,plk1,ns,_pmat1plk1);
      SSE_Matrix_Vect_Prod(_tPij2,plk2,ns,_pmat2plk2);

      for(i=0;i<nblocks;++i) _plk0[i] = _mm_mul_pd(_pmat1plk1[i],_pmat2plk2[i]);
    }
  else
    {
      for(i=0;i<nblocks;++i) _plk0[i] = _mm_set1_pd(1.0);
    }
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

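/* Computes _u = M . _v, where _m_transpose stores M transposed in row-major
   order: each scalar _v[i] is broadcast and multiplied against a contiguous
   row of the transposed matrix (a column of M), accumulating into _u. */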
void SSE_Matrix_Vect_Prod(const __m128d *_m_transpose, const phydbl *_v, const int ns, __m128d *_u)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i,j;
  __m128d _x;

  for(i=0;i<nblocks;++i) _u[i] = _mm_setzero_pd();

  for(i=0;i<ns;++i)
    {
      _x = _mm_set1_pd(_v[i]);
      for(j=0;j<nblocks;++j) _u[j] = _mm_add_pd(_u[j],_mm_mul_pd(_m_transpose[j],_x));
      _m_transpose = _m_transpose + nblocks;
    }
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

#endif