/*

PhyML: a program that computes maximum likelihood phylogenies from
DNA or AA homologous sequences.

Copyright (C) Stephane Guindon. Oct 2003 onward.

All parts of the source except where indicated are distributed under
the GNU public licence. See http://www.opensource.org for details.

*/

#include "assert.h"
#include "avx.h"

#if defined(__AVX__)

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

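/* Update, for every site pattern and rate class, the vector of
   per-state dot products stored in tree->dot_prod and used when the
   likelihood on edge b is expressed in the eigenbasis of the
   substitution model: each vector is the elementwise product of
   R'(pi . p_lk_left) and L'p_lk_rght, where R and L are the matrices
   of right and left eigenvectors (L is loaded transposed below) and
   pi holds the stationary frequencies. Site patterns with (near)
   zero weight are skipped. */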
void AVX_Update_Eigen_Lr(t_edge *b, t_tree *tree)
{
  unsigned int site,catg;
  unsigned int i,j;

  const unsigned int npattern = tree->n_pattern;
  const unsigned int ncatg    = tree->mod->ras->n_catg;
  const unsigned int ns       = tree->mod->ns;
  const unsigned int sz       = (int)BYTE_ALIGN / 8;
  const unsigned int nblocks  = ns / sz;
  const unsigned int ncatgns  = ncatg*ns;

  const phydbl *p_lk_left,*p_lk_rght,*pi;
  phydbl *dot_prod,*p_lk_left_pi;
  phydbl *l_ev,*r_ev;

  __m256d *_l_ev,*_r_ev,*_prod_left,*_prod_rght;

#ifndef WIN32
  if(posix_memalign((void **)&p_lk_left_pi,BYTE_ALIGN,(size_t)ns * sizeof(phydbl))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&l_ev,BYTE_ALIGN,(size_t)ns * ns * sizeof(phydbl))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_l_ev,BYTE_ALIGN,(size_t)ns * ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_r_ev,BYTE_ALIGN,(size_t)ns * ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_prod_left,BYTE_ALIGN,(size_t)ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_prod_rght,BYTE_ALIGN,(size_t)ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
#else
  p_lk_left_pi = _aligned_malloc(ns * sizeof(phydbl),BYTE_ALIGN);
  l_ev         = _aligned_malloc(ns * ns * sizeof(phydbl),BYTE_ALIGN);
  _l_ev        = _aligned_malloc(ns * ns / sz * sizeof(__m256d),BYTE_ALIGN);
  _r_ev        = _aligned_malloc(ns * ns / sz * sizeof(__m256d),BYTE_ALIGN);
  _prod_left   = _aligned_malloc(ns / sz * sizeof(__m256d),BYTE_ALIGN);
  _prod_rght   = _aligned_malloc(ns / sz * sizeof(__m256d),BYTE_ALIGN);
#endif

  assert(sz == 4);
  assert(tree->update_eigen_lr == YES);

  r_ev = tree->mod->eigen->r_e_vect;

  /* Copy transpose of matrix of left eigen vectors */
  for(i=0;i<ns;++i)
    for(j=0;j<ns;++j)
      l_ev[i*ns+j] = tree->mod->eigen->l_e_vect[j*ns+i];

  /* Load into AVX registers */
  for(i=0;i<ns;++i)
    {
      for(j=0;j<nblocks;++j)
        {
          _r_ev[i*nblocks+j] = _mm256_load_pd(r_ev + j*sz);
          _l_ev[i*nblocks+j] = _mm256_load_pd(l_ev + j*sz);
        }
      r_ev += ns;
      l_ev += ns;
    }

  l_ev -= ns*ns;
  r_ev -= ns*ns;

  p_lk_left = b->left->tax ? b->p_lk_tip_l : b->p_lk_left;
  p_lk_rght = b->rght->tax ? b->p_lk_tip_r : b->p_lk_rght;
  pi        = tree->mod->e_frq->pi->v;
  dot_prod  = tree->dot_prod;

  for(site=0;site<npattern;++site)
    {
      if(tree->data->wght[site] > SMALL)
        {
          for(catg=0;catg<ncatg;++catg)
            {
              for(i=0;i<ns;++i) p_lk_left_pi[i] = p_lk_left[i] * pi[i];

              AVX_Matrix_Vect_Prod(_r_ev,p_lk_left_pi,ns,_prod_left);
              AVX_Matrix_Vect_Prod(_l_ev,p_lk_rght,ns,_prod_rght);

              for(i=0;i<nblocks;++i) _mm256_store_pd(dot_prod + i*sz,_mm256_mul_pd(_prod_left[i],_prod_rght[i]));

              dot_prod += ns;
              if(b->left->tax == NO) p_lk_left += ns;
              if(b->rght->tax == NO) p_lk_rght += ns;
            }

          if(b->left->tax == YES) p_lk_left += ns;
          if(b->rght->tax == YES) p_lk_rght += ns;
        }
      else
        {
          if(b->left->tax == YES) p_lk_left += ns;
          else p_lk_left += ncatgns;

          if(b->rght->tax == YES) p_lk_rght += ns;
          else p_lk_rght += ncatgns;

          dot_prod += ncatgns;
        }
    }

  Free(l_ev);
  Free(_l_ev);
  Free(_r_ev);
  Free(_prod_left);
  Free(_prod_rght);
  Free(p_lk_left_pi);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

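/* Likelihood of a single site pattern under one rate class, computed
   directly from the transition matrix: returns
   (pi . p_lk_rght)' x Pij x p_lk_left, with tPij the row-major
   transpose of Pij. When the corresponding node is a tip whose state
   is known unambiguously (ambiguity_check == NO), p_lk_rght reduces
   to a unit vector and only row observed_state of Pij is needed. The
   first branch handles ns == sz (e.g. nucleotides, one AVX block);
   the second handles larger state spaces such as amino acids. */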
phydbl AVX_Lk_Core_One_Class_No_Eigen_Lr(const phydbl *p_lk_left, const phydbl *p_lk_rght, const phydbl *Pij, const phydbl *tPij, const phydbl *pi, const int ns, const int ambiguity_check, const int observed_state)
{
  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned nblocks = ns/sz;

  if(nblocks == 1)
    {
      __m256d _plk_r,_plk_l;

      if(ambiguity_check == NO) // tip case.
        {
          Pij += observed_state*ns;
          _plk_r = _mm256_mul_pd(_mm256_load_pd(Pij),_mm256_load_pd(p_lk_left));
          return pi[observed_state] * AVX_Vect_Norm(_plk_r);
        }
      else
        {
          unsigned int i;
          __m256d _pijplk,_pij;

          _plk_r = _mm256_mul_pd(_mm256_load_pd(p_lk_rght),_mm256_load_pd(pi));
          _pijplk = _mm256_setzero_pd();

          for(i=0;i<ns;++i)
            {
              _pij = _mm256_load_pd(tPij);

#if (defined(__FMA__))
              _pijplk = _mm256_fmadd_pd(_pij,_mm256_set1_pd(p_lk_left[i]),_pijplk);
#else
              _pijplk = _mm256_add_pd(_pijplk,_mm256_mul_pd(_pij,_mm256_set1_pd(p_lk_left[i])));
#endif
              tPij += ns;
            }

          _plk_l = _mm256_mul_pd(_pijplk,_plk_r);

          return(AVX_Vect_Norm(_plk_l));
        }
      return UNLIKELY;
    }
  else
    {
      __m256d _plk_l[nblocks],_plk_r[nblocks];
      __m256d _plk;

      /* [ Pi . Lkr ]' x Pij x Lkl */

      if(ambiguity_check == NO) // tip case.
        {
          unsigned int i;

          Pij += observed_state*ns;

          for(i=0;i<nblocks;++i)
            {
              _plk_l[i] = _mm256_load_pd(p_lk_left);
              _plk_r[i] = _mm256_load_pd(Pij);
              _plk_r[i] = _mm256_mul_pd(_plk_r[i],_plk_l[i]);
              p_lk_left += sz;
              Pij += sz;
            }

          _plk = _mm256_setzero_pd();
          for(i=0;i<nblocks;++i) _plk = _mm256_add_pd(_plk,_plk_r[i]);
          return pi[observed_state] * AVX_Vect_Norm(_plk);
        }
      else
        {
          unsigned int i,j;
          __m256d _pij[nblocks],_pijplk[nblocks];
          phydbl lk;

          for(i=0;i<nblocks;++i)
            {
              _plk_r[i] = _mm256_mul_pd(_mm256_load_pd(p_lk_rght),_mm256_load_pd(pi));
              p_lk_rght += sz;
              pi += sz;
            }

          for(i=0;i<nblocks;++i) _pijplk[i] = _mm256_setzero_pd();

          for(i=0;i<ns;++i)
            {
              for(j=0;j<nblocks;++j)
                {
                  _pij[j] = _mm256_load_pd(tPij);
                  tPij += sz;

#if (defined(__FMA__))
                  _pijplk[j] = _mm256_fmadd_pd(_pij[j],_mm256_set1_pd(p_lk_left[i]),_pijplk[j]);
#else
                  _pijplk[j] = _mm256_add_pd(_pijplk[j],_mm256_mul_pd(_pij[j],_mm256_set1_pd(p_lk_left[i])));
#endif
                }
            }

          lk = 0.0;
          for(i=0;i<nblocks;++i) lk += AVX_Vect_Norm(_mm256_mul_pd(_pijplk[i],_plk_r[i]));
          return lk;
        }

      return UNLIKELY;
    }
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

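/* Likelihood of one site pattern and rate class when working in the
   eigenbasis: returns the dot product of the ns values in dot_prod
   (filled by AVX_Update_Eigen_Lr() above) with the ns multipliers in
   expl, typically the exponentiated eigenvalue terms for the branch
   length at hand. A NULL expl stands for a vector of ones. */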
phydbl AVX_Lk_Core_One_Class_Eigen_Lr(const phydbl *dot_prod, const phydbl *expl, const unsigned int ns)
{
  unsigned int l;
  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned nblocks = ns/sz;
  __m256d _prod[nblocks],_x;

  for(l=0;l<nblocks;++l)
    {
      _prod[l] = _mm256_load_pd(dot_prod); dot_prod += sz;
    }

  if(expl != NULL)
    {
      for(l=0;l<nblocks;++l)
        {
          _prod[l] = _mm256_mul_pd(_prod[l],_mm256_load_pd(expl));
          expl += sz;
        }
    }

  _x = _mm256_setzero_pd();
  for(l=0;l<nblocks;++l) _x = _mm256_add_pd(_x,_prod[l]);

  return AVX_Vect_Norm(_x);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

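/* Same eigenbasis reduction as above but returning both the
   likelihood and its derivative with respect to branch length in a
   single pass. Two consecutive dot_prod values are packed per
   iteration, one per 128-bit lane (blend mask 12 == 0b1100), and
   expl supplies two multipliers per state (2*ns entries in total:
   lk then dlk), so even elements of the accumulator _z gather lk
   terms and odd elements gather dlk terms. */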
void AVX_Lk_dLk_Core_One_Class_Eigen_Lr(const phydbl *dot_prod, const phydbl *expl, const unsigned int ns, phydbl *lk, phydbl *dlk)
{
  unsigned int i;
  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned nblocks = ns/sz*2;
  __m256d _x,_y,_z;

  _z = _mm256_setzero_pd();

  for(i=0;i<nblocks;++i)
    {
      _x = _mm256_blend_pd(_mm256_set1_pd(dot_prod[2*i]),
                           _mm256_set1_pd(dot_prod[2*i + 1]),12);

      _y = _mm256_load_pd(expl + 4*i);

#if (defined(__FMA__))
      _z = _mm256_fmadd_pd(_x,_y,_z);
#else
      _z = _mm256_add_pd(_z,_mm256_mul_pd(_x,_y));
#endif
    }

  *lk  = ((double *)&_z)[0] + ((double *)&_z)[2];
  *dlk = ((double *)&_z)[1] + ((double *)&_z)[3];
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

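/* Horizontal sum of the four doubles in _z (despite the name, no
   norm is computed): reduce 256 bits to 128 by adding the two lanes,
   then add the high and low 64-bit halves. */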
phydbl AVX_Vect_Norm(__m256d _z)
{
  __m128d vlow  = _mm256_castpd256_pd128(_z);
  __m128d vhigh = _mm256_extractf128_pd(_z, 1); // high 128
  vlow = _mm_add_pd(vlow, vhigh); // reduce down to 128

  __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
  return _mm_cvtsd_f64(_mm_add_sd(vlow, high64));
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

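/* Update the vector of partial likelihoods at node d along edge b
   from the vectors of its two descendants n_v1 and n_v2 (see the
   diagram below): for each site pattern and rate class,
   plk0 = (Pij1 x plk1) .* (Pij2 x plk2), with shortcuts whenever a
   descendant is a tip in an unambiguous state. The transposes of the
   transition matrices are staged once into aligned __m256d blocks.
   Under SCALE_FAST scaling, a vector whose largest entry falls below
   INV_TWO_TO_THE_LARGE is multiplied by TWO_TO_THE_LARGE and LARGE
   is added to the site's scaling exponent. */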
void AVX_Update_Partial_Lk(t_tree *tree, t_edge *b, t_node *d)
{
/*
           |
           |<- b
           |
           d
          / \
         /   \
        /     \
    n_v1       n_v2
*/
  t_node *n_v1, *n_v2;
  phydbl *plk0,*plk1,*plk2;
  phydbl *Pij1,*Pij2;
  phydbl *tPij1,*tPij2;
  int *sum_scale, *sum_scale_v1, *sum_scale_v2;
  int sum_scale_v1_val, sum_scale_v2_val;
  unsigned int i,j,k;
  unsigned int catg,site;
  short int state_v1,state_v2;
  short int ambiguity_check_v1,ambiguity_check_v2;
  phydbl largest_p_lk;
  int *p_lk_loc;

  const unsigned int npattern = tree->n_pattern;
  const unsigned int ns = tree->mod->ns;
  const unsigned int ncatg = tree->mod->ras->n_catg;

  const unsigned int ncatgns = ncatg * ns;
  const unsigned int nsns = ns * ns;

  const unsigned int sz = (int)BYTE_ALIGN / 8;
  const unsigned nblocks = ns/sz;

  __m256d *_tPij1,*_tPij2,*_pmat1plk1,*_pmat2plk2,*_plk0,*_init_tPij1,*_init_tPij2;

#ifndef WIN32
  if(posix_memalign((void **)&_tPij1,BYTE_ALIGN,(size_t)(ncatg * nsns / sz) * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_tPij2,BYTE_ALIGN,(size_t)(ncatg * nsns / sz) * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_pmat1plk1,BYTE_ALIGN,(size_t)ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_pmat2plk2,BYTE_ALIGN,(size_t)ns / sz * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
  if(posix_memalign((void **)&_plk0,BYTE_ALIGN,(size_t)(ns / sz) * sizeof(__m256d))) Generic_Exit(__FILE__,__LINE__,__FUNCTION__);
#else
  _tPij1     = _aligned_malloc(ncatg * nsns / sz * sizeof(__m256d),BYTE_ALIGN);
  _tPij2     = _aligned_malloc(ncatg * nsns / sz * sizeof(__m256d),BYTE_ALIGN);
  _pmat1plk1 = _aligned_malloc(ns / sz * sizeof(__m256d),BYTE_ALIGN);
  _pmat2plk2 = _aligned_malloc(ns / sz * sizeof(__m256d),BYTE_ALIGN);
  _plk0      = _aligned_malloc(ns / sz * sizeof(__m256d),BYTE_ALIGN);
#endif

  _init_tPij1 = _tPij1;
  _init_tPij2 = _tPij2;

  sum_scale_v1_val = 0;
  sum_scale_v2_val = 0;
  n_v1 = n_v2 = NULL;
  plk0 = plk1 = plk2 = NULL;
  Pij1 = Pij2 = NULL;
  tPij1 = tPij2 = NULL;
  sum_scale_v1 = sum_scale_v2 = NULL;
  p_lk_loc = NULL;
  state_v1 = state_v2 = -1;

  Set_All_Partial_Lk(&n_v1,&n_v2,
                     &plk0,&sum_scale,&p_lk_loc,
                     &Pij1,&tPij1,&plk1,&sum_scale_v1,
                     &Pij2,&tPij2,&plk2,&sum_scale_v2,
                     d,b,tree);

  // Copy transpose of transition matrices into AVX registers
  for(i=0;i<ncatg;++i)
    {
      for(j=0;j<ns;++j)
        {
          for(k=0;k<nblocks;++k)
            {
              _tPij1[k] = _mm256_load_pd(tPij1);
              _tPij2[k] = _mm256_load_pd(tPij2);
              tPij1 += sz;
              tPij2 += sz;
            }
          _tPij1 += nblocks;
          _tPij2 += nblocks;
        }
    }
  _tPij1 = _init_tPij1;
  _tPij2 = _init_tPij2;

  if(tree->mod->augmented == YES)
    {
      PhyML_Printf("\n== AVX version of the Update_Partial_Lk function does not");
      PhyML_Printf("\n== allow augmented data.");
      assert(FALSE);
    }

  /* For every site pattern in the alignment */
  for(site=0;site<npattern;++site)
    {
      if(tree->data->wght[site] > SMALL)
        {
          state_v1 = state_v2 = -1;
          ambiguity_check_v1 = ambiguity_check_v2 = YES;

          if(n_v1 && n_v1->tax)
            {
              ambiguity_check_v1 = n_v1->c_seq->is_ambigu[site];
              if(ambiguity_check_v1 == NO) state_v1 = n_v1->c_seq->d_state[site];
            }

          if(n_v2 && n_v2->tax)
            {
              ambiguity_check_v2 = n_v2->c_seq->is_ambigu[site];
              if(ambiguity_check_v2 == NO) state_v2 = n_v2->c_seq->d_state[site];
            }

          _tPij1 = _init_tPij1;
          _tPij2 = _init_tPij2;

          for(catg=0;catg<ncatg;++catg)
            {
              if(ambiguity_check_v1 == NO && ambiguity_check_v2 == NO)
                {
                  AVX_Partial_Lk_Exex(_tPij1,state_v1,
                                      _tPij2,state_v2,
                                      ns,_plk0);
                }
              else if(ambiguity_check_v1 == YES && ambiguity_check_v2 == NO)
                {
                  AVX_Partial_Lk_Exin(_tPij2,state_v2,
                                      _tPij1,plk1,_pmat1plk1,
                                      ns,_plk0);
                }
              else if(ambiguity_check_v1 == NO && ambiguity_check_v2 == YES)
                {
                  AVX_Partial_Lk_Exin(_tPij1,state_v1,
                                      _tPij2,plk2,_pmat2plk2,
                                      ns,_plk0);
                }
              else
                {
                  AVX_Partial_Lk_Inin(_tPij1,plk1,_pmat1plk1,
                                      _tPij2,plk2,_pmat2plk2,
                                      ns,_plk0);
                }

              for(k=0;k<nblocks;++k) _mm256_store_pd(plk0+sz*k,_plk0[k]);

              _tPij1 += nsns / sz;
              _tPij2 += nsns / sz;

              plk0 += ns;
              plk1 += (n_v1->tax) ? 0 : ns;
              plk2 += (n_v2->tax) ? 0 : ns;
            }

          plk1 += (n_v1->tax) ? ns : 0;
          plk2 += (n_v2->tax) ? ns : 0;

          if(tree->scaling_method == SCALE_FAST)
            {
              sum_scale_v1_val = (sum_scale_v1)?(sum_scale_v1[site]):(0);
              sum_scale_v2_val = (sum_scale_v2)?(sum_scale_v2[site]):(0);
              sum_scale[site] = sum_scale_v1_val + sum_scale_v2_val;

              plk0 -= ncatgns;
              largest_p_lk = -BIG;
              for(i=0;i<ncatgns;++i)
                if(plk0[i] > largest_p_lk)
                  largest_p_lk = plk0[i];

              if(largest_p_lk < INV_TWO_TO_THE_LARGE &&
                 tree->mod->augmented == NO &&
                 tree->apply_lk_scaling == YES)
                {
                  for(i=0;i<ncatgns;++i) plk0[i] *= TWO_TO_THE_LARGE;
                  sum_scale[site] += LARGE;
                }

              plk0 += ncatgns;
            }
        }
      else
        {
          plk0 += ncatgns;
          plk1 += (n_v1->tax) ? ns : ncatgns;
          plk2 += (n_v2->tax) ? ns : ncatgns;
        }
    }

  Free(_init_tPij1);
  Free(_init_tPij2);
  Free(_pmat1plk1);
  Free(_pmat2plk2);
  Free(_plk0);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

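/* Both descendants are tips with unambiguous states: the updated
   partial likelihood for state j is Pij1[j][state1] * Pij2[j][state2],
   fetched as rows state1 and state2 of the transposed matrices
   already packed in AVX blocks. */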
void AVX_Partial_Lk_Exex(const __m256d *_tPij1, const int state1, const __m256d *_tPij2, const int state2, const int ns, __m256d *plk0)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i;

  _tPij1 = _tPij1 + state1 * nblocks;
  _tPij2 = _tPij2 + state2 * nblocks;
  for(i=0;i<nblocks;++i) plk0[i] = _mm256_mul_pd(_tPij1[i],_tPij2[i]);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

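/* One descendant is a tip with unambiguous state1 (matrix _tPij1),
   the other is internal: the updated partial likelihood for state j
   is Pij1[j][state1] * (Pij2 x _plk2)[j], the matrix-vector product
   being computed by AVX_Matrix_Vect_Prod() and cached in _pmat2plk2. */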
void AVX_Partial_Lk_Exin(const __m256d *_tPij1, const int state1, const __m256d *_tPij2, const phydbl *_plk2, __m256d *_pmat2plk2, const int ns, __m256d *_plk0)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i;

  _tPij1 = _tPij1 + state1 * nblocks;
  AVX_Matrix_Vect_Prod(_tPij2,_plk2,ns,_pmat2plk2);

  for(i=0;i<nblocks;++i) _plk0[i] = _mm256_mul_pd(_tPij1[i],_pmat2plk2[i]);
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

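/* Both descendants are internal nodes. The initial scan looks for any
   entry of plk1 or plk2 that differs from 1.0 (written as two strict
   inequalities to avoid a floating-point equality test): if every
   entry is exactly one, the result is the all-ones vector — each row
   of a transition matrix sums to one — and both matrix-vector
   products are skipped. Otherwise,
   _plk0[j] = (Pij1 x plk1)[j] * (Pij2 x plk2)[j]. */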
void AVX_Partial_Lk_Inin(const __m256d *_tPij1, const phydbl *plk1, __m256d *_pmat1plk1, const __m256d *_tPij2, const phydbl *plk2, __m256d *_pmat2plk2, const int ns, __m256d *_plk0)
{
  unsigned int i;
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;

  for(i=0;i<ns;++i) if(plk1[i] > 1.0 || plk1[i] < 1.0 || plk2[i] > 1.0 || plk2[i] < 1.0) break;

  if(i != ns)
    {
      AVX_Matrix_Vect_Prod(_tPij1,plk1,ns,_pmat1plk1);
      AVX_Matrix_Vect_Prod(_tPij2,plk2,ns,_pmat2plk2);

      for(i=0;i<nblocks;++i) _plk0[i] = _mm256_mul_pd(_pmat1plk1[i],_pmat2plk2[i]);
    }
  else
    {
      for(i=0;i<nblocks;++i) _plk0[i] = _mm256_set1_pd(1.0);
    }
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

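/* Computes _u = M x _v for an ns x ns matrix supplied in transposed,
   AVX-packed form: row i of the transpose (i.e. column i of M) is
   scaled by _v[i] and accumulated into _u, using fused multiply-add
   where available. _u must hold ns/sz blocks. */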
void AVX_Matrix_Vect_Prod(const __m256d *_m_transpose, const phydbl *_v, const int ns, __m256d *_u)
{
  unsigned const int sz = (int)BYTE_ALIGN / 8;
  unsigned const int nblocks = ns / sz;
  unsigned int i,j;
  __m256d _x;

  for(i=0;i<nblocks;++i) _u[i] = _mm256_setzero_pd();

  for(i=0;i<ns;++i)
    {
      _x = _mm256_set1_pd(_v[i]);
      for(j=0;j<nblocks;++j)
        {
#if (defined(__FMA__))
          _u[j] = _mm256_fmadd_pd(_m_transpose[j],_x,_u[j]);
#else
          _u[j] = _mm256_add_pd(_u[j],_mm256_mul_pd(_m_transpose[j],_x));
#endif
        }
      _m_transpose = _m_transpose + nblocks;
    }
}

//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////

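/* 4x4 transpose-and-add reduction: returns a vector whose k-th
   element is the horizontal sum of the four doubles in x[k]
   (e.g., if every element of x[k] equals k, the result is
   [0;4;8;12]). Two hadd steps build pairwise sums, a blend and a
   cross-lane permute line the partial sums up, and a final add
   completes all four reductions at once. */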
__m256d AVX_Horizontal_Add(const __m256d x[4])
{
  __m256d y[2],z[2];

  // y[0] = [x00+x01;x10+x11;x02+x03;x12+x13]
  y[0] = _mm256_hadd_pd(x[0], x[1]);
  // y[1] = [x20+x21;x30+x31;x22+x23;x32+x33]
  y[1] = _mm256_hadd_pd(x[2], x[3]);

  // z[0] = [x00+x01;x10+x11;x22+x23;x32+x33] (blend mask 12 == 0b1100)
  z[0] = _mm256_blend_pd(y[0],y[1],12);
  // z[1] = [x02+x03;x12+x13;x20+x21;x30+x31]
  z[1] = _mm256_permute2f128_pd(y[0],y[1],0x21);

  return(_mm256_add_pd(z[0],z[1]));
}

#endif