1 /*
2 * phylokernelsitemodel.h
3 * optimize SIMD likelihood function for site-specific model
4 * Created on: Jan 12, 2016
5 * Author: minh
6 */
7
8 #ifndef PHYLOKERNELSITEMODEL_H_
9 #define PHYLOKERNELSITEMODEL_H_
10
11 #include "phylotree.h"
12 #include "model/modelset.h"
13
horizontal_add(double x)14 inline double horizontal_add(double x) {
15 return x;
16 }
17
18 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad)19 void PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
20
21 if (dad_branch->node->degree() > 3) {
22 // TODO: SIMD version for multifurcating node
23 computeSitemodelPartialLikelihoodEigen(dad_branch, dad);
24 return;
25 }
26
27 // don't recompute the likelihood
28 ASSERT(dad);
29 if (dad_branch->partial_lh_computed & 1)
30 return;
31 dad_branch->partial_lh_computed |= 1;
32 PhyloNode *node = (PhyloNode*)(dad_branch->node);
33
34 size_t nptn = aln->size(), tip_block_size = get_safe_upper_limit(nptn)*nstates;
35 size_t ptn, c;
36 size_t ncat = site_rate->getNRate();
37 size_t i, x, j;
38 size_t block = nstates * ncat;
39 ModelSet *models = (ModelSet*) model;
40 ASSERT(models->size() == nptn);
41
42
43 if (node->isLeaf()) {
44 dad_branch->lh_scale_factor = 0.0;
45 // scale number must be ZERO
46 // memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
47
48 if (!tip_partial_lh_computed)
49 computeTipPartialLikelihood();
50
51 return;
52 }
53
54 dad_branch->lh_scale_factor = 0.0;
55
56 // internal node
57 PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
58 FOR_NEIGHBOR_IT(node, dad, it) {
59 PhyloNeighbor *nei = (PhyloNeighbor*)*it;
60 if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
61 if ((nei->partial_lh_computed & 1) == 0)
62 computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(nei, node);
63 dad_branch->lh_scale_factor += nei->lh_scale_factor;
64 }
65
66 if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
67 // re-orient partial_lh
68 bool done = false;
69 FOR_NEIGHBOR_IT(node, dad, it2) {
70 PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
71 if (backnei->partial_lh) {
72 dad_branch->partial_lh = backnei->partial_lh;
73 dad_branch->scale_num = backnei->scale_num;
74 backnei->partial_lh = NULL;
75 backnei->scale_num = NULL;
76 backnei->partial_lh_computed &= ~1; // clear bit
77 done = true;
78 break;
79 }
80 }
81 ASSERT(done && "partial_lh is not re-oriented");
82 }
83
84
85 double sum_scale = 0.0;
86
87 if (!left->node->isLeaf() && right->node->isLeaf()) {
88 PhyloNeighbor *tmp = left;
89 left = right;
90 right = tmp;
91 }
92
93 ASSERT(node->degree() == 3); // does not work with multifurcating tree yet
94
95 if (left->node->isLeaf() && right->node->isLeaf()) {
96
97 /*--------------------- TIP-TIP (cherry) case ------------------*/
98
99 double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
100 double *tip_partial_lh_right = tip_partial_lh + (right->node->id * tip_block_size);
101
102 // scale number must be ZERO
103 memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
104
105 #ifdef _OPENMP
106 #pragma omp parallel for private(ptn, c, x, i, j) schedule(static)
107 #endif
108 for (ptn = 0; ptn < nptn; ptn++) {
109 VectorClass partial_lh_tmp[nstates/VCSIZE];
110 VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
111 VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
112 VectorClass *partial_lh_right = (VectorClass*)(tip_partial_lh_right + ptn*nstates);
113
114 VectorClass expleft[nstates/VCSIZE];
115 VectorClass expright[nstates/VCSIZE];
116 VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
117 VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
118 VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
119 VectorClass vleft[VCSIZE];
120 VectorClass vright[VCSIZE];
121 VectorClass res[VCSIZE];
122 VectorClass len_left, len_right;
123 VectorClass *this_evec;
124
125 for (c = 0; c < ncat; c++) {
126 len_left = site_rate->getRate(c) * left->length;
127 len_right = site_rate->getRate(c) * right->length;
128 for (i = 0; i < nstates/VCSIZE; i++) {
129 expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
130 expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
131 }
132 // compute real partial likelihood vector
133 this_evec = evec;
134 for (x = 0; x < nstates/VCSIZE; x++) {
135 for (j = 0; j < VCSIZE; j++) {
136 vleft[j] = 0.0;
137 vright[j] = 0.0;
138 for (i = 0; i < nstates/VCSIZE; i++) {
139 vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
140 vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
141 }
142 this_evec += nstates/VCSIZE;
143 }
144 partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
145 }
146
147 // compute dot-product with inv_eigenvector
148 this_evec = inv_evec;
149 for (i = 0; i < nstates/VCSIZE; i++) {
150 for (j = 0; j < VCSIZE; j++) {
151 res[j] = 0.0;
152 for (x = 0; x < nstates/VCSIZE; x++) {
153 res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
154 }
155 this_evec += nstates/VCSIZE;
156 }
157 partial_lh[i] = horizontal_add(res);
158 }
159
160 // partial_lh_left += nstates/VCSIZE;
161 // partial_lh_right += nstates/VCSIZE;
162 partial_lh += nstates/VCSIZE;
163 }
164 }
165
166 } else if (left->node->isLeaf() && !right->node->isLeaf()) {
167
168 /*--------------------- TIP-INTERNAL NODE case ------------------*/
169
170 double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
171
172 // only take scale_num from the right subtree
173 memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
174
175 #ifdef _OPENMP
176 #pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
177 #endif
178 for (ptn = 0; ptn < nptn; ptn++) {
179 VectorClass partial_lh_tmp[nstates/VCSIZE];
180 VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
181 VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
182 VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
183 VectorClass lh_max = 0.0;
184
185 VectorClass expleft[nstates/VCSIZE];
186 VectorClass expright[nstates/VCSIZE];
187 VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
188 VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
189 VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
190 VectorClass vleft[VCSIZE];
191 VectorClass vright[VCSIZE];
192 VectorClass res[VCSIZE];
193 VectorClass len_left, len_right;
194 VectorClass *this_evec;
195
196 for (c = 0; c < ncat; c++) {
197 len_left = site_rate->getRate(c) * left->length;
198 len_right = site_rate->getRate(c) * right->length;
199 for (i = 0; i < nstates/VCSIZE; i++) {
200 expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
201 expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
202 }
203 // compute real partial likelihood vector
204 this_evec = evec;
205 for (x = 0; x < nstates/VCSIZE; x++) {
206 for (j = 0; j < VCSIZE; j++) {
207 vleft[j] = 0.0;
208 vright[j] = 0.0;
209 for (i = 0; i < nstates/VCSIZE; i++) {
210 vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
211 vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
212 }
213 this_evec += nstates/VCSIZE;
214 }
215 partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
216 }
217
218 // compute dot-product with inv_eigenvector
219 this_evec = inv_evec;
220 for (i = 0; i < nstates/VCSIZE; i++) {
221 for (j = 0; j < VCSIZE; j++) {
222 res[j] = 0.0;
223 for (x = 0; x < nstates/VCSIZE; x++) {
224 res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
225 }
226 this_evec += nstates/VCSIZE;
227 }
228 lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
229 }
230
231 // partial_lh_left += nstates/VCSIZE;
232 partial_lh_right += nstates/VCSIZE;
233 partial_lh += nstates/VCSIZE;
234 }
235
236 // check if one should scale partial likelihoods
237 double dmax = horizontal_max(lh_max);
238 if (dmax < SCALING_THRESHOLD) {
239 partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
240 if (dmax == 0.0) {
241 // for very shitty data
242 for (c = 0; c < ncat; c++)
243 memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
244 sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
245 //sum_scale += log(lh_max) * ptn_freq[ptn];
246 dad_branch->scale_num[ptn] += 4;
247 int nsite = aln->getNSite();
248 for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
249 if (aln->getPatternID(i) == ptn) {
250 outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
251 x++;
252 }
253 } else {
254 // now do the likelihood scaling
255 for (i = 0; i < block/VCSIZE; i++) {
256 partial_lh[i] *= SCALING_THRESHOLD_INVER;
257 //partial_lh[i] /= lh_max;
258 }
259 // unobserved const pattern will never have underflow
260 sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
261 //sum_scale += log(lh_max) * ptn_freq[ptn];
262 dad_branch->scale_num[ptn] += 1;
263 }
264 }
265
266 }
267 dad_branch->lh_scale_factor += sum_scale;
268
269
270 } else {
271 /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
272
273 #ifdef _OPENMP
274 #pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
275 #endif
276 for (ptn = 0; ptn < nptn; ptn++) {
277 VectorClass partial_lh_tmp[nstates/VCSIZE];
278 VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
279 VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block);
280 VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
281 VectorClass lh_max = 0.0;
282
283 VectorClass expleft[nstates/VCSIZE];
284 VectorClass expright[nstates/VCSIZE];
285 VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
286 VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
287 VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
288 VectorClass vleft[VCSIZE];
289 VectorClass vright[VCSIZE];
290 VectorClass res[VCSIZE];
291 VectorClass len_left, len_right;
292 VectorClass *this_evec;
293
294 dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
295
296 for (c = 0; c < ncat; c++) {
297 len_left = site_rate->getRate(c) * left->length;
298 len_right = site_rate->getRate(c) * right->length;
299 for (i = 0; i < nstates/VCSIZE; i++) {
300 expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
301 expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
302 }
303 // compute real partial likelihood vector
304 this_evec = evec;
305 for (x = 0; x < nstates/VCSIZE; x++) {
306 for (j = 0; j < VCSIZE; j++) {
307 vleft[j] = 0.0;
308 vright[j] = 0.0;
309 // this_evec = evec + (x*VCSIZE+j)*nstates/VCSIZE;
310 for (i = 0; i < nstates/VCSIZE; i++) {
311 vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
312 vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
313 }
314 this_evec += nstates/VCSIZE;
315 }
316 partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
317 }
318
319 // compute dot-product with inv_eigenvector
320 this_evec = inv_evec;
321 for (i = 0; i < nstates/VCSIZE; i++) {
322 for (j = 0; j < VCSIZE; j++) {
323 res[j] = 0.0;
324 for (x = 0; x < nstates/VCSIZE; x++) {
325 res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
326 }
327 this_evec += nstates/VCSIZE;
328 }
329 lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
330 }
331
332 partial_lh_left += nstates/VCSIZE;
333 partial_lh_right += nstates/VCSIZE;
334 partial_lh += nstates/VCSIZE;
335 }
336
337 // check if one should scale partial likelihoods
338 double dmax = horizontal_max(lh_max);
339 if (dmax < SCALING_THRESHOLD) {
340 partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
341 if (dmax == 0.0) {
342 // for very shitty data
343 for (c = 0; c < ncat; c++)
344 memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
345 sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
346 //sum_scale += log(lh_max) * ptn_freq[ptn];
347 dad_branch->scale_num[ptn] += 4;
348 int nsite = aln->getNSite();
349 for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
350 if (aln->getPatternID(i) == ptn) {
351 outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
352 x++;
353 }
354 } else {
355 // now do the likelihood scaling
356 for (i = 0; i < block/VCSIZE; i++) {
357 partial_lh[i] *= SCALING_THRESHOLD_INVER;
358 //partial_lh[i] /= lh_max;
359 }
360 // unobserved const pattern will never have underflow
361 sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
362 //sum_scale += log(lh_max) * ptn_freq[ptn];
363 dad_branch->scale_num[ptn] += 1;
364 }
365 }
366
367 }
368 dad_branch->lh_scale_factor += sum_scale;
369
370 }
371
372 }
373
374 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad,double & df,double & ddf)375 void PhyloTree::computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
376 PhyloNode *node = (PhyloNode*) dad_branch->node;
377 PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
378 if (!central_partial_lh)
379 initializeAllPartialLh();
380 if (node->isLeaf()) {
381 PhyloNode *tmp_node = dad;
382 dad = node;
383 node = tmp_node;
384 PhyloNeighbor *tmp_nei = dad_branch;
385 dad_branch = node_branch;
386 node_branch = tmp_nei;
387 }
388 if ((dad_branch->partial_lh_computed & 1) == 0)
389 computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(dad_branch, dad);
390 if ((node_branch->partial_lh_computed & 1) == 0)
391 computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(node_branch, node);
392
393 // size_t nstates = aln->num_states;
394 size_t ncat = site_rate->getNRate();
395
396 size_t block = ncat * nstates;
397 size_t ptn; // for big data size > 4GB memory required
398 size_t c, i, j;
399 size_t nptn = aln->size();
400 size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
401
402 ASSERT(theta_all);
403 if (!theta_computed) {
404 // precompute theta for fast branch length optimization
405
406 if (dad->isLeaf()) {
407 // special treatment for TIP-INTERNAL NODE case
408
409 double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
410
411 #ifdef _OPENMP
412 #pragma omp parallel for private(ptn, i, c) schedule(static)
413 #endif
414 for (ptn = 0; ptn < nptn; ptn++) {
415 VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
416 VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
417 VectorClass *lh_tip = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
418 for (c = 0; c < ncat; c++) {
419 for (i = 0; i < nstates/VCSIZE; i++) {
420 theta[i] = lh_tip[i] * partial_lh_dad[i];
421 }
422 partial_lh_dad += nstates/VCSIZE;
423 theta += nstates/VCSIZE;
424 }
425
426 }
427 } else
428 {
429 // both dad and node are internal nodes
430
431 size_t block_VCSIZE = block/VCSIZE;
432
433 // size_t all_entries = nptn*block;
434 #ifdef _OPENMP
435 #pragma omp parallel for private(ptn, i) schedule(static)
436 #endif
437 for (ptn = 0; ptn < nptn; ptn++) {
438 VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
439 VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
440 VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
441 for (i = 0; i < block_VCSIZE; i++) {
442 theta[i] = partial_lh_node[i] * partial_lh_dad[i];
443 }
444 }
445 }
446 if (nptn < maxptn) {
447 // copy dummy values
448 for (ptn = nptn; ptn < maxptn; ptn++)
449 memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
450 }
451 theta_computed = true;
452 }
453
454 ModelSet *models = (ModelSet*)model;
455 VectorClass my_df = 0.0, my_ddf = 0.0;
456 VectorClass dad_length = dad_branch->length;
457 VectorClass unit = 1.0;
458
459 #ifdef _OPENMP
460 #pragma omp parallel private(ptn, i, c, j)
461 {
462 VectorClass my_df_thread = 0.0;
463 VectorClass my_ddf_thread = 0.0;
464 #pragma omp for nowait schedule(static)
465 #endif
466 for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
467 VectorClass lh_ptn[VCSIZE];
468 VectorClass df_ptn[VCSIZE];
469 VectorClass ddf_ptn[VCSIZE];
470 VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
471 VectorClass* eval;
472
473 for (j = 0; j < VCSIZE; j++) {
474 lh_ptn[j] = 0.0;
475 df_ptn[j] = 0.0;
476 ddf_ptn[j] = 0.0;
477 if (ptn+j < nptn) {
478 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
479 } else {
480 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
481 }
482 for (c = 0; c < ncat; c++) {
483 VectorClass cat_rate = site_rate->getRate(c);
484 VectorClass lh_cat = 0.0, df_cat = 0.0, ddf_cat = 0.0;
485 for (i = 0; i < nstates/VCSIZE; i++) {
486 VectorClass cof = eval[i]*cat_rate;
487 VectorClass val = exp(cof*dad_length) * theta[i];
488 VectorClass val1 = cof*val;
489 lh_cat += val;
490 df_cat += val1;
491 ddf_cat = mul_add(cof, val1, ddf_cat);
492 }
493 VectorClass prop = site_rate->getProp(c);
494 lh_ptn[j] = mul_add(prop, lh_cat, lh_ptn[j]);
495 df_ptn[j] = mul_add(prop, df_cat, df_ptn[j]);
496 ddf_ptn[j] = mul_add(prop, ddf_cat, ddf_ptn[j]);
497 theta += nstates/VCSIZE;
498 }
499 }
500
501 VectorClass inv_lh_ptn = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
502 inv_lh_ptn = unit / abs(inv_lh_ptn);
503 VectorClass freq;
504 freq.load_a(&ptn_freq[ptn]);
505
506 VectorClass df_ptn_sum = horizontal_add(df_ptn) * inv_lh_ptn;
507 VectorClass ddf_ptn_sum = horizontal_add(ddf_ptn) * inv_lh_ptn;
508 ddf_ptn_sum = nmul_add(df_ptn_sum, df_ptn_sum, ddf_ptn_sum);
509
510 #ifdef _OPENMP
511 my_df_thread = mul_add(df_ptn_sum, freq, my_df_thread);
512 my_ddf_thread = mul_add(ddf_ptn_sum, freq, my_ddf_thread);
513 #else
514 my_df = mul_add(df_ptn_sum, freq, my_df);
515 my_ddf = mul_add(ddf_ptn_sum, freq, my_ddf);
516 #endif
517 } // for loop
518
519 #ifdef _OPENMP
520 #pragma omp critical
521 {
522 my_df += my_df_thread;
523 my_ddf += my_ddf_thread;
524 }
525 }
526 #endif
527
528 df = horizontal_add(my_df);
529 ddf = horizontal_add(my_ddf);
530 if (isnan(df) || isinf(df)) {
531 df = 0.0;
532 ddf = 0.0;
533 outWarning("Numerical instability (some site-likelihood = 0)");
534 }
535
536 }
537
538 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad)539 double PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
540 PhyloNode *node = (PhyloNode*) dad_branch->node;
541 PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
542 if (!central_partial_lh)
543 initializeAllPartialLh();
544 if (node->isLeaf()) {
545 PhyloNode *tmp_node = dad;
546 dad = node;
547 node = tmp_node;
548 PhyloNeighbor *tmp_nei = dad_branch;
549 dad_branch = node_branch;
550 node_branch = tmp_nei;
551 }
552 if ((dad_branch->partial_lh_computed & 1) == 0)
553 computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
554 if ((node_branch->partial_lh_computed & 1) == 0)
555 computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
556 size_t ncat = site_rate->getNRate();
557
558 size_t block = ncat * nstates;
559 size_t ptn; // for big data size > 4GB memory required
560 size_t c, i, j;
561 size_t nptn = aln->size();
562 size_t maxptn = get_safe_upper_limit(nptn);
563
564 ModelSet *models = (ModelSet*)model;
565 VectorClass tree_lh = 0.0;
566 VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
567 VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
568 for (c = 0; c < ncat; c++) {
569 cat_length[c] = site_rate->getRate(c) * dad_branch->length;
570 cat_prop[c] = site_rate->getProp(c);
571 }
572
573 if (dad->isLeaf()) {
574 // copy dummy values because VectorClass will access beyond nptn
575 for (ptn = nptn; ptn < maxptn; ptn++)
576 memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
577
578 // special treatment for TIP-INTERNAL NODE case
579 double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
580 #ifdef _OPENMP
581 #pragma omp parallel private(ptn, i, c, j)
582 {
583 VectorClass tree_lh_thread = 0.0;
584 #pragma omp for nowait schedule(static)
585 #endif
586 for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
587 VectorClass lh_ptn[VCSIZE];
588 VectorClass* eval;
589 VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
590 VectorClass *partial_lh_node = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
591
592 for (j = 0; j < VCSIZE; j++) {
593 lh_ptn[j] = 0.0;
594 if (ptn+j < nptn) {
595 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
596 } else {
597 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
598 }
599 for (c = 0; c < ncat; c++) {
600 VectorClass lh_cat = 0.0;
601 for (i = 0; i < nstates/VCSIZE; i++) {
602 lh_cat=mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
603 }
604 lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
605 partial_lh_dad += nstates/VCSIZE;
606 // partial_lh_node += nstates/VCSIZE;
607 }
608 partial_lh_node += nstates/VCSIZE;
609 }
610
611 VectorClass freq;
612 freq.load_a(&ptn_freq[ptn]);
613 VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
614 lh_ptn_sum = log(abs(lh_ptn_sum));
615 lh_ptn_sum.store_a(&_pattern_lh[ptn]);
616 #ifdef _OPENMP
617 tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
618 #else
619 tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
620 #endif
621 } // for loop
622
623 #ifdef _OPENMP
624 #pragma omp critical
625 {
626 tree_lh += tree_lh_thread;
627 }
628 }
629 #endif
630
631 } else
632 {
633 // both dad and node are internal nodes
634 // copy dummy values because VectorClass will access beyond nptn
635 for (ptn = nptn; ptn < maxptn; ptn++) {
636 memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
637 memcpy(&node_branch->partial_lh[ptn*block], &node_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
638 }
639 #ifdef _OPENMP
640 #pragma omp parallel private(ptn, i, c, j)
641 {
642 VectorClass tree_lh_thread = 0.0;
643 #pragma omp for nowait schedule(static)
644 #endif
645 for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
646 VectorClass lh_ptn[VCSIZE];
647 VectorClass* eval;
648 VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
649 VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
650
651 for (j = 0; j < VCSIZE; j++) {
652 lh_ptn[j] = 0.0;
653 if (ptn+j < nptn) {
654 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
655 } else {
656 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
657 }
658 for (c = 0; c < ncat; c++) {
659 VectorClass lh_cat = 0.0;
660 for (i = 0; i < nstates/VCSIZE; i++) {
661 lh_cat = mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
662 }
663 lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
664 partial_lh_dad += nstates/VCSIZE;
665 partial_lh_node += nstates/VCSIZE;
666 }
667 }
668
669 VectorClass freq;
670 freq.load_a(&ptn_freq[ptn]);
671 VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
672 lh_ptn_sum = log(abs(lh_ptn_sum));
673 lh_ptn_sum.store_a(&_pattern_lh[ptn]);
674 #ifdef _OPENMP
675 tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
676 #else
677 tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
678 #endif
679 } // for loop
680
681 #ifdef _OPENMP
682 #pragma omp critical
683 {
684 tree_lh += tree_lh_thread;
685 }
686 }
687 #endif
688
689
690 }
691
692 double tree_lh_final = horizontal_add(tree_lh) + node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
693
694 if (isnan(tree_lh_final) || isinf(tree_lh_final)) {
695 cout << "WARNING: Numerical underflow caused by alignment sites";
696 i = aln->getNSite();
697 int j;
698 for (j = 0, c = 0; j < i; j++) {
699 ptn = aln->getPatternID(j);
700 if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
701 cout << " " << j+1;
702 c++;
703 if (c >= 10) {
704 cout << " ...";
705 break;
706 }
707 }
708 }
709 cout << endl;
710 tree_lh_final = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
711 for (ptn = 0; ptn < nptn; ptn++) {
712 if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
713 _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
714 }
715 tree_lh_final += _pattern_lh[ptn] * ptn_freq[ptn];
716 }
717 }
718
719 ASSERT(!isnan(tree_lh_final) && !isinf(tree_lh_final));
720
721 aligned_free(cat_prop);
722 aligned_free(cat_length);
723
724 return tree_lh_final;
725 }
726
727
728 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodFromBufferEigenSIMD()729 double PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD() {
730 ASSERT(theta_all && theta_computed);
731
732 // size_t nstates = aln->num_states;
733 size_t ncat = site_rate->getNRate();
734
735 size_t block = ncat * nstates;
736 size_t ptn; // for big data size > 4GB memory required
737 size_t c, i, j;
738 size_t nptn = aln->size();
739
740 ModelSet *models = (ModelSet*)model;
741
742 VectorClass tree_lh = 0.0;
743 VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
744 VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
745 for (c = 0; c < ncat; c++) {
746 cat_length[c] = site_rate->getRate(c) * current_it->length;
747 cat_prop[c] = site_rate->getProp(c);
748 }
749
750 #ifdef _OPENMP
751 #pragma omp parallel private(ptn, i, c, j)
752 {
753 VectorClass tree_lh_thread = 0.0;
754 #pragma omp for nowait schedule(static)
755 #endif
756 for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
757 VectorClass lh_ptn[VCSIZE];
758 VectorClass* eval;
759 VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
760
761 for (j = 0; j < VCSIZE; j++) {
762 lh_ptn[j] = 0.0;
763 if (ptn+j < nptn) {
764 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
765 } else {
766 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
767 }
768 for (c = 0; c < ncat; c++) {
769 VectorClass lh_cat = 0.0;
770 for (i = 0; i < nstates/VCSIZE; i++) {
771 lh_cat = mul_add(exp(eval[i]*cat_length[c]), theta[i], lh_cat);
772 }
773 lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
774 theta += nstates/VCSIZE;
775 }
776 }
777
778 VectorClass freq;
779 freq.load_a(&ptn_freq[ptn]);
780 VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
781 lh_ptn_sum = log(abs(lh_ptn_sum));
782 lh_ptn_sum.store_a(&_pattern_lh[ptn]);
783 #ifdef _OPENMP
784 tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
785 #else
786 tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
787 #endif
788 } // for loop
789
790 #ifdef _OPENMP
791 #pragma omp critical
792 {
793 tree_lh += tree_lh_thread;
794 }
795 }
796 #endif
797
798 double tree_lh_final = horizontal_add(tree_lh) + current_it->lh_scale_factor + current_it_back->lh_scale_factor;
799
800 aligned_free(cat_prop);
801 aligned_free(cat_length);
802
803 return tree_lh_final;
804 }
805
806
807 #endif /* PHYLOKERNELSITEMODEL_H_ */
808