1 /*
2  * phylokernelsitemodel.h
3  * optimize SIMD likelihood function for site-specific model
4  *  Created on: Jan 12, 2016
5  *      Author: minh
6  */
7 
8 #ifndef PHYLOKERNELSITEMODEL_H_
9 #define PHYLOKERNELSITEMODEL_H_
10 
11 #include "phylotree.h"
12 #include "model/modelset.h"
13 
horizontal_add(double x)14 inline double horizontal_add(double x) {
15     return x;
16 }
17 
18 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad)19 void PhyloTree::computeSitemodelPartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
20 
21     if (dad_branch->node->degree() > 3) {
22         // TODO: SIMD version for multifurcating node
23         computeSitemodelPartialLikelihoodEigen(dad_branch, dad);
24         return;
25     }
26 
27     // don't recompute the likelihood
28 	ASSERT(dad);
29     if (dad_branch->partial_lh_computed & 1)
30         return;
31     dad_branch->partial_lh_computed |= 1;
32     PhyloNode *node = (PhyloNode*)(dad_branch->node);
33 
34     size_t nptn = aln->size(), tip_block_size = get_safe_upper_limit(nptn)*nstates;
35     size_t ptn, c;
36     size_t ncat = site_rate->getNRate();
37     size_t i, x, j;
38     size_t block = nstates * ncat;
39     ModelSet *models = (ModelSet*) model;
40     ASSERT(models->size() == nptn);
41 
42 
43 	if (node->isLeaf()) {
44 	    dad_branch->lh_scale_factor = 0.0;
45 		// scale number must be ZERO
46 //	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
47 
48 		if (!tip_partial_lh_computed)
49 			computeTipPartialLikelihood();
50 
51 		return;
52 	}
53 
54     dad_branch->lh_scale_factor = 0.0;
55 
56 	// internal node
57 	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
58 	FOR_NEIGHBOR_IT(node, dad, it) {
59         PhyloNeighbor *nei = (PhyloNeighbor*)*it;
60 		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
61         if ((nei->partial_lh_computed & 1) == 0)
62             computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(nei, node);
63         dad_branch->lh_scale_factor += nei->lh_scale_factor;
64 	}
65 
66     if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
67         // re-orient partial_lh
68         bool done = false;
69         FOR_NEIGHBOR_IT(node, dad, it2) {
70             PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
71             if (backnei->partial_lh) {
72                 dad_branch->partial_lh = backnei->partial_lh;
73                 dad_branch->scale_num = backnei->scale_num;
74                 backnei->partial_lh = NULL;
75                 backnei->scale_num = NULL;
76                 backnei->partial_lh_computed &= ~1; // clear bit
77                 done = true;
78                 break;
79             }
80         }
81         ASSERT(done && "partial_lh is not re-oriented");
82     }
83 
84 
85     double sum_scale = 0.0;
86 
87 	if (!left->node->isLeaf() && right->node->isLeaf()) {
88 		PhyloNeighbor *tmp = left;
89 		left = right;
90 		right = tmp;
91 	}
92 
93     ASSERT(node->degree() == 3); // does not work with multifurcating tree yet
94 
95     if (left->node->isLeaf() && right->node->isLeaf()) {
96 
97         /*--------------------- TIP-TIP (cherry) case ------------------*/
98 
99         double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
100         double *tip_partial_lh_right = tip_partial_lh + (right->node->id * tip_block_size);
101 
102 		// scale number must be ZERO
103 	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
104 
105 #ifdef _OPENMP
106 #pragma omp parallel for private(ptn, c, x, i, j) schedule(static)
107 #endif
108 		for (ptn = 0; ptn < nptn; ptn++) {
109 			VectorClass partial_lh_tmp[nstates/VCSIZE];
110 			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
111 			VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
112 			VectorClass *partial_lh_right = (VectorClass*)(tip_partial_lh_right + ptn*nstates);
113 
114             VectorClass expleft[nstates/VCSIZE];
115             VectorClass expright[nstates/VCSIZE];
116             VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
117             VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
118             VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
119             VectorClass vleft[VCSIZE];
120             VectorClass vright[VCSIZE];
121             VectorClass res[VCSIZE];
122             VectorClass len_left, len_right;
123             VectorClass *this_evec;
124 
125 			for (c = 0; c < ncat; c++) {
126                 len_left = site_rate->getRate(c) * left->length;
127                 len_right = site_rate->getRate(c) * right->length;
128                 for (i = 0; i < nstates/VCSIZE; i++) {
129                     expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
130                     expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
131                 }
132 				// compute real partial likelihood vector
133                 this_evec = evec;
134 				for (x = 0; x < nstates/VCSIZE; x++) {
135                     for (j = 0; j < VCSIZE; j++) {
136                         vleft[j] = 0.0;
137                         vright[j] = 0.0;
138                         for (i = 0; i < nstates/VCSIZE; i++) {
139                             vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
140                             vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
141                         }
142                         this_evec += nstates/VCSIZE;
143                     }
144 					partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
145 				}
146 
147 				// compute dot-product with inv_eigenvector
148                 this_evec = inv_evec;
149 				for (i = 0; i < nstates/VCSIZE; i++) {
150                     for (j = 0; j < VCSIZE; j++) {
151                         res[j] = 0.0;
152                         for (x = 0; x < nstates/VCSIZE; x++) {
153                             res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
154                         }
155                         this_evec += nstates/VCSIZE;
156                     }
157                     partial_lh[i] = horizontal_add(res);
158 				}
159 
160 //                partial_lh_left += nstates/VCSIZE;
161 //                partial_lh_right += nstates/VCSIZE;
162                 partial_lh += nstates/VCSIZE;
163 			}
164 		}
165 
166 	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
167 
168         /*--------------------- TIP-INTERNAL NODE case ------------------*/
169 
170         double *tip_partial_lh_left = tip_partial_lh + (left->node->id * tip_block_size);
171 
172 		// only take scale_num from the right subtree
173 		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
174 
175 #ifdef _OPENMP
176 #pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
177 #endif
178 		for (ptn = 0; ptn < nptn; ptn++) {
179 			VectorClass partial_lh_tmp[nstates/VCSIZE];
180 			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
181 			VectorClass *partial_lh_left = (VectorClass*)(tip_partial_lh_left + ptn*nstates);
182 			VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
183             VectorClass lh_max = 0.0;
184 
185             VectorClass expleft[nstates/VCSIZE];
186             VectorClass expright[nstates/VCSIZE];
187             VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
188             VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
189             VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
190             VectorClass vleft[VCSIZE];
191             VectorClass vright[VCSIZE];
192             VectorClass res[VCSIZE];
193             VectorClass len_left, len_right;
194             VectorClass *this_evec;
195 
196 			for (c = 0; c < ncat; c++) {
197                 len_left = site_rate->getRate(c) * left->length;
198                 len_right = site_rate->getRate(c) * right->length;
199                 for (i = 0; i < nstates/VCSIZE; i++) {
200                     expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
201                     expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
202                 }
203 				// compute real partial likelihood vector
204                 this_evec = evec;
205 				for (x = 0; x < nstates/VCSIZE; x++) {
206                     for (j = 0; j < VCSIZE; j++) {
207                         vleft[j] = 0.0;
208                         vright[j] = 0.0;
209                         for (i = 0; i < nstates/VCSIZE; i++) {
210                             vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
211                             vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
212                         }
213                         this_evec += nstates/VCSIZE;
214                     }
215 					partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
216 				}
217 
218 				// compute dot-product with inv_eigenvector
219                 this_evec = inv_evec;
220 				for (i = 0; i < nstates/VCSIZE; i++) {
221                     for (j = 0; j < VCSIZE; j++) {
222                         res[j] = 0.0;
223                         for (x = 0; x < nstates/VCSIZE; x++) {
224                             res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
225                         }
226                         this_evec += nstates/VCSIZE;
227                     }
228                     lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
229 				}
230 
231 //                partial_lh_left += nstates/VCSIZE;
232                 partial_lh_right += nstates/VCSIZE;
233                 partial_lh += nstates/VCSIZE;
234 			}
235 
236             // check if one should scale partial likelihoods
237             double dmax = horizontal_max(lh_max);
238             if (dmax < SCALING_THRESHOLD) {
239                 partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
240             	if (dmax == 0.0) {
241             		// for very shitty data
242             		for (c = 0; c < ncat; c++)
243             			memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
244 					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
245 					//sum_scale += log(lh_max) * ptn_freq[ptn];
246 					dad_branch->scale_num[ptn] += 4;
247 					int nsite = aln->getNSite();
248 					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
249 						if (aln->getPatternID(i) == ptn) {
250 							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
251 							x++;
252 						}
253             	} else {
254 					// now do the likelihood scaling
255 					for (i = 0; i < block/VCSIZE; i++) {
256 						partial_lh[i] *= SCALING_THRESHOLD_INVER;
257 	                    //partial_lh[i] /= lh_max;
258 					}
259 					// unobserved const pattern will never have underflow
260 					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
261 					//sum_scale += log(lh_max) * ptn_freq[ptn];
262 					dad_branch->scale_num[ptn] += 1;
263             	}
264             }
265 
266 		}
267 		dad_branch->lh_scale_factor += sum_scale;
268 
269 
270 	} else {
271         /*--------------------- INTERNAL-INTERNAL NODE case ------------------*/
272 
273 #ifdef _OPENMP
274 #pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, j) schedule(static)
275 #endif
276 		for (ptn = 0; ptn < nptn; ptn++) {
277 			VectorClass partial_lh_tmp[nstates/VCSIZE];
278 			VectorClass *partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
279 			VectorClass *partial_lh_left = (VectorClass*)(left->partial_lh + ptn*block);
280 			VectorClass *partial_lh_right = (VectorClass*)(right->partial_lh + ptn*block);
281             VectorClass lh_max = 0.0;
282 
283             VectorClass expleft[nstates/VCSIZE];
284             VectorClass expright[nstates/VCSIZE];
285             VectorClass *eval = (VectorClass*)(models->at(ptn)->getEigenvalues());
286             VectorClass *evec = (VectorClass*)(models->at(ptn)->getEigenvectors());
287             VectorClass *inv_evec = (VectorClass*)(models->at(ptn)->getInverseEigenvectors());
288             VectorClass vleft[VCSIZE];
289             VectorClass vright[VCSIZE];
290             VectorClass res[VCSIZE];
291             VectorClass len_left, len_right;
292             VectorClass *this_evec;
293 
294 			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
295 
296 			for (c = 0; c < ncat; c++) {
297                 len_left = site_rate->getRate(c) * left->length;
298                 len_right = site_rate->getRate(c) * right->length;
299                 for (i = 0; i < nstates/VCSIZE; i++) {
300                     expleft[i] = exp(eval[i]*len_left) * partial_lh_left[i];
301                     expright[i] = exp(eval[i]*len_right) * partial_lh_right[i];
302                 }
303 				// compute real partial likelihood vector
304                 this_evec = evec;
305 				for (x = 0; x < nstates/VCSIZE; x++) {
306                     for (j = 0; j < VCSIZE; j++) {
307                         vleft[j] = 0.0;
308                         vright[j] = 0.0;
309 //                        this_evec = evec + (x*VCSIZE+j)*nstates/VCSIZE;
310                         for (i = 0; i < nstates/VCSIZE; i++) {
311                             vleft[j] = mul_add(this_evec[i], expleft[i], vleft[j]);
312                             vright[j] = mul_add(this_evec[i], expright[i], vright[j]);
313                         }
314                         this_evec += nstates/VCSIZE;
315                     }
316 					partial_lh_tmp[x] = horizontal_add(vleft)*horizontal_add(vright);
317 				}
318 
319 				// compute dot-product with inv_eigenvector
320                 this_evec = inv_evec;
321 				for (i = 0; i < nstates/VCSIZE; i++) {
322                     for (j = 0; j < VCSIZE; j++) {
323                         res[j] = 0.0;
324                         for (x = 0; x < nstates/VCSIZE; x++) {
325                             res[j] = mul_add(partial_lh_tmp[x], this_evec[x], res[j]);
326                         }
327                         this_evec += nstates/VCSIZE;
328                     }
329                     lh_max = max(lh_max, abs(partial_lh[i] = horizontal_add(res)));
330 				}
331 
332                 partial_lh_left += nstates/VCSIZE;
333                 partial_lh_right += nstates/VCSIZE;
334                 partial_lh += nstates/VCSIZE;
335 			}
336 
337             // check if one should scale partial likelihoods
338             double dmax = horizontal_max(lh_max);
339             if (dmax < SCALING_THRESHOLD) {
340                 partial_lh = (VectorClass*)(dad_branch->partial_lh + ptn*block);
341             	if (dmax == 0.0) {
342             		// for very shitty data
343             		for (c = 0; c < ncat; c++)
344             			memcpy(&partial_lh[c*nstates/VCSIZE], &tip_partial_lh[ptn*nstates], nstates*sizeof(double));
345 					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
346 					//sum_scale += log(lh_max) * ptn_freq[ptn];
347 					dad_branch->scale_num[ptn] += 4;
348 					int nsite = aln->getNSite();
349 					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
350 						if (aln->getPatternID(i) == ptn) {
351 							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
352 							x++;
353 						}
354             	} else {
355 					// now do the likelihood scaling
356 					for (i = 0; i < block/VCSIZE; i++) {
357 						partial_lh[i] *= SCALING_THRESHOLD_INVER;
358 	                    //partial_lh[i] /= lh_max;
359 					}
360 					// unobserved const pattern will never have underflow
361 					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
362 					//sum_scale += log(lh_max) * ptn_freq[ptn];
363 					dad_branch->scale_num[ptn] += 1;
364             	}
365             }
366 
367 		}
368 		dad_branch->lh_scale_factor += sum_scale;
369 
370 	}
371 
372 }
373 
374 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad,double & df,double & ddf)375 void PhyloTree::computeSitemodelLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
376     PhyloNode *node = (PhyloNode*) dad_branch->node;
377     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
378     if (!central_partial_lh)
379         initializeAllPartialLh();
380     if (node->isLeaf()) {
381     	PhyloNode *tmp_node = dad;
382     	dad = node;
383     	node = tmp_node;
384     	PhyloNeighbor *tmp_nei = dad_branch;
385     	dad_branch = node_branch;
386     	node_branch = tmp_nei;
387     }
388     if ((dad_branch->partial_lh_computed & 1) == 0)
389         computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(dad_branch, dad);
390     if ((node_branch->partial_lh_computed & 1) == 0)
391         computeSitemodelPartialLikelihoodEigenSIMD<VectorClass,VCSIZE,nstates>(node_branch, node);
392 
393 //    size_t nstates = aln->num_states;
394     size_t ncat = site_rate->getNRate();
395 
396     size_t block = ncat * nstates;
397     size_t ptn; // for big data size > 4GB memory required
398     size_t c, i, j;
399     size_t nptn = aln->size();
400     size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
401 
402 	ASSERT(theta_all);
403 	if (!theta_computed) {
404 		// precompute theta for fast branch length optimization
405 
406 	    if (dad->isLeaf()) {
407 	    	// special treatment for TIP-INTERNAL NODE case
408 
409             double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
410 
411 #ifdef _OPENMP
412 #pragma omp parallel for private(ptn, i, c) schedule(static)
413 #endif
414 	    	for (ptn = 0; ptn < nptn; ptn++) {
415 				VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
416 				VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
417 				VectorClass *lh_tip = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
418                 for (c = 0; c < ncat; c++) {
419                     for (i = 0; i < nstates/VCSIZE; i++) {
420                         theta[i] = lh_tip[i] * partial_lh_dad[i];
421                     }
422                     partial_lh_dad += nstates/VCSIZE;
423                     theta += nstates/VCSIZE;
424                 }
425 
426 			}
427 	    } else
428         {
429 	    	// both dad and node are internal nodes
430 
431             size_t block_VCSIZE = block/VCSIZE;
432 
433 //	    	size_t all_entries = nptn*block;
434 #ifdef _OPENMP
435 #pragma omp parallel for private(ptn, i) schedule(static)
436 #endif
437 	    	for (ptn = 0; ptn < nptn; ptn++) {
438 				VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
439 				VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
440 			    VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
441 	    		for (i = 0; i < block_VCSIZE; i++) {
442 	    			theta[i] = partial_lh_node[i] * partial_lh_dad[i];
443 	    		}
444 			}
445 	    }
446 		if (nptn < maxptn) {
447 			// copy dummy values
448 			for (ptn = nptn; ptn < maxptn; ptn++)
449 				memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
450 		}
451 		theta_computed = true;
452 	}
453 
454     ModelSet *models = (ModelSet*)model;
455     VectorClass my_df = 0.0, my_ddf = 0.0;
456     VectorClass dad_length = dad_branch->length;
457     VectorClass unit = 1.0;
458 
459 #ifdef _OPENMP
460 #pragma omp parallel private(ptn, i, c, j)
461 {
462     VectorClass my_df_thread = 0.0;
463     VectorClass my_ddf_thread = 0.0;
464 #pragma omp for nowait schedule(static)
465 #endif
466     for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
467         VectorClass lh_ptn[VCSIZE];
468         VectorClass df_ptn[VCSIZE];
469         VectorClass ddf_ptn[VCSIZE];
470 		VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
471         VectorClass* eval;
472 
473         for (j = 0; j < VCSIZE; j++) {
474             lh_ptn[j] = 0.0;
475             df_ptn[j] = 0.0;
476             ddf_ptn[j] = 0.0;
477             if (ptn+j < nptn) {
478                 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
479             } else {
480                 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
481             }
482             for (c = 0; c < ncat; c++) {
483                 VectorClass cat_rate = site_rate->getRate(c);
484                 VectorClass lh_cat = 0.0, df_cat = 0.0, ddf_cat = 0.0;
485                 for (i = 0; i < nstates/VCSIZE; i++) {
486                     VectorClass cof = eval[i]*cat_rate;
487                     VectorClass val = exp(cof*dad_length) * theta[i];
488                     VectorClass val1 = cof*val;
489                     lh_cat += val;
490                     df_cat += val1;
491                     ddf_cat = mul_add(cof, val1, ddf_cat);
492                 }
493                 VectorClass prop = site_rate->getProp(c);
494                 lh_ptn[j] = mul_add(prop, lh_cat, lh_ptn[j]);
495                 df_ptn[j] = mul_add(prop, df_cat, df_ptn[j]);
496                 ddf_ptn[j] = mul_add(prop, ddf_cat, ddf_ptn[j]);
497                 theta += nstates/VCSIZE;
498             }
499         }
500 
501         VectorClass inv_lh_ptn = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
502         inv_lh_ptn = unit / abs(inv_lh_ptn);
503         VectorClass freq;
504         freq.load_a(&ptn_freq[ptn]);
505 
506         VectorClass df_ptn_sum = horizontal_add(df_ptn) * inv_lh_ptn;
507         VectorClass ddf_ptn_sum = horizontal_add(ddf_ptn) * inv_lh_ptn;
508         ddf_ptn_sum = nmul_add(df_ptn_sum, df_ptn_sum, ddf_ptn_sum);
509 
510 #ifdef _OPENMP
511         my_df_thread = mul_add(df_ptn_sum, freq, my_df_thread);
512         my_ddf_thread = mul_add(ddf_ptn_sum, freq, my_ddf_thread);
513 #else
514         my_df = mul_add(df_ptn_sum, freq, my_df);
515         my_ddf = mul_add(ddf_ptn_sum, freq, my_ddf);
516 #endif
517     } // for loop
518 
519 #ifdef _OPENMP
520 #pragma omp critical
521 	{
522 		my_df += my_df_thread;
523 		my_ddf += my_ddf_thread;
524 	}
525 }
526 #endif
527 
528 	df = horizontal_add(my_df);
529 	ddf = horizontal_add(my_ddf);
530     if (isnan(df) || isinf(df)) {
531         df = 0.0;
532         ddf = 0.0;
533         outWarning("Numerical instability (some site-likelihood = 0)");
534     }
535 
536 }
537 
538 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor * dad_branch,PhyloNode * dad)539 double PhyloTree::computeSitemodelLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
540     PhyloNode *node = (PhyloNode*) dad_branch->node;
541     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
542     if (!central_partial_lh)
543         initializeAllPartialLh();
544     if (node->isLeaf()) {
545     	PhyloNode *tmp_node = dad;
546     	dad = node;
547     	node = tmp_node;
548     	PhyloNeighbor *tmp_nei = dad_branch;
549     	dad_branch = node_branch;
550     	node_branch = tmp_nei;
551     }
552     if ((dad_branch->partial_lh_computed & 1) == 0)
553         computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
554     if ((node_branch->partial_lh_computed & 1) == 0)
555         computeSitemodelPartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
556     size_t ncat = site_rate->getNRate();
557 
558     size_t block = ncat * nstates;
559     size_t ptn; // for big data size > 4GB memory required
560     size_t c, i, j;
561     size_t nptn = aln->size();
562     size_t maxptn = get_safe_upper_limit(nptn);
563 
564     ModelSet *models = (ModelSet*)model;
565     VectorClass tree_lh = 0.0;
566     VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
567     VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
568     for (c = 0; c < ncat; c++) {
569         cat_length[c] = site_rate->getRate(c) * dad_branch->length;
570         cat_prop[c] = site_rate->getProp(c);
571     }
572 
573     if (dad->isLeaf()) {
574 		// copy dummy values because VectorClass will access beyond nptn
575 		for (ptn = nptn; ptn < maxptn; ptn++)
576 			memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
577 
578     	// special treatment for TIP-INTERNAL NODE case
579         double *tip_partial_lh_node = tip_partial_lh + (dad->id * get_safe_upper_limit(nptn)*nstates);
580 #ifdef _OPENMP
581 #pragma omp parallel private(ptn, i, c, j)
582 {
583     VectorClass tree_lh_thread = 0.0;
584 #pragma omp for nowait schedule(static)
585 #endif
586         for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
587             VectorClass lh_ptn[VCSIZE];
588             VectorClass* eval;
589 			VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
590 			VectorClass *partial_lh_node = (VectorClass*)(tip_partial_lh_node + ptn*nstates);
591 
592             for (j = 0; j < VCSIZE; j++) {
593                 lh_ptn[j] = 0.0;
594                 if (ptn+j < nptn) {
595                     eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
596                 } else {
597                     eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
598                 }
599                 for (c = 0; c < ncat; c++) {
600                     VectorClass lh_cat = 0.0;
601                     for (i = 0; i < nstates/VCSIZE; i++) {
602                         lh_cat=mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
603                     }
604                     lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
605                     partial_lh_dad += nstates/VCSIZE;
606 //                    partial_lh_node += nstates/VCSIZE;
607                 }
608                 partial_lh_node += nstates/VCSIZE;
609             }
610 
611             VectorClass freq;
612             freq.load_a(&ptn_freq[ptn]);
613             VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
614             lh_ptn_sum = log(abs(lh_ptn_sum));
615             lh_ptn_sum.store_a(&_pattern_lh[ptn]);
616 #ifdef _OPENMP
617             tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
618 #else
619             tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
620 #endif
621         } // for loop
622 
623 #ifdef _OPENMP
624 #pragma omp critical
625 	{
626 		tree_lh += tree_lh_thread;
627 	}
628 }
629 #endif
630 
631     } else
632     {
633     	// both dad and node are internal nodes
634 		// copy dummy values because VectorClass will access beyond nptn
635 		for (ptn = nptn; ptn < maxptn; ptn++) {
636 			memcpy(&dad_branch->partial_lh[ptn*block], &dad_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
637 			memcpy(&node_branch->partial_lh[ptn*block], &node_branch->partial_lh[(ptn-1)*block], block*sizeof(double));
638         }
639 #ifdef _OPENMP
640 #pragma omp parallel private(ptn, i, c, j)
641 {
642     VectorClass tree_lh_thread = 0.0;
643 #pragma omp for nowait schedule(static)
644 #endif
645         for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
646             VectorClass lh_ptn[VCSIZE];
647             VectorClass* eval;
648 			VectorClass *partial_lh_dad = (VectorClass*)(dad_branch->partial_lh + ptn*block);
649 			VectorClass *partial_lh_node = (VectorClass*)(node_branch->partial_lh + ptn*block);
650 
651             for (j = 0; j < VCSIZE; j++) {
652                 lh_ptn[j] = 0.0;
653                 if (ptn+j < nptn) {
654                     eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
655                 } else {
656                     eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
657                 }
658                 for (c = 0; c < ncat; c++) {
659                     VectorClass lh_cat = 0.0;
660                     for (i = 0; i < nstates/VCSIZE; i++) {
661                         lh_cat = mul_add(exp(eval[i]*cat_length[c]), partial_lh_dad[i] * partial_lh_node[i], lh_cat);
662                     }
663                     lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
664                     partial_lh_dad += nstates/VCSIZE;
665                     partial_lh_node += nstates/VCSIZE;
666                 }
667             }
668 
669             VectorClass freq;
670             freq.load_a(&ptn_freq[ptn]);
671             VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
672             lh_ptn_sum = log(abs(lh_ptn_sum));
673             lh_ptn_sum.store_a(&_pattern_lh[ptn]);
674 #ifdef _OPENMP
675             tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
676 #else
677             tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
678 #endif
679         } // for loop
680 
681 #ifdef _OPENMP
682 #pragma omp critical
683 	{
684 		tree_lh += tree_lh_thread;
685 	}
686 }
687 #endif
688 
689 
690     }
691 
692     double tree_lh_final = horizontal_add(tree_lh) + node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
693 
694     if (isnan(tree_lh_final) || isinf(tree_lh_final)) {
695         cout << "WARNING: Numerical underflow caused by alignment sites";
696         i = aln->getNSite();
697         int j;
698         for (j = 0, c = 0; j < i; j++) {
699             ptn = aln->getPatternID(j);
700             if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
701                 cout << " " << j+1;
702                 c++;
703                 if (c >= 10) {
704                     cout << " ...";
705                     break;
706                 }
707             }
708         }
709         cout << endl;
710         tree_lh_final = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
711         for (ptn = 0; ptn < nptn; ptn++) {
712             if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
713                 _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
714             }
715             tree_lh_final += _pattern_lh[ptn] * ptn_freq[ptn];
716         }
717     }
718 
719 	ASSERT(!isnan(tree_lh_final) && !isinf(tree_lh_final));
720 
721     aligned_free(cat_prop);
722     aligned_free(cat_length);
723 
724     return tree_lh_final;
725 }
726 
727 
728 template <class VectorClass, const int VCSIZE, const int nstates>
computeSitemodelLikelihoodFromBufferEigenSIMD()729 double PhyloTree::computeSitemodelLikelihoodFromBufferEigenSIMD() {
730 	ASSERT(theta_all && theta_computed);
731 
732 //    size_t nstates = aln->num_states;
733     size_t ncat = site_rate->getNRate();
734 
735     size_t block = ncat * nstates;
736     size_t ptn; // for big data size > 4GB memory required
737     size_t c, i, j;
738     size_t nptn = aln->size();
739 
740     ModelSet *models = (ModelSet*)model;
741 
742     VectorClass tree_lh = 0.0;
743     VectorClass *cat_length = aligned_alloc<VectorClass>(ncat);
744     VectorClass *cat_prop = aligned_alloc<VectorClass>(ncat);
745     for (c = 0; c < ncat; c++) {
746         cat_length[c] = site_rate->getRate(c) * current_it->length;
747         cat_prop[c] = site_rate->getProp(c);
748     }
749 
750 #ifdef _OPENMP
751 #pragma omp parallel private(ptn, i, c, j)
752 {
753     VectorClass tree_lh_thread = 0.0;
754 #pragma omp for nowait schedule(static)
755 #endif
756     for (ptn = 0; ptn < nptn; ptn+=VCSIZE) {
757         VectorClass lh_ptn[VCSIZE];
758         VectorClass* eval;
759         VectorClass *theta = (VectorClass*)(theta_all + ptn*block);
760 
761         for (j = 0; j < VCSIZE; j++) {
762             lh_ptn[j] = 0.0;
763             if (ptn+j < nptn) {
764                 eval = (VectorClass*)models->at(ptn+j)->getEigenvalues();
765             } else {
766                 eval = (VectorClass*)models->at(nptn-1)->getEigenvalues();
767             }
768             for (c = 0; c < ncat; c++) {
769                 VectorClass lh_cat = 0.0;
770                 for (i = 0; i < nstates/VCSIZE; i++) {
771                     lh_cat = mul_add(exp(eval[i]*cat_length[c]), theta[i], lh_cat);
772                 }
773                 lh_ptn[j] = mul_add(cat_prop[c], lh_cat, lh_ptn[j]);
774                 theta += nstates/VCSIZE;
775             }
776         }
777 
778         VectorClass freq;
779         freq.load_a(&ptn_freq[ptn]);
780         VectorClass lh_ptn_sum = horizontal_add(lh_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
781         lh_ptn_sum = log(abs(lh_ptn_sum));
782         lh_ptn_sum.store_a(&_pattern_lh[ptn]);
783 #ifdef _OPENMP
784         tree_lh_thread = mul_add(lh_ptn_sum, freq, tree_lh_thread);
785 #else
786         tree_lh = mul_add(lh_ptn_sum, freq, tree_lh);
787 #endif
788     } // for loop
789 
790 #ifdef _OPENMP
791 #pragma omp critical
792 	{
793 		tree_lh += tree_lh_thread;
794 	}
795 }
796 #endif
797 
798     double tree_lh_final = horizontal_add(tree_lh) + current_it->lh_scale_factor + current_it_back->lh_scale_factor;
799 
800     aligned_free(cat_prop);
801     aligned_free(cat_length);
802 
803     return tree_lh_final;
804 }
805 
806 
807 #endif /* PHYLOKERNELSITEMODEL_H_ */
808