1 //
2 //  alignTree.cpp
3 //  pdsBayesian
4 //
5 //  Created by Patrick Schloss on 4/3/12.
6 //  Copyright (c) 2012 University of Michigan. All rights reserved.
7 //
8 
9 #include "alignnode.h"
10 #include "aligntree.h"
11 
12 /**************************************************************************************************/
13 
AlignTree(string referenceFileName,string taxonomyFileName,int cutoff)14 AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
15 	try {
16         AlignNode* newNode = new AlignNode("Root", 0);
17         tree.push_back(newNode);			//	the tree is stored as a vector of elements of type TaxonomyNode
18 
19         string refTaxonomy;
20 
21         readTaxonomy(taxonomyFileName);
22 
23         ifstream referenceFile;
24         Utils util; util.openInputFile(referenceFileName, referenceFile);
25         bool error = false;
26         map<int, int> lengths;
27         while(!referenceFile.eof()){
28 
29             if (m->getControl_pressed()) { break; }
30 
31             Sequence seq(referenceFile);  util.gobble(referenceFile);
32 
33             if (seq.getName() != "") {
34                 map<string, string>::iterator it = taxonomy.find(seq.getName());
35 
36                 if (it != taxonomy.end()) {
37                     refTaxonomy = it->second;		//	lookup the taxonomy string for the current reference sequence
38                     string aligned = seq.getAligned();
39                     lengths[aligned.length()] = 1;
40                     if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
41                     addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
42                 }else {
43                     m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
44                 }
45             }
46         }
47         referenceFile.close();
48 
49         length = (lengths.begin())->first;
50 
51         if (error) { m->setControl_pressed(true); }
52 
53         numTaxa = (int)tree.size();
54 
55         numLevels = 0;
56         for(int i=0;i<numTaxa;i++){
57             int level = tree[i]->getLevel();
58             if(level > numLevels){	numLevels = level;	}
59         }
60         numLevels++;
61 
62         aggregateThetas();
63 
64         int dbSize = tree[0]->getNumSeqs();
65 
66         for(int i=0;i<numTaxa;i++){
67             tree[i]->checkTheta();
68             tree[i]->setTotalSeqs(dbSize);
69         }
70 
71     }
72     catch(exception& e) {
73         m->errorOut(e, "AlignTree", "AlignTree");
74         exit(1);
75     }
76 }
77 
78 /**************************************************************************************************/
79 
~AlignTree()80 AlignTree::~AlignTree(){
81 	try {
82         for(int i=0;i<tree.size();i++){
83             delete tree[i];
84         }
85 	}
86     catch(exception& e) {
87         m->errorOut(e, "AlignTree", "~AlignTree");
88         exit(1);
89     }
90 }
91 
92 /**************************************************************************************************/
93 
addTaxonomyToTree(string seqName,string & taxonomy,string & sequence)94 int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
95 	try {
96         AlignNode* newNode;
97         string taxonName = "";
98         int treePosition = 0;							//	the root is element 0
99 
100         int level = 1;
101 
102         for(int i=0;i<taxonomy.length();i++){			//	step through taxonomy string...
103 
104             if (m->getControl_pressed()) { break; }
105 
106             if(taxonomy[i] == ';'){						//	looking for semicolons...
107 
108                 if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;\n");  m->setControl_pressed(true); }
109 
110                 int newIndex = tree[treePosition]->getChildIndex(taxonName);	//	look to see if your current node already
111                 //	has a child with the new taxonName
112                 if(newIndex != -1)	{	treePosition = newIndex;	}		//	if you've seen it before, jump to that
113                 else {														//	 position in the tree
114                     int newChildIndex = (int)tree.size();						//	otherwise, we'll have to create one...
115                     tree[treePosition]->makeChild(taxonName, newChildIndex);
116 
117                     newNode = new AlignNode(taxonName, level);
118 
119                     newNode->setParent(treePosition);
120 
121                     tree.push_back(newNode);
122                     treePosition = newChildIndex;
123                 }
124 
125                 //	sequence data to that node to update that node's theta - seems slow...
126                 taxonName = "";								//	clear out the taxon name that we will build as we look
127                 level++;
128             }												//	for a semicolon
129             else{
130                 taxonName += taxonomy[i];					//	keep adding letters until we reach a semicolon
131             }
132         }
133         tree[treePosition]->loadSequence(sequence);	//	now that we've gotten to the correct node, add the
134 
135         return 0;
136 	}
137     catch(exception& e) {
138         m->errorOut(e, "AlignTree", "addTaxonomyToTree");
139         exit(1);
140     }
141 }
142 
143 /**************************************************************************************************/
144 
aggregateThetas()145 int AlignTree::aggregateThetas(){
146 	try {
147         vector<vector<int> > levelMatrix(numLevels+1);
148 
149         for(int i=0;i<tree.size();i++){
150             if (m->getControl_pressed()) { return 0; }
151             levelMatrix[tree[i]->getLevel()].push_back(i);
152         }
153 
154         for(int i=numLevels-1;i>0;i--){
155             if (m->getControl_pressed()) { return 0; }
156             for(int j=0;j<levelMatrix[i].size();j++){
157 
158                 AlignNode* holder = tree[levelMatrix[i][j]];
159 
160                 tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());
161             }
162         }
163 	    return 0;
164 	}
165     catch(exception& e) {
166         m->errorOut(e, "AlignTree", "aggregateThetas");
167         exit(1);
168     }
169 }
170 
171 /**************************************************************************************************/
172 
getOutlierLogProbability(string & sequence)173 double AlignTree::getOutlierLogProbability(string& sequence){
174 	try {
175         double count = 0;
176 
177         for(int i=0;i<sequence.length();i++){
178 
179             if(sequence[i] != '.'){	count++;	}
180 
181         }
182 
183         return count * log(0.2);
184     }
185     catch(exception& e) {
186         m->errorOut(e, "AlignTree", "getOutlierLogProbability");
187         exit(1);
188     }
189 }
190 
191 /**************************************************************************************************/
192 
getMinRiskIndexAlign(string & sequence,vector<int> & taxaIndices,vector<double> & probabilities)193 int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
194 	try {
195         int numProbs = (int)probabilities.size();
196 
197         vector<double> G(numProbs, 0.2);	//a random sequence will, on average, be 20% similar to any other sequence
198         vector<double> risk(numProbs, 0);
199 
200         for(int i=1;i<numProbs;i++){ //use if you want the outlier group
201             if (m->getControl_pressed()) { return 0; }
202             G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
203         }
204 
205         double minRisk = MOTHURMAX;
206         int minRiskIndex = 0;
207 
208         for(int i=0;i<numProbs;i++){
209             if (m->getControl_pressed()) { return 0; }
210             for(int j=0;j<numProbs;j++){
211                 if(i != j){
212                     risk[i] += probabilities[j] * G[j];
213                 }
214             }
215 
216             if(risk[i] < minRisk){
217                 minRisk = risk[i];
218                 minRiskIndex = i;
219             }
220         }
221 
222         return minRiskIndex;
223     }
224     catch(exception& e) {
225         m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
226         exit(1);
227     }
228 
229 }
230 
231 /**************************************************************************************************/
232 
sanityCheck(vector<vector<int>> & indices,vector<int> & maxIndices)233 int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
234 	try {
235         int finalLevel = (int)indices.size()-1;
236 
237         for(int position=1;position<indices.size();position++){
238             if (m->getControl_pressed()) { return 0; }
239             int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
240             int actualParent = indices[position-1][maxIndices[position-1]];
241 
242             if(predictedParent != actualParent){
243                 finalLevel = position - 1;
244                 return finalLevel;
245             }
246         }
247         return finalLevel;
248 	}
249     catch(exception& e) {
250         m->errorOut(e, "AlignTree", "sanityCheck");
251         exit(1);
252     }
253 }
254 
255 /**************************************************************************************************/
256 
getTaxonomy(Sequence * seq,string & simpleTax,bool & flipped)257 string AlignTree::getTaxonomy(Sequence* seq, string& simpleTax, bool& flipped){
258     try {
259         simpleTax = "";
260         string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
261         if (querySequence.length() != length) {
262             m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->setControl_pressed(true); return "";
263         }
264         double logPOutlier = getOutlierLogProbability(querySequence);
265 
266         vector<vector<double> > pXgivenKj_D_j(numLevels);
267         vector<vector<int> > indices(numLevels);
268         for(int i=0;i<numLevels;i++){
269             if (m->getControl_pressed()) { return taxonProbabilityString; }
270             pXgivenKj_D_j[i].push_back(logPOutlier);
271             indices[i].push_back(-1);
272         }
273 
274         for(int i=0;i<numTaxa;i++){
275             if (m->getControl_pressed()) { return taxonProbabilityString; }
276             pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
277             indices[tree[i]->getLevel()].push_back(i);
278         }
279 
280         vector<double> sumLikelihood(numLevels, 0);
281         vector<double> bestPosterior(numLevels, 0);
282         vector<int> maxIndex(numLevels, 0);
283         int maxPosteriorIndex;
284 
285         //let's find the best level and taxa within that level
286         for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
287             if (m->getControl_pressed()) { return taxonProbabilityString; }
288             int numTaxaInLevel = (int)indices[i].size();
289 
290             vector<double> posteriors(numTaxaInLevel, 0);
291             sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
292 
293             maxPosteriorIndex = 0;
294             for(int j=0;j<numTaxaInLevel;j++){
295                 posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
296 
297                 if(posteriors[j] > posteriors[maxPosteriorIndex]){
298                     maxPosteriorIndex = j;
299                 }
300 
301             }
302 
303             maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
304 
305             maxIndex[i] = maxPosteriorIndex;
306             bestPosterior[i] = posteriors[maxIndex[i]];
307         }
308 
309         int saneDepth = sanityCheck(indices, maxIndex);
310 
311         simpleTax = "";
312         int savedspot = 1;
313         taxonProbabilityString = "";
314         for(int i=1;i<=saneDepth;i++){
315             if (m->getControl_pressed()) { return taxonProbabilityString; }
316             int confidenceScore = (int) (bestPosterior[i] * 100);
317             if (confidenceScore >= confidenceThreshold) {
318             if(indices[i][maxIndex[i]] != -1){
319                 taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
320                 simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
321             }
322             else{
323                 taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
324                 simpleTax += "unclassified;";
325             }
326             }else { break; }
327             savedspot = i;
328         }
329 
330         for(int i=savedspot+1;i<numLevels;i++){
331             if (m->getControl_pressed()) { return taxonProbabilityString; }
332             taxonProbabilityString + "unclassified(0);";
333             simpleTax += "unclassified;";
334         }
335 
336         return taxonProbabilityString;
337 	}
338     catch(exception& e) {
339         m->errorOut(e, "AlignTree", "getTaxonomy");
340         exit(1);
341     }
342 }
343 
344 
345 /**************************************************************************************************/
346