1 //
2 // alignTree.cpp
3 // pdsBayesian
4 //
5 // Created by Patrick Schloss on 4/3/12.
6 // Copyright (c) 2012 University of Michigan. All rights reserved.
7 //
8
9 #include "alignnode.h"
10 #include "aligntree.h"
11
12 /**************************************************************************************************/
13
AlignTree(string referenceFileName,string taxonomyFileName,int cutoff)14 AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
15 try {
16 AlignNode* newNode = new AlignNode("Root", 0);
17 tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode
18
19 string refTaxonomy;
20
21 readTaxonomy(taxonomyFileName);
22
23 ifstream referenceFile;
24 Utils util; util.openInputFile(referenceFileName, referenceFile);
25 bool error = false;
26 map<int, int> lengths;
27 while(!referenceFile.eof()){
28
29 if (m->getControl_pressed()) { break; }
30
31 Sequence seq(referenceFile); util.gobble(referenceFile);
32
33 if (seq.getName() != "") {
34 map<string, string>::iterator it = taxonomy.find(seq.getName());
35
36 if (it != taxonomy.end()) {
37 refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence
38 string aligned = seq.getAligned();
39 lengths[aligned.length()] = 1;
40 if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
41 addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
42 }else {
43 m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
44 }
45 }
46 }
47 referenceFile.close();
48
49 length = (lengths.begin())->first;
50
51 if (error) { m->setControl_pressed(true); }
52
53 numTaxa = (int)tree.size();
54
55 numLevels = 0;
56 for(int i=0;i<numTaxa;i++){
57 int level = tree[i]->getLevel();
58 if(level > numLevels){ numLevels = level; }
59 }
60 numLevels++;
61
62 aggregateThetas();
63
64 int dbSize = tree[0]->getNumSeqs();
65
66 for(int i=0;i<numTaxa;i++){
67 tree[i]->checkTheta();
68 tree[i]->setTotalSeqs(dbSize);
69 }
70
71 }
72 catch(exception& e) {
73 m->errorOut(e, "AlignTree", "AlignTree");
74 exit(1);
75 }
76 }
77
78 /**************************************************************************************************/
79
~AlignTree()80 AlignTree::~AlignTree(){
81 try {
82 for(int i=0;i<tree.size();i++){
83 delete tree[i];
84 }
85 }
86 catch(exception& e) {
87 m->errorOut(e, "AlignTree", "~AlignTree");
88 exit(1);
89 }
90 }
91
92 /**************************************************************************************************/
93
addTaxonomyToTree(string seqName,string & taxonomy,string & sequence)94 int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
95 try {
96 AlignNode* newNode;
97 string taxonName = "";
98 int treePosition = 0; // the root is element 0
99
100 int level = 1;
101
102 for(int i=0;i<taxonomy.length();i++){ // step through taxonomy string...
103
104 if (m->getControl_pressed()) { break; }
105
106 if(taxonomy[i] == ';'){ // looking for semicolons...
107
108 if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;\n"); m->setControl_pressed(true); }
109
110 int newIndex = tree[treePosition]->getChildIndex(taxonName); // look to see if your current node already
111 // has a child with the new taxonName
112 if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that
113 else { // position in the tree
114 int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one...
115 tree[treePosition]->makeChild(taxonName, newChildIndex);
116
117 newNode = new AlignNode(taxonName, level);
118
119 newNode->setParent(treePosition);
120
121 tree.push_back(newNode);
122 treePosition = newChildIndex;
123 }
124
125 // sequence data to that node to update that node's theta - seems slow...
126 taxonName = ""; // clear out the taxon name that we will build as we look
127 level++;
128 } // for a semicolon
129 else{
130 taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon
131 }
132 }
133 tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the
134
135 return 0;
136 }
137 catch(exception& e) {
138 m->errorOut(e, "AlignTree", "addTaxonomyToTree");
139 exit(1);
140 }
141 }
142
143 /**************************************************************************************************/
144
aggregateThetas()145 int AlignTree::aggregateThetas(){
146 try {
147 vector<vector<int> > levelMatrix(numLevels+1);
148
149 for(int i=0;i<tree.size();i++){
150 if (m->getControl_pressed()) { return 0; }
151 levelMatrix[tree[i]->getLevel()].push_back(i);
152 }
153
154 for(int i=numLevels-1;i>0;i--){
155 if (m->getControl_pressed()) { return 0; }
156 for(int j=0;j<levelMatrix[i].size();j++){
157
158 AlignNode* holder = tree[levelMatrix[i][j]];
159
160 tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());
161 }
162 }
163 return 0;
164 }
165 catch(exception& e) {
166 m->errorOut(e, "AlignTree", "aggregateThetas");
167 exit(1);
168 }
169 }
170
171 /**************************************************************************************************/
172
getOutlierLogProbability(string & sequence)173 double AlignTree::getOutlierLogProbability(string& sequence){
174 try {
175 double count = 0;
176
177 for(int i=0;i<sequence.length();i++){
178
179 if(sequence[i] != '.'){ count++; }
180
181 }
182
183 return count * log(0.2);
184 }
185 catch(exception& e) {
186 m->errorOut(e, "AlignTree", "getOutlierLogProbability");
187 exit(1);
188 }
189 }
190
191 /**************************************************************************************************/
192
getMinRiskIndexAlign(string & sequence,vector<int> & taxaIndices,vector<double> & probabilities)193 int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
194 try {
195 int numProbs = (int)probabilities.size();
196
197 vector<double> G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence
198 vector<double> risk(numProbs, 0);
199
200 for(int i=1;i<numProbs;i++){ //use if you want the outlier group
201 if (m->getControl_pressed()) { return 0; }
202 G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
203 }
204
205 double minRisk = MOTHURMAX;
206 int minRiskIndex = 0;
207
208 for(int i=0;i<numProbs;i++){
209 if (m->getControl_pressed()) { return 0; }
210 for(int j=0;j<numProbs;j++){
211 if(i != j){
212 risk[i] += probabilities[j] * G[j];
213 }
214 }
215
216 if(risk[i] < minRisk){
217 minRisk = risk[i];
218 minRiskIndex = i;
219 }
220 }
221
222 return minRiskIndex;
223 }
224 catch(exception& e) {
225 m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
226 exit(1);
227 }
228
229 }
230
231 /**************************************************************************************************/
232
sanityCheck(vector<vector<int>> & indices,vector<int> & maxIndices)233 int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
234 try {
235 int finalLevel = (int)indices.size()-1;
236
237 for(int position=1;position<indices.size();position++){
238 if (m->getControl_pressed()) { return 0; }
239 int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
240 int actualParent = indices[position-1][maxIndices[position-1]];
241
242 if(predictedParent != actualParent){
243 finalLevel = position - 1;
244 return finalLevel;
245 }
246 }
247 return finalLevel;
248 }
249 catch(exception& e) {
250 m->errorOut(e, "AlignTree", "sanityCheck");
251 exit(1);
252 }
253 }
254
255 /**************************************************************************************************/
256
getTaxonomy(Sequence * seq,string & simpleTax,bool & flipped)257 string AlignTree::getTaxonomy(Sequence* seq, string& simpleTax, bool& flipped){
258 try {
259 simpleTax = "";
260 string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
261 if (querySequence.length() != length) {
262 m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->setControl_pressed(true); return "";
263 }
264 double logPOutlier = getOutlierLogProbability(querySequence);
265
266 vector<vector<double> > pXgivenKj_D_j(numLevels);
267 vector<vector<int> > indices(numLevels);
268 for(int i=0;i<numLevels;i++){
269 if (m->getControl_pressed()) { return taxonProbabilityString; }
270 pXgivenKj_D_j[i].push_back(logPOutlier);
271 indices[i].push_back(-1);
272 }
273
274 for(int i=0;i<numTaxa;i++){
275 if (m->getControl_pressed()) { return taxonProbabilityString; }
276 pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
277 indices[tree[i]->getLevel()].push_back(i);
278 }
279
280 vector<double> sumLikelihood(numLevels, 0);
281 vector<double> bestPosterior(numLevels, 0);
282 vector<int> maxIndex(numLevels, 0);
283 int maxPosteriorIndex;
284
285 //let's find the best level and taxa within that level
286 for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
287 if (m->getControl_pressed()) { return taxonProbabilityString; }
288 int numTaxaInLevel = (int)indices[i].size();
289
290 vector<double> posteriors(numTaxaInLevel, 0);
291 sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
292
293 maxPosteriorIndex = 0;
294 for(int j=0;j<numTaxaInLevel;j++){
295 posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
296
297 if(posteriors[j] > posteriors[maxPosteriorIndex]){
298 maxPosteriorIndex = j;
299 }
300
301 }
302
303 maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
304
305 maxIndex[i] = maxPosteriorIndex;
306 bestPosterior[i] = posteriors[maxIndex[i]];
307 }
308
309 int saneDepth = sanityCheck(indices, maxIndex);
310
311 simpleTax = "";
312 int savedspot = 1;
313 taxonProbabilityString = "";
314 for(int i=1;i<=saneDepth;i++){
315 if (m->getControl_pressed()) { return taxonProbabilityString; }
316 int confidenceScore = (int) (bestPosterior[i] * 100);
317 if (confidenceScore >= confidenceThreshold) {
318 if(indices[i][maxIndex[i]] != -1){
319 taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
320 simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
321 }
322 else{
323 taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
324 simpleTax += "unclassified;";
325 }
326 }else { break; }
327 savedspot = i;
328 }
329
330 for(int i=savedspot+1;i<numLevels;i++){
331 if (m->getControl_pressed()) { return taxonProbabilityString; }
332 taxonProbabilityString + "unclassified(0);";
333 simpleTax += "unclassified;";
334 }
335
336 return taxonProbabilityString;
337 }
338 catch(exception& e) {
339 m->errorOut(e, "AlignTree", "getTaxonomy");
340 exit(1);
341 }
342 }
343
344
345 /**************************************************************************************************/
346