1 /*
2  *  bayesian.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "bayesian.h"
11 #include "kmer.hpp"
12 #include "phylosummary.h"
13 
14 /**************************************************************************************************/
Bayesian(string txfile,string tempFile,string method,int ksize,int cutoff,int i,int tid,bool f,bool sh,string version)15 Bayesian::Bayesian(string txfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh, string version) :
16 Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
17 	try {
18 
19 		threadID = tid;
20 		flip = f;
21         shortcuts = sh;
22         string baseName = tempFile;
23         string baseTName = txfile;
24         Utils util;
25 
26         /************calculate the probablity that each word will be in a specific taxonomy*************/
27         string tfileroot = util.getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
28         string tempfileroot = util.getRootName(util.getSimpleName(baseName));
29         string phyloTreeName = tfileroot + "tree.train";
30         string phyloTreeSumName = tfileroot + "tree.sum";
31         string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
32         string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
33 
34         ofstream out;
35         ofstream out2;
36 
37         vector<ifstream*> files;
38         ifstream* phyloTreeTest = new ifstream(phyloTreeName.c_str()); files.push_back(phyloTreeTest);
39 		ifstream* probFileTest2 = new ifstream(probFileName2.c_str()); files.push_back(probFileTest2);
40 		ifstream* probFileTest = new ifstream(probFileName.c_str());   files.push_back(probFileTest);
41 		ifstream* probFileTest3 = new ifstream(phyloTreeSumName.c_str()); files.push_back(probFileTest3);
42 
43 		long start = time(NULL);
44 
45 		//if they are there make sure they were created after this release date
46 		bool FilesGood = false;
47 		if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){ FilesGood = checkReleaseDate(files, version); }
48 
49 		if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){
50 
51 			m->mothurOut("Reading template taxonomy...     "); cout.flush();
52 
53 			phyloTree = new PhyloTree(*phyloTreeTest, phyloTreeName);
54             maxLevel = phyloTree->getMaxLevel();
55 
56 			m->mothurOut("DONE.\n");
57 
58 			genusNodes = phyloTree->getGenusNodes();
59 			genusTotals = phyloTree->getGenusTotals();
60 
61             m->mothurOut("Reading template probabilities...     "); cout.flush();
62             readProbFile(*probFileTest, *probFileTest2, probFileName, probFileName2);
63 
64         }else{
65 
66 			//create search database and names vector
67 			generateDatabaseAndNames(txfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0, version);
68 
69 			//prevents errors caused by creating shortcut files if you had an error in the sanity check.
70 			if (m->getControl_pressed()) {  util.mothurRemove(phyloTreeName);  util.mothurRemove(probFileName); util.mothurRemove(probFileName2); }
71 			else{
72 				genusNodes = phyloTree->getGenusNodes();
73 				genusTotals = phyloTree->getGenusTotals();
74 
75 				m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
76 
77 				phyloTree->printTreeNodes(phyloTreeName);
78 
79 				m->mothurOut("DONE.\n");
80 
81 				m->mothurOut("Calculating template probabilities...     "); cout.flush();
82 
83 				numKmers = database->getMaxKmer() + 1;
84 
85 				//initialze probabilities
86 				wordGenusProb.resize(numKmers);
87                 for (int j = 0; j < numKmers; j++) {  diffPair tempDiffPair; WordPairDiffArr.push_back(tempDiffPair); }
88 
89                 for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size(), 0.0);		}
90                 ofstream out; ofstream out2;
91 
92                 if (shortcuts) {
93                     util.openOutputFile(probFileName, out);
94 
95                     //output mothur version
96                     out << "#" << version << endl;
97 
98                     out << numKmers << endl;
99 
100                     util.openOutputFile(probFileName2, out2);
101 
102                     //output mothur version
103                     out2 << "#" << version << endl;
104                 }
105 
106 				//for each word
107 				for (int i = 0; i < numKmers; i++) {
108                     //m->mothurOut("[DEBUG]: kmer = " + toString(i) + "\n");
109 
110 					if (m->getControl_pressed()) {  break; }
111 
112                     if (shortcuts) {  out << i << '\t'; }
113 
114 					vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
115 
116 					//for each sequence with that word
117                     vector<int> count; count.resize(genusNodes.size(), 0);
118 					for (int j = 0; j < seqsWithWordi.size(); j++) {
119 						int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
120 						count[temp]++;  //increment count of seq in this genus who have this word
121 					}
122 
123 					//probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1);
124 					float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
125 					diffPair tempProb(log(probabilityInTemplate), 0.0);
126 					WordPairDiffArr[i] = tempProb;
127 
128 					int numNotZero = 0;
129 					for (int k = 0; k < genusNodes.size(); k++) {
130 						//probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
131 
132 
133 						wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
134 
135 						if (count[k] != 0) {
136                             if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; }
137 							numNotZero++;
138 						}
139 					}
140 
141 
142                     if (shortcuts) {
143                         out << endl;
144                         out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
145                     }
146 
147 				}
148                 if (shortcuts) { out.close(); out2.close();  }
149 
150 				//read in new phylotree with less info. - its faster
151 				ifstream phyloTreeTest(phyloTreeName.c_str());
152 				delete phyloTree;
153 
154 				phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
155                 maxLevel = phyloTree->getMaxLevel();
156 			}
157 		}
158 
159         if (m->getDebug()) { m->mothurOut("[DEBUG]: about to generateWordPairDiffArr\n"); }
160 		generateWordPairDiffArr();
161         if (m->getDebug()) { m->mothurOut("[DEBUG]: done generateWordPairDiffArr\n"); }
162 
163         for (int i = 0; i < files.size(); i++) { delete files[i]; }
164 
165 		m->mothurOut("DONE.\n");
166 		m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities.\n");
167 	}
168 	catch(exception& e) {
169 		m->errorOut(e, "Bayesian", "Bayesian");
170 		exit(1);
171 	}
172 }
173 /**************************************************************************************************/
~Bayesian()174 Bayesian::~Bayesian() {
175 	try {
176         if (phyloTree != NULL) { delete phyloTree; }
177         if (database != NULL) {  delete database; }
178 	}
179 	catch(exception& e) {
180 		m->errorOut(e, "Bayesian", "~Bayesian");
181 		exit(1);
182 	}
183 }
184 
185 /**************************************************************************************************/
getTaxonomy(Sequence * seq,string & simpleTax,bool & flipped)186 string Bayesian::getTaxonomy(Sequence* seq, string& simpleTax, bool& flipped) {
187 	try {
188 		string tax = "";
189         simpleTax = "";
190 		Kmer kmer(kmerSize);
191 		flipped = false;
192 
193 		//get words contained in query
194 		//getKmerString returns a string where the index in the string is hte kmer number
195 		//and the character at that index can be converted to be the number of times that kmer was seen
196 		string queryKmerString = kmer.getKmerString(seq->getUnaligned());
197 
198 		vector<int> queryKmers;
199 		for (int i = 0; i < queryKmerString.length()-1; i++) {	// the -1 is to ignore any kmer with an N in it
200 			if (queryKmerString[i] != '!') { //this kmer is in the query
201 				queryKmers.push_back(i);
202 			}
203 		}
204 
205 		//if user wants to test reverse compliment and its reversed use that instead
206 		if (flip) {
207 			if (isReversed(queryKmers)) {
208 				flipped = true;
209 				seq->reverseComplement();
210 				queryKmerString = kmer.getKmerString(seq->getUnaligned());
211 				queryKmers.clear();
212 				for (int i = 0; i < queryKmerString.length()-1; i++) {	// the -1 is to ignore any kmer with an N in it
213 					if (queryKmerString[i] != '!') { //this kmer is in the query
214 						queryKmers.push_back(i);
215 					}
216 				}
217 			}
218 		}
219 
220 		if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + " is bad. It has no kmers of length " + toString(kmerSize) + ".\n");  simpleTax = "unknown;";  return "unknown;"; }
221 
222 
223 		int index = getMostProbableTaxonomy(queryKmers);
224 
225 		if (m->getControl_pressed()) { return tax; }
226 
227 		//bootstrap - to set confidenceScore
228         int numToSelect = queryKmers.size() / 8;
229 
230         if (m->getDebug()) {  m->mothurOut(seq->getName() + "\t"); }
231 
232 		tax = bootstrapResults(queryKmers, index, numToSelect, simpleTax);
233 
234         if (m->getDebug()) {  m->mothurOut("\n"); }
235 
236 		return tax;
237 	}
238 	catch(exception& e) {
239 		m->errorOut(e, "Bayesian", "getTaxonomy");
240 		exit(1);
241 	}
242 }
243 /**************************************************************************************************/
bootstrapResults(vector<int> kmers,int tax,int numToSelect,string & simpleTax)244 string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect, string& simpleTax) {
245 	try {
246 
247 		map<int, int> confidenceScores;
248 
249 		//initialize confidences to 0
250 		int seqIndex = tax;
251 		TaxNode seq = phyloTree->get(tax);
252 		confidenceScores[tax] = 0;
253 
254 		while (seq.level != 0) { //while you are not at the root
255 			seqIndex = seq.parent;
256 			confidenceScores[seqIndex] = 0;
257 			seq = phyloTree->get(seq.parent);
258 		}
259 
260 		map<int, int>::iterator itBoot;
261 		map<int, int>::iterator itBoot2;
262 		map<int, int>::iterator itConvert;
263 
264         int numKmers = kmers.size()-1;
265         Utils util;
266 		for (int i = 0; i < iters; i++) {
267 			if (m->getControl_pressed()) { return "control"; }
268 
269             vector<int> temp;
270 			for (int j = 0; j < numToSelect; j++) {
271 				int index = util.getRandomIndex(numKmers);
272 
273 				//add word to temp
274 				temp.push_back(kmers[index]);
275 			}
276 
277 			//get taxonomy
278 			int newTax = getMostProbableTaxonomy(temp);
279 			//int newTax = 1;
280 			TaxNode taxonomyTemp = phyloTree->get(newTax);
281 
282 			//add to confidence results
283 			while (taxonomyTemp.level != 0) { //while you are not at the root
284 				itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
285 
286 				if (itBoot2 != confidenceScores.end()) { //this is a classification we need a confidence for
287 					(itBoot2->second)++;
288 				}
289 
290 				newTax = taxonomyTemp.parent;
291 				taxonomyTemp = phyloTree->get(newTax);
292 			}
293 
294 		}
295 
296 		string confidenceTax = "";
297 		simpleTax = "";
298 
299 		int seqTaxIndex = tax;
300 		TaxNode seqTax = phyloTree->get(tax);
301 
302 
303 		while (seqTax.level != 0) { //while you are not at the root
304 
305 				itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
306 
307 				int confidence = 0;
308 				if (itBoot2 != confidenceScores.end()) { //already in confidence scores
309 					confidence = itBoot2->second;
310 				}
311 
312                 if (m->getDebug()) { m->mothurOut(seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");"); }
313 
314 				if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
315 					confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
316 					simpleTax = seqTax.name + ";" + simpleTax;
317 				}
318 
319 				seqTaxIndex = seqTax.parent;
320 				seqTax = phyloTree->get(seqTax.parent);
321 		}
322 
323 		if (confidenceTax == "") { confidenceTax = "unknown;"; simpleTax = "unknown;";  }
324 
325 		return confidenceTax;
326 
327 	}
328 	catch(exception& e) {
329 		m->errorOut(e, "Bayesian", "bootstrapResults");
330 		exit(1);
331 	}
332 }
333 /**************************************************************************************************/
getMostProbableTaxonomy(vector<int> queryKmer)334 int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
335 	try {
336 		int indexofGenus = 0;
337 
338 		double maxProbability = -1000000.0;
339 		//find taxonomy with highest probability that this sequence is from it
340 
341         for (int k = 0; k < genusNodes.size(); k++) {
342 			//for each taxonomy calc its probability
343 
344 			double prob = 0.0000;
345 			for (int i = 0; i < queryKmer.size(); i++) { prob += wordGenusProb[queryKmer[i]][k]; }
346 
347 			//is this the taxonomy with the greatest probability?
348 			if (prob > maxProbability) {
349 				indexofGenus = genusNodes[k];
350 				maxProbability = prob;
351 			}
352 		}
353 
354 		return indexofGenus;
355 	}
356 	catch(exception& e) {
357 		m->errorOut(e, "Bayesian", "getMostProbableTaxonomy");
358 		exit(1);
359 	}
360 }
361 //********************************************************************************************************************
362 //if it is more probable that the reverse compliment kmers are in the template, then we assume the sequence is reversed.
isReversed(vector<int> & queryKmers)363 bool Bayesian::isReversed(vector<int>& queryKmers){
364 	try{
365 		bool reversed = false;
366 		float prob = 0;
367 		float reverseProb = 0;
368 
369         for (int i = 0; i < queryKmers.size(); i++){
370             int kmer = queryKmers[i];
371             if (kmer >= 0){
372                 prob += WordPairDiffArr[kmer].prob;
373 				reverseProb += WordPairDiffArr[kmer].reverseProb;
374             }
375         }
376 
377         if (reverseProb > prob){ reversed = true; }
378 
379 		return reversed;
380 	}
381 	catch(exception& e) {
382 		m->errorOut(e, "Bayesian", "isReversed");
383 		exit(1);
384 	}
385 }
386 //********************************************************************************************************************
generateWordPairDiffArr()387 int Bayesian::generateWordPairDiffArr(){
388 	try{
389 		Kmer kmer(kmerSize);
390 		for (int i = 0; i < WordPairDiffArr.size(); i++) {
391 			int reversedWord = kmer.getReverseKmerNumber(i);
392 			WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob;
393 		}
394 
395 		return 0;
396 	}catch(exception& e) {
397 		m->errorOut(e, "Bayesian", "generateWordPairDiffArr");
398 		exit(1);
399 	}
400 }
401 /**************************************************************************************************/
readProbFile(ifstream & in,ifstream & inNum,string inName,string inNumName)402 void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
403 	try{
404 		Utils util;
405         //read version
406         string line = util.getline(in); util.gobble(in);
407 
408         in >> numKmers; util.gobble(in);
409         //initialze probabilities
410 
411         wordGenusProb.resize(numKmers);
412         for (int j = 0; j < wordGenusProb.size(); j++) {	wordGenusProb[j].resize(genusNodes.size());		}
413 
414         int kmer, name, count;  count = 0;
415         vector<int> num; num.resize(numKmers);
416         float prob;
417         vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
418         for (int j = 0; j < numKmers; j++) {  diffPair tempDiffPair; WordPairDiffArr.push_back(tempDiffPair); }
419 
420         //read version
421         string line2 = util.getline(inNum); util.gobble(inNum);
422         float probTemp;
423 
424         while (inNum) {
425             inNum >> zeroCountProb[count] >> num[count] >> probTemp;
426             WordPairDiffArr[count].prob = probTemp;
427             count++;
428             util.gobble(inNum);
429 
430             if (m->getDebug()) { m->mothurOut("[DEBUG]: " + toString(zeroCountProb[count]) + '\t' + toString(num[count]) + '\t' + toString(numKmers) + "\n"); }
431 
432 
433         }
434         inNum.close();
435 
436         while(in) {
437             in >> kmer;
438 
439             //set them all to zero value
440             for (int i = 0; i < genusNodes.size(); i++) {
441                 wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
442             }
443 
444             //get probs for nonzero values
445             for (int i = 0; i < num[kmer]; i++) {
446                 in >> name >> prob;
447                 wordGenusProb[kmer][name] = prob;
448                 if (m->getDebug()) { m->mothurOut("[DEBUG]: " + toString(name) + '\t' + toString(prob) + '\t' + toString(kmer) + "\n"); }
449             }
450 
451             util.gobble(in);
452         }
453         in.close();
454 
455 	}
456 	catch(exception& e) {
457 		m->errorOut(e, "Bayesian", "readProbFile");
458 		exit(1);
459 	}
460 }
461 /**************************************************************************************************/
462 
463 
464 
465 
466 
467 
468