1 /*
2 * bayesian.cpp
3 * Mothur
4 *
5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "bayesian.h"
11 #include "kmer.hpp"
12 #include "phylosummary.h"
13
14 /**************************************************************************************************/
Bayesian(string txfile,string tempFile,string method,int ksize,int cutoff,int i,int tid,bool f,bool sh,string version)15 Bayesian::Bayesian(string txfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh, string version) :
16 Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
17 try {
18
19 threadID = tid;
20 flip = f;
21 shortcuts = sh;
22 string baseName = tempFile;
23 string baseTName = txfile;
24 Utils util;
25
26 /************calculate the probablity that each word will be in a specific taxonomy*************/
27 string tfileroot = util.getFullPathName(baseTName.substr(0,baseTName.find_last_of(".")+1));
28 string tempfileroot = util.getRootName(util.getSimpleName(baseName));
29 string phyloTreeName = tfileroot + "tree.train";
30 string phyloTreeSumName = tfileroot + "tree.sum";
31 string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
32 string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
33
34 ofstream out;
35 ofstream out2;
36
37 vector<ifstream*> files;
38 ifstream* phyloTreeTest = new ifstream(phyloTreeName.c_str()); files.push_back(phyloTreeTest);
39 ifstream* probFileTest2 = new ifstream(probFileName2.c_str()); files.push_back(probFileTest2);
40 ifstream* probFileTest = new ifstream(probFileName.c_str()); files.push_back(probFileTest);
41 ifstream* probFileTest3 = new ifstream(phyloTreeSumName.c_str()); files.push_back(probFileTest3);
42
43 long start = time(NULL);
44
45 //if they are there make sure they were created after this release date
46 bool FilesGood = false;
47 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3){ FilesGood = checkReleaseDate(files, version); }
48
49 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){
50
51 m->mothurOut("Reading template taxonomy... "); cout.flush();
52
53 phyloTree = new PhyloTree(*phyloTreeTest, phyloTreeName);
54 maxLevel = phyloTree->getMaxLevel();
55
56 m->mothurOut("DONE.\n");
57
58 genusNodes = phyloTree->getGenusNodes();
59 genusTotals = phyloTree->getGenusTotals();
60
61 m->mothurOut("Reading template probabilities... "); cout.flush();
62 readProbFile(*probFileTest, *probFileTest2, probFileName, probFileName2);
63
64 }else{
65
66 //create search database and names vector
67 generateDatabaseAndNames(txfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0, version);
68
69 //prevents errors caused by creating shortcut files if you had an error in the sanity check.
70 if (m->getControl_pressed()) { util.mothurRemove(phyloTreeName); util.mothurRemove(probFileName); util.mothurRemove(probFileName2); }
71 else{
72 genusNodes = phyloTree->getGenusNodes();
73 genusTotals = phyloTree->getGenusTotals();
74
75 m->mothurOut("Calculating template taxonomy tree... "); cout.flush();
76
77 phyloTree->printTreeNodes(phyloTreeName);
78
79 m->mothurOut("DONE.\n");
80
81 m->mothurOut("Calculating template probabilities... "); cout.flush();
82
83 numKmers = database->getMaxKmer() + 1;
84
85 //initialze probabilities
86 wordGenusProb.resize(numKmers);
87 for (int j = 0; j < numKmers; j++) { diffPair tempDiffPair; WordPairDiffArr.push_back(tempDiffPair); }
88
89 for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size(), 0.0); }
90 ofstream out; ofstream out2;
91
92 if (shortcuts) {
93 util.openOutputFile(probFileName, out);
94
95 //output mothur version
96 out << "#" << version << endl;
97
98 out << numKmers << endl;
99
100 util.openOutputFile(probFileName2, out2);
101
102 //output mothur version
103 out2 << "#" << version << endl;
104 }
105
106 //for each word
107 for (int i = 0; i < numKmers; i++) {
108 //m->mothurOut("[DEBUG]: kmer = " + toString(i) + "\n");
109
110 if (m->getControl_pressed()) { break; }
111
112 if (shortcuts) { out << i << '\t'; }
113
114 vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
115
116 //for each sequence with that word
117 vector<int> count; count.resize(genusNodes.size(), 0);
118 for (int j = 0; j < seqsWithWordi.size(); j++) {
119 int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
120 count[temp]++; //increment count of seq in this genus who have this word
121 }
122
123 //probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1);
124 float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
125 diffPair tempProb(log(probabilityInTemplate), 0.0);
126 WordPairDiffArr[i] = tempProb;
127
128 int numNotZero = 0;
129 for (int k = 0; k < genusNodes.size(); k++) {
130 //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
131
132
133 wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
134
135 if (count[k] != 0) {
136 if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; }
137 numNotZero++;
138 }
139 }
140
141
142 if (shortcuts) {
143 out << endl;
144 out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
145 }
146
147 }
148 if (shortcuts) { out.close(); out2.close(); }
149
150 //read in new phylotree with less info. - its faster
151 ifstream phyloTreeTest(phyloTreeName.c_str());
152 delete phyloTree;
153
154 phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
155 maxLevel = phyloTree->getMaxLevel();
156 }
157 }
158
159 if (m->getDebug()) { m->mothurOut("[DEBUG]: about to generateWordPairDiffArr\n"); }
160 generateWordPairDiffArr();
161 if (m->getDebug()) { m->mothurOut("[DEBUG]: done generateWordPairDiffArr\n"); }
162
163 for (int i = 0; i < files.size(); i++) { delete files[i]; }
164
165 m->mothurOut("DONE.\n");
166 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities.\n");
167 }
168 catch(exception& e) {
169 m->errorOut(e, "Bayesian", "Bayesian");
170 exit(1);
171 }
172 }
173 /**************************************************************************************************/
~Bayesian()174 Bayesian::~Bayesian() {
175 try {
176 if (phyloTree != NULL) { delete phyloTree; }
177 if (database != NULL) { delete database; }
178 }
179 catch(exception& e) {
180 m->errorOut(e, "Bayesian", "~Bayesian");
181 exit(1);
182 }
183 }
184
185 /**************************************************************************************************/
getTaxonomy(Sequence * seq,string & simpleTax,bool & flipped)186 string Bayesian::getTaxonomy(Sequence* seq, string& simpleTax, bool& flipped) {
187 try {
188 string tax = "";
189 simpleTax = "";
190 Kmer kmer(kmerSize);
191 flipped = false;
192
193 //get words contained in query
194 //getKmerString returns a string where the index in the string is hte kmer number
195 //and the character at that index can be converted to be the number of times that kmer was seen
196 string queryKmerString = kmer.getKmerString(seq->getUnaligned());
197
198 vector<int> queryKmers;
199 for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it
200 if (queryKmerString[i] != '!') { //this kmer is in the query
201 queryKmers.push_back(i);
202 }
203 }
204
205 //if user wants to test reverse compliment and its reversed use that instead
206 if (flip) {
207 if (isReversed(queryKmers)) {
208 flipped = true;
209 seq->reverseComplement();
210 queryKmerString = kmer.getKmerString(seq->getUnaligned());
211 queryKmers.clear();
212 for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it
213 if (queryKmerString[i] != '!') { //this kmer is in the query
214 queryKmers.push_back(i);
215 }
216 }
217 }
218 }
219
220 if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + " is bad. It has no kmers of length " + toString(kmerSize) + ".\n"); simpleTax = "unknown;"; return "unknown;"; }
221
222
223 int index = getMostProbableTaxonomy(queryKmers);
224
225 if (m->getControl_pressed()) { return tax; }
226
227 //bootstrap - to set confidenceScore
228 int numToSelect = queryKmers.size() / 8;
229
230 if (m->getDebug()) { m->mothurOut(seq->getName() + "\t"); }
231
232 tax = bootstrapResults(queryKmers, index, numToSelect, simpleTax);
233
234 if (m->getDebug()) { m->mothurOut("\n"); }
235
236 return tax;
237 }
238 catch(exception& e) {
239 m->errorOut(e, "Bayesian", "getTaxonomy");
240 exit(1);
241 }
242 }
243 /**************************************************************************************************/
bootstrapResults(vector<int> kmers,int tax,int numToSelect,string & simpleTax)244 string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect, string& simpleTax) {
245 try {
246
247 map<int, int> confidenceScores;
248
249 //initialize confidences to 0
250 int seqIndex = tax;
251 TaxNode seq = phyloTree->get(tax);
252 confidenceScores[tax] = 0;
253
254 while (seq.level != 0) { //while you are not at the root
255 seqIndex = seq.parent;
256 confidenceScores[seqIndex] = 0;
257 seq = phyloTree->get(seq.parent);
258 }
259
260 map<int, int>::iterator itBoot;
261 map<int, int>::iterator itBoot2;
262 map<int, int>::iterator itConvert;
263
264 int numKmers = kmers.size()-1;
265 Utils util;
266 for (int i = 0; i < iters; i++) {
267 if (m->getControl_pressed()) { return "control"; }
268
269 vector<int> temp;
270 for (int j = 0; j < numToSelect; j++) {
271 int index = util.getRandomIndex(numKmers);
272
273 //add word to temp
274 temp.push_back(kmers[index]);
275 }
276
277 //get taxonomy
278 int newTax = getMostProbableTaxonomy(temp);
279 //int newTax = 1;
280 TaxNode taxonomyTemp = phyloTree->get(newTax);
281
282 //add to confidence results
283 while (taxonomyTemp.level != 0) { //while you are not at the root
284 itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
285
286 if (itBoot2 != confidenceScores.end()) { //this is a classification we need a confidence for
287 (itBoot2->second)++;
288 }
289
290 newTax = taxonomyTemp.parent;
291 taxonomyTemp = phyloTree->get(newTax);
292 }
293
294 }
295
296 string confidenceTax = "";
297 simpleTax = "";
298
299 int seqTaxIndex = tax;
300 TaxNode seqTax = phyloTree->get(tax);
301
302
303 while (seqTax.level != 0) { //while you are not at the root
304
305 itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
306
307 int confidence = 0;
308 if (itBoot2 != confidenceScores.end()) { //already in confidence scores
309 confidence = itBoot2->second;
310 }
311
312 if (m->getDebug()) { m->mothurOut(seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");"); }
313
314 if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
315 confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
316 simpleTax = seqTax.name + ";" + simpleTax;
317 }
318
319 seqTaxIndex = seqTax.parent;
320 seqTax = phyloTree->get(seqTax.parent);
321 }
322
323 if (confidenceTax == "") { confidenceTax = "unknown;"; simpleTax = "unknown;"; }
324
325 return confidenceTax;
326
327 }
328 catch(exception& e) {
329 m->errorOut(e, "Bayesian", "bootstrapResults");
330 exit(1);
331 }
332 }
333 /**************************************************************************************************/
getMostProbableTaxonomy(vector<int> queryKmer)334 int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
335 try {
336 int indexofGenus = 0;
337
338 double maxProbability = -1000000.0;
339 //find taxonomy with highest probability that this sequence is from it
340
341 for (int k = 0; k < genusNodes.size(); k++) {
342 //for each taxonomy calc its probability
343
344 double prob = 0.0000;
345 for (int i = 0; i < queryKmer.size(); i++) { prob += wordGenusProb[queryKmer[i]][k]; }
346
347 //is this the taxonomy with the greatest probability?
348 if (prob > maxProbability) {
349 indexofGenus = genusNodes[k];
350 maxProbability = prob;
351 }
352 }
353
354 return indexofGenus;
355 }
356 catch(exception& e) {
357 m->errorOut(e, "Bayesian", "getMostProbableTaxonomy");
358 exit(1);
359 }
360 }
361 //********************************************************************************************************************
362 //if it is more probable that the reverse compliment kmers are in the template, then we assume the sequence is reversed.
isReversed(vector<int> & queryKmers)363 bool Bayesian::isReversed(vector<int>& queryKmers){
364 try{
365 bool reversed = false;
366 float prob = 0;
367 float reverseProb = 0;
368
369 for (int i = 0; i < queryKmers.size(); i++){
370 int kmer = queryKmers[i];
371 if (kmer >= 0){
372 prob += WordPairDiffArr[kmer].prob;
373 reverseProb += WordPairDiffArr[kmer].reverseProb;
374 }
375 }
376
377 if (reverseProb > prob){ reversed = true; }
378
379 return reversed;
380 }
381 catch(exception& e) {
382 m->errorOut(e, "Bayesian", "isReversed");
383 exit(1);
384 }
385 }
386 //********************************************************************************************************************
generateWordPairDiffArr()387 int Bayesian::generateWordPairDiffArr(){
388 try{
389 Kmer kmer(kmerSize);
390 for (int i = 0; i < WordPairDiffArr.size(); i++) {
391 int reversedWord = kmer.getReverseKmerNumber(i);
392 WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob;
393 }
394
395 return 0;
396 }catch(exception& e) {
397 m->errorOut(e, "Bayesian", "generateWordPairDiffArr");
398 exit(1);
399 }
400 }
401 /**************************************************************************************************/
readProbFile(ifstream & in,ifstream & inNum,string inName,string inNumName)402 void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
403 try{
404 Utils util;
405 //read version
406 string line = util.getline(in); util.gobble(in);
407
408 in >> numKmers; util.gobble(in);
409 //initialze probabilities
410
411 wordGenusProb.resize(numKmers);
412 for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
413
414 int kmer, name, count; count = 0;
415 vector<int> num; num.resize(numKmers);
416 float prob;
417 vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
418 for (int j = 0; j < numKmers; j++) { diffPair tempDiffPair; WordPairDiffArr.push_back(tempDiffPair); }
419
420 //read version
421 string line2 = util.getline(inNum); util.gobble(inNum);
422 float probTemp;
423
424 while (inNum) {
425 inNum >> zeroCountProb[count] >> num[count] >> probTemp;
426 WordPairDiffArr[count].prob = probTemp;
427 count++;
428 util.gobble(inNum);
429
430 if (m->getDebug()) { m->mothurOut("[DEBUG]: " + toString(zeroCountProb[count]) + '\t' + toString(num[count]) + '\t' + toString(numKmers) + "\n"); }
431
432
433 }
434 inNum.close();
435
436 while(in) {
437 in >> kmer;
438
439 //set them all to zero value
440 for (int i = 0; i < genusNodes.size(); i++) {
441 wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
442 }
443
444 //get probs for nonzero values
445 for (int i = 0; i < num[kmer]; i++) {
446 in >> name >> prob;
447 wordGenusProb[kmer][name] = prob;
448 if (m->getDebug()) { m->mothurOut("[DEBUG]: " + toString(name) + '\t' + toString(prob) + '\t' + toString(kmer) + "\n"); }
449 }
450
451 util.gobble(in);
452 }
453 in.close();
454
455 }
456 catch(exception& e) {
457 m->errorOut(e, "Bayesian", "readProbFile");
458 exit(1);
459 }
460 }
461 /**************************************************************************************************/
462
463
464
465
466
467
468