1 /*
2  *  blastdb.cpp
3  *
4  *
5  *  Created by Pat Schloss on 12/22/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9 
10 
11 #include "searchdatabase.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
14 
15 /**************************************************************************************************/
16 
BlastDB(string tag,float gO,float gE,float mm,float mM,string b,int tid)17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM, string b, int tid) : SearchDatabase(),
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
19 	try {
20         current = CurrentFile::getInstance();
21 		count = 0;
22 		path = b;
23 		threadID = tid;
24 
25         Utils util;
26 		int randNumber = util.getRandomNumber();
27 		string pid = toString(threadID);
28 
29         if (m->getDebug()) { m->mothurOut("[DEBUG]: tag = " + tag + "\t pid = " + pid + "\n"); }
30 
31 		dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
32 		queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
33 		blastFileName = tag + pid + toString(randNumber) + ".blast";
34 
35 		//make sure blast exists in the write place
36 		if (path == "") {  path = current->getBlastPath() + "blast" + PATH_SEPARATOR + "bin" + PATH_SEPARATOR; }
37 
38 		//test to make sure formatdb exists
39 		ifstream in;
40         string formatdbCommand = path + "formatdb" + EXECUTABLE_EXT;
41 		formatdbCommand = util.getFullPathName(formatdbCommand);
42 		bool ableToOpen = util.openInputFile(formatdbCommand, in, "no error"); in.close();
43 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe.\n"); m->setControl_pressed(true); }
44 
45 		//test to make sure formatdb exists
46 		ifstream in2;
47         string blastCommand = path + "blastall" + EXECUTABLE_EXT;
48 		blastCommand = util.getFullPathName(blastCommand);
49 		ableToOpen = util.openInputFile(blastCommand, in2, "no error"); in2.close();
50 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe.\n");  m->setControl_pressed(true);  }
51 
52 		//test to make sure formatdb exists
53 		ifstream in3;
54         string megablastCommand = path + "megablast" + EXECUTABLE_EXT;
55 		megablastCommand = util.getFullPathName(megablastCommand);
56 		ableToOpen = util.openInputFile(megablastCommand, in3, "no error"); in3.close();
57 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " +  megablastCommand + " file does not exist. mothur requires megablast.exe.\n");  m->setControl_pressed(true);  }
58 
59 	}
60 	catch(exception& e) {
61 		m->errorOut(e, "BlastDB", "BlastDB");
62 		exit(1);
63 	}
64 }
65 /**************************************************************************************************/
66 
BlastDB(string b,int tid)67 BlastDB::BlastDB(string b, int tid) : SearchDatabase() {
68 	try {
69         current = CurrentFile::getInstance();
70 		count = 0;
71 		path = b;
72 		threadID = tid;
73         Utils util;
74 
75 		//make sure blast exists in the write place
76 		if (path == "") {  path = current->getBlastPath() + "blast" + PATH_SEPARATOR + "bin" + PATH_SEPARATOR; }
77 
78 		int randNumber = util.getRandomNumber();;
79 		string pid = toString(threadID);
80 		dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
81 		queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
82 		blastFileName = pid + toString(randNumber) + ".blast";
83 
84         //test to make sure formatdb exists
85         ifstream in;
86         string formatdbCommand = path + "formatdb" + EXECUTABLE_EXT;
87         formatdbCommand = util.getFullPathName(formatdbCommand);
88         bool ableToOpen = util.openInputFile(formatdbCommand, in, "no error"); in.close();
89 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " +  formatdbCommand + " file does not exist. mothur requires formatdb.exe.\n");  m->setControl_pressed(true);  }
90 
91         //test to make sure formatdb exists
92         ifstream in2;
93         string blastCommand = path + "blastall" + EXECUTABLE_EXT;
94         blastCommand = util.getFullPathName(blastCommand);
95         ableToOpen = util.openInputFile(blastCommand, in2, "no error"); in2.close();
96 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe.\n");  m->setControl_pressed(true); }
97 
98         //test to make sure formatdb exists
99         ifstream in3;
100         string megablastCommand = path + "megablast" + EXECUTABLE_EXT;
101         megablastCommand = util.getFullPathName(megablastCommand);
102         ableToOpen = util.openInputFile(megablastCommand, in3, "no error"); in3.close();
103 		if(!ableToOpen) {	m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe.\n");  m->setControl_pressed(true);  }
104 
105 
106 	}
107 	catch(exception& e) {
108 		m->errorOut(e, "BlastDB", "BlastDB");
109 		exit(1);
110 	}
111 }
112 
113 /**************************************************************************************************/
114 
~BlastDB()115 BlastDB::~BlastDB(){
116 	try{
117         Utils util;
118 		util.mothurRemove(queryFileName);				//	let's clean stuff up and remove the temp files
119 		util.mothurRemove(dbFileName);					//	let's clean stuff up and remove the temp files
120 		util.mothurRemove((dbFileName+".nsq"));					//	let's clean stuff up and remove the temp files
121 		util.mothurRemove((dbFileName+".nsi"));					//	let's clean stuff up and remove the temp files
122 		util.mothurRemove((dbFileName+".nsd"));					//	let's clean stuff up and remove the temp files
123 		util.mothurRemove((dbFileName+".nin"));					//	let's clean stuff up and remove the temp files
124 		util.mothurRemove((dbFileName+".nhr"));					//	let's clean stuff up and remove the temp files
125 		util.mothurRemove(blastFileName.c_str());				//	let's clean stuff up and remove the temp files
126 	}
127 	catch(exception& e) {
128 		m->errorOut(e, "BlastDB", "~BlastDB");
129 		exit(1);
130 	}
131 }
132 /**************************************************************************************************/
133 //assumes you have added all the template sequences using the addSequence function and run generateDB.
findClosestSequences(Sequence * seq,int n,vector<float> & scores) const134 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n, vector<float>& scores) const {
135 	try{
136 		vector<int> topMatches;
137 
138 		ofstream queryFile;
139         Utils util;
140 		string pid = scrubName(seq->getName());
141 
142         util.openOutputFile((queryFileName+pid), queryFile);
143 		queryFile << '>' << seq->getName() << endl;
144 		queryFile << seq->getUnaligned() << endl;
145 		queryFile.close();
146 
147 		//lock_guard<std::mutex> guard(mutex);
148 		//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
149 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
150 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
151 
152 		string blastCommand;
153 		#if defined NON_WINDOWS
154 
155 			blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
156 			blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
157 		#else
158 			blastCommand =  "\"" + path + "blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
159 			blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
160 			//wrap entire string in ""
161 			blastCommand = "\"" + blastCommand + "\"";
162 		#endif
163 
164 		system(blastCommand.c_str());
165 
166 		ifstream m8FileHandle;
167 		util.openInputFile(blastFileName+pid, m8FileHandle, "no error");
168 
169 		string dummy;
170 		int templateAccession;
171 		util.gobble(m8FileHandle);
172 
173 		while(!m8FileHandle.eof()){
174             float searchScore;
175 			m8FileHandle >> dummy >> templateAccession >> searchScore;
176 
177 			//get rest of junk in line
178 			while (!m8FileHandle.eof())	{	char c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}	}
179 
180 			util.gobble(m8FileHandle);
181 			topMatches.push_back(templateAccession);
182             scores.push_back(searchScore);
183 		}
184 		m8FileHandle.close();
185 		util.mothurRemove((queryFileName+pid));
186 		util.mothurRemove((blastFileName+pid));
187 
188 		return topMatches;
189 	}
190 	catch(exception& e) {
191 		m->errorOut(e, "BlastDB", "findClosestSequences");
192 		exit(1);
193 	}
194 
195 }
196 /**************************************************************************************************/
197 //assumes you have added all the template sequences using the addSequence function and run generateDB.
findClosestMegaBlast(Sequence * seq,int n,int minPerID)198 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
199 	try{
200 		vector<int> topMatches;
201 		float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, searchScore;
202 
203 
204 		ofstream queryFile;
205 		string pid = scrubName(seq->getName());
206 
207         Utils util; util.openOutputFile((queryFileName+pid), queryFile);
208 		queryFile << '>' << seq->getName() << endl;
209 		queryFile << seq->getUnaligned() << endl;
210 		queryFile.close();
211 
212 		//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
213 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
214 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
215 //7000004128189528left	0	100		66	0	0	1	66	61	126	1e-31	 131
216 		string blastCommand;
217 		#if defined NON_WINDOWS
218 			blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
219 			blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
220 		#else
221 			blastCommand =  "\"" + path + "megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
222 			blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
223 			//wrap entire string in ""
224 			blastCommand = "\"" + blastCommand + "\"";
225 
226 		#endif
227 		system(blastCommand.c_str());
228 
229 		ifstream m8FileHandle;
230 		util.openInputFile(blastFileName+pid, m8FileHandle, "no error");
231 
232 		string dummy, eScore;
233 		int templateAccession;
234 		util.gobble(m8FileHandle);
235 
236 		while(!m8FileHandle.eof()){
237 			m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
238 
239 			util.gobble(m8FileHandle);
240 			if (searchScore >= minPerID) {
241 				topMatches.push_back(templateAccession);
242 				//Scores.push_back(searchScore);
243 			}
244 
245 		}
246 		m8FileHandle.close();
247 		util.mothurRemove((queryFileName+pid));
248 		util.mothurRemove((blastFileName+pid));
249 
250 		return topMatches;
251 	}
252 	catch(exception& e) {
253 		m->errorOut(e, "BlastDB", "findClosestMegaBlast");
254 		exit(1);
255 	}
256 }
257 /**************************************************************************************************/
addSequence(Sequence seq)258 void BlastDB::addSequence(Sequence seq) {
259 	try {
260 
261 		ofstream unalignedFastaFile;
262         Utils util; util.openOutputFileAppend(dbFileName, unalignedFastaFile);
263 
264 		//	generating a fasta file with unaligned template
265 		unalignedFastaFile << '>' << count << endl;					//	sequences, which will be input to formatdb
266 		unalignedFastaFile << seq.getUnaligned() << endl;
267 		unalignedFastaFile.close();
268 
269 		count++;
270 	}
271 	catch(exception& e) {
272 		m->errorOut(e, "BlastDB", "addSequence");
273 		exit(1);
274 	}
275 }
276 /**************************************************************************************************/
generateDB()277 void BlastDB::generateDB() {
278 	try {
279 
280 		//m->mothurOut("Generating the temporary BLAST database...\t");	cout.flush();
281 
282 		string formatdbCommand;
283 
284 		#if defined NON_WINDOWS
285 			formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName;	//	format the database, -o option gives us the ability
286 		#else
287 			//formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName;	//	format the database, -o option gives us the ability
288 
289 			formatdbCommand = "\"" + path + "formatdb\" -p F -o T -i " + "\"" +  dbFileName + "\"";
290 			//wrap entire string in ""
291 			formatdbCommand = "\"" + formatdbCommand + "\"";
292 		#endif
293 
294 		system(formatdbCommand.c_str());								//	to get the right sequence names, i think. -p F
295 																	//	option tells formatdb that seqs are DNA, not prot
296 	}
297 	catch(exception& e) {
298 		m->errorOut(e, "BlastDB", "generateDB");
299 		exit(1);
300 	}
301 }
302 /**************************************************************************************************/
scrubName(string seqName) const303 string BlastDB::scrubName(string seqName) const {
304 	try {
305 
306 		string cleanName = "";
307 
308 		for (int i = 0; i < seqName.length(); i++) {
309 			if (isalnum(seqName[i])) { cleanName += seqName[i]; }
310 			else { cleanName += "_";  }
311 		}
312 
313 		return cleanName;
314 	}
315 	catch(exception& e) {
316 		m->errorOut(e, "BlastDB", "scrubName");
317 		exit(1);
318 	}
319 }
320 /**************************************************************************************************/
321