1 /*
2 * blastdb.cpp
3 *
4 *
5 * Created by Pat Schloss on 12/22/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
7 *
8 */
9
10
11 #include "searchdatabase.hpp"
12 #include "sequence.hpp"
13 #include "blastdb.hpp"
14
15 /**************************************************************************************************/
16
BlastDB(string tag,float gO,float gE,float mm,float mM,string b,int tid)17 BlastDB::BlastDB(string tag, float gO, float gE, float mm, float mM, string b, int tid) : SearchDatabase(),
18 gapOpen(gO), gapExtend(gE), match(mm), misMatch(mM) {
19 try {
20 current = CurrentFile::getInstance();
21 count = 0;
22 path = b;
23 threadID = tid;
24
25 Utils util;
26 int randNumber = util.getRandomNumber();
27 string pid = toString(threadID);
28
29 if (m->getDebug()) { m->mothurOut("[DEBUG]: tag = " + tag + "\t pid = " + pid + "\n"); }
30
31 dbFileName = tag + pid + toString(randNumber) + ".template.unaligned.fasta";
32 queryFileName = tag + pid + toString(randNumber) + ".candidate.unaligned.fasta";
33 blastFileName = tag + pid + toString(randNumber) + ".blast";
34
35 //make sure blast exists in the write place
36 if (path == "") { path = current->getBlastPath() + "blast" + PATH_SEPARATOR + "bin" + PATH_SEPARATOR; }
37
38 //test to make sure formatdb exists
39 ifstream in;
40 string formatdbCommand = path + "formatdb" + EXECUTABLE_EXT;
41 formatdbCommand = util.getFullPathName(formatdbCommand);
42 bool ableToOpen = util.openInputFile(formatdbCommand, in, "no error"); in.close();
43 if(!ableToOpen) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe.\n"); m->setControl_pressed(true); }
44
45 //test to make sure formatdb exists
46 ifstream in2;
47 string blastCommand = path + "blastall" + EXECUTABLE_EXT;
48 blastCommand = util.getFullPathName(blastCommand);
49 ableToOpen = util.openInputFile(blastCommand, in2, "no error"); in2.close();
50 if(!ableToOpen) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe.\n"); m->setControl_pressed(true); }
51
52 //test to make sure formatdb exists
53 ifstream in3;
54 string megablastCommand = path + "megablast" + EXECUTABLE_EXT;
55 megablastCommand = util.getFullPathName(megablastCommand);
56 ableToOpen = util.openInputFile(megablastCommand, in3, "no error"); in3.close();
57 if(!ableToOpen) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe.\n"); m->setControl_pressed(true); }
58
59 }
60 catch(exception& e) {
61 m->errorOut(e, "BlastDB", "BlastDB");
62 exit(1);
63 }
64 }
65 /**************************************************************************************************/
66
BlastDB(string b,int tid)67 BlastDB::BlastDB(string b, int tid) : SearchDatabase() {
68 try {
69 current = CurrentFile::getInstance();
70 count = 0;
71 path = b;
72 threadID = tid;
73 Utils util;
74
75 //make sure blast exists in the write place
76 if (path == "") { path = current->getBlastPath() + "blast" + PATH_SEPARATOR + "bin" + PATH_SEPARATOR; }
77
78 int randNumber = util.getRandomNumber();;
79 string pid = toString(threadID);
80 dbFileName = pid + toString(randNumber) + ".template.unaligned.fasta";
81 queryFileName = pid + toString(randNumber) + ".candidate.unaligned.fasta";
82 blastFileName = pid + toString(randNumber) + ".blast";
83
84 //test to make sure formatdb exists
85 ifstream in;
86 string formatdbCommand = path + "formatdb" + EXECUTABLE_EXT;
87 formatdbCommand = util.getFullPathName(formatdbCommand);
88 bool ableToOpen = util.openInputFile(formatdbCommand, in, "no error"); in.close();
89 if(!ableToOpen) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe.\n"); m->setControl_pressed(true); }
90
91 //test to make sure formatdb exists
92 ifstream in2;
93 string blastCommand = path + "blastall" + EXECUTABLE_EXT;
94 blastCommand = util.getFullPathName(blastCommand);
95 ableToOpen = util.openInputFile(blastCommand, in2, "no error"); in2.close();
96 if(!ableToOpen) { m->mothurOut("[ERROR]: " + blastCommand + " file does not exist. mothur requires blastall.exe.\n"); m->setControl_pressed(true); }
97
98 //test to make sure formatdb exists
99 ifstream in3;
100 string megablastCommand = path + "megablast" + EXECUTABLE_EXT;
101 megablastCommand = util.getFullPathName(megablastCommand);
102 ableToOpen = util.openInputFile(megablastCommand, in3, "no error"); in3.close();
103 if(!ableToOpen) { m->mothurOut("[ERROR]: " + megablastCommand + " file does not exist. mothur requires megablast.exe.\n"); m->setControl_pressed(true); }
104
105
106 }
107 catch(exception& e) {
108 m->errorOut(e, "BlastDB", "BlastDB");
109 exit(1);
110 }
111 }
112
113 /**************************************************************************************************/
114
~BlastDB()115 BlastDB::~BlastDB(){
116 try{
117 Utils util;
118 util.mothurRemove(queryFileName); // let's clean stuff up and remove the temp files
119 util.mothurRemove(dbFileName); // let's clean stuff up and remove the temp files
120 util.mothurRemove((dbFileName+".nsq")); // let's clean stuff up and remove the temp files
121 util.mothurRemove((dbFileName+".nsi")); // let's clean stuff up and remove the temp files
122 util.mothurRemove((dbFileName+".nsd")); // let's clean stuff up and remove the temp files
123 util.mothurRemove((dbFileName+".nin")); // let's clean stuff up and remove the temp files
124 util.mothurRemove((dbFileName+".nhr")); // let's clean stuff up and remove the temp files
125 util.mothurRemove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
126 }
127 catch(exception& e) {
128 m->errorOut(e, "BlastDB", "~BlastDB");
129 exit(1);
130 }
131 }
132 /**************************************************************************************************/
133 //assumes you have added all the template sequences using the addSequence function and run generateDB.
findClosestSequences(Sequence * seq,int n,vector<float> & scores) const134 vector<int> BlastDB::findClosestSequences(Sequence* seq, int n, vector<float>& scores) const {
135 try{
136 vector<int> topMatches;
137
138 ofstream queryFile;
139 Utils util;
140 string pid = scrubName(seq->getName());
141
142 util.openOutputFile((queryFileName+pid), queryFile);
143 queryFile << '>' << seq->getName() << endl;
144 queryFile << seq->getUnaligned() << endl;
145 queryFile.close();
146
147 //lock_guard<std::mutex> guard(mutex);
148 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
149 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
150 // long. With this setting, it seems comparable in speed to the suffix tree approach.
151
152 string blastCommand;
153 #if defined NON_WINDOWS
154
155 blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
156 blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
157 #else
158 blastCommand = "\"" + path + "blastall\" -p blastn -d " + "\"" + dbFileName + "\"" + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
159 blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
160 //wrap entire string in ""
161 blastCommand = "\"" + blastCommand + "\"";
162 #endif
163
164 system(blastCommand.c_str());
165
166 ifstream m8FileHandle;
167 util.openInputFile(blastFileName+pid, m8FileHandle, "no error");
168
169 string dummy;
170 int templateAccession;
171 util.gobble(m8FileHandle);
172
173 while(!m8FileHandle.eof()){
174 float searchScore;
175 m8FileHandle >> dummy >> templateAccession >> searchScore;
176
177 //get rest of junk in line
178 while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
179
180 util.gobble(m8FileHandle);
181 topMatches.push_back(templateAccession);
182 scores.push_back(searchScore);
183 }
184 m8FileHandle.close();
185 util.mothurRemove((queryFileName+pid));
186 util.mothurRemove((blastFileName+pid));
187
188 return topMatches;
189 }
190 catch(exception& e) {
191 m->errorOut(e, "BlastDB", "findClosestSequences");
192 exit(1);
193 }
194
195 }
196 /**************************************************************************************************/
197 //assumes you have added all the template sequences using the addSequence function and run generateDB.
findClosestMegaBlast(Sequence * seq,int n,int minPerID)198 vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
199 try{
200 vector<int> topMatches;
201 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, searchScore;
202
203
204 ofstream queryFile;
205 string pid = scrubName(seq->getName());
206
207 Utils util; util.openOutputFile((queryFileName+pid), queryFile);
208 queryFile << '>' << seq->getName() << endl;
209 queryFile << seq->getUnaligned() << endl;
210 queryFile.close();
211
212 // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
213 // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
214 // long. With this setting, it seems comparable in speed to the suffix tree approach.
215 //7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
216 string blastCommand;
217 #if defined NON_WINDOWS
218 blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
219 blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
220 #else
221 blastCommand = "\"" + path + "megablast\" -e 1e-10 -d " + "\"" + dbFileName + "\"" + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
222 blastCommand += (" -i " + (queryFileName+pid) + " -o " + blastFileName+pid);
223 //wrap entire string in ""
224 blastCommand = "\"" + blastCommand + "\"";
225
226 #endif
227 system(blastCommand.c_str());
228
229 ifstream m8FileHandle;
230 util.openInputFile(blastFileName+pid, m8FileHandle, "no error");
231
232 string dummy, eScore;
233 int templateAccession;
234 util.gobble(m8FileHandle);
235
236 while(!m8FileHandle.eof()){
237 m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
238
239 util.gobble(m8FileHandle);
240 if (searchScore >= minPerID) {
241 topMatches.push_back(templateAccession);
242 //Scores.push_back(searchScore);
243 }
244
245 }
246 m8FileHandle.close();
247 util.mothurRemove((queryFileName+pid));
248 util.mothurRemove((blastFileName+pid));
249
250 return topMatches;
251 }
252 catch(exception& e) {
253 m->errorOut(e, "BlastDB", "findClosestMegaBlast");
254 exit(1);
255 }
256 }
257 /**************************************************************************************************/
addSequence(Sequence seq)258 void BlastDB::addSequence(Sequence seq) {
259 try {
260
261 ofstream unalignedFastaFile;
262 Utils util; util.openOutputFileAppend(dbFileName, unalignedFastaFile);
263
264 // generating a fasta file with unaligned template
265 unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
266 unalignedFastaFile << seq.getUnaligned() << endl;
267 unalignedFastaFile.close();
268
269 count++;
270 }
271 catch(exception& e) {
272 m->errorOut(e, "BlastDB", "addSequence");
273 exit(1);
274 }
275 }
276 /**************************************************************************************************/
generateDB()277 void BlastDB::generateDB() {
278 try {
279
280 //m->mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
281
282 string formatdbCommand;
283
284 #if defined NON_WINDOWS
285 formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
286 #else
287 //formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
288
289 formatdbCommand = "\"" + path + "formatdb\" -p F -o T -i " + "\"" + dbFileName + "\"";
290 //wrap entire string in ""
291 formatdbCommand = "\"" + formatdbCommand + "\"";
292 #endif
293
294 system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
295 // option tells formatdb that seqs are DNA, not prot
296 }
297 catch(exception& e) {
298 m->errorOut(e, "BlastDB", "generateDB");
299 exit(1);
300 }
301 }
302 /**************************************************************************************************/
scrubName(string seqName) const303 string BlastDB::scrubName(string seqName) const {
304 try {
305
306 string cleanName = "";
307
308 for (int i = 0; i < seqName.length(); i++) {
309 if (isalnum(seqName[i])) { cleanName += seqName[i]; }
310 else { cleanName += "_"; }
311 }
312
313 return cleanName;
314 }
315 catch(exception& e) {
316 m->errorOut(e, "BlastDB", "scrubName");
317 exit(1);
318 }
319 }
320 /**************************************************************************************************/
321