1 /*
2 * pairwiseseqscommand.cpp
3 * Mothur
4 *
5 * Created by westcott on 10/20/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "pairwiseseqscommand.h"
11 #include "kmerdist.hpp"
12
13 //**********************************************************************************************************************
setParameters()14 vector<string> PairwiseSeqsCommand::setParameters(){
15 try {
16 CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn","column",false,false); parameters.push_back(pcolumn);
17 CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn","",false,false); parameters.push_back(poldfasta);
18 CommandParameter pfitcalc("fitcalc", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pfitcalc);
19 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","phylip-column",false,true,true); parameters.push_back(pfasta);
20 CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false); parameters.push_back(palign);
21 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch);
22 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
23 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
24 CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
25 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
26 CommandParameter poutput("output", "Multiple", "column-lt-square-phylip", "column", "", "", "","phylip-column",false,false,true); parameters.push_back(poutput);
27 CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "","",false,false); parameters.push_back(pcalc);
28 CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcountends);
29 CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcompress);
30 CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "","",false,false,true); parameters.push_back(pcutoff);
31 CommandParameter pkcutoff("kmercutoff", "Number", "", "-1.0", "", "", "","",false,false,true);
32 parameters.push_back(pkcutoff);
33 CommandParameter pksize("ksize", "Number", "", "8", "", "", "","",false,false); parameters.push_back(pksize);
34 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
35 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37
38 abort = false; calledHelp = false;
39
40 vector<string> tempOutNames;
41 outputTypes["phylip"] = tempOutNames;
42 outputTypes["column"] = tempOutNames;
43
44 vector<string> myArray;
45 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
46 return myArray;
47 }
48 catch(exception& e) {
49 m->errorOut(e, "PairwiseSeqsCommand", "setParameters");
50 exit(1);
51 }
52 }
53 //**********************************************************************************************************************
getHelpString()54 string PairwiseSeqsCommand::getHelpString(){
55 try {
56 string helpString = "";
57 helpString += "The pairwise.seqs command reads a fasta file and creates distance matrix.\n";
58 helpString += "The pairwise.seqs command parameters are fasta, align, match, mismatch, gapopen, gapextend, calc, output, cutoff, oldfasta, column, processors.\n";
59 helpString += "The fasta parameter is required.\n";
60 helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n";
61 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
62 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n";
63 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
64 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n";
65 helpString += "The calc parameter allows you to specify the method of calculating the distances. Your options are: nogaps, onegap or eachgap. The default is onegap.\n";
66 helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance. Your options are: T or F. The default is T.\n";
67 helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
68 helpString += "The kmercutoff parameter allows you to specify maximum kmer distance. The kmercutoff is used to reduce the processing time by avoiding the aligning and distance calculations for sequences with a kmer distance above the cutoff. Kmer distance are calculated using methods described here, Edgar, R. C. (2004). Muscle: a multiple sequence alignment method with reduced time and space complexity. BMC Bioinformatics, 5:113. The defaults vary based on the cutoff selected. Cutoff <= 0.05 -> kmerCutoff = -1.0, cutoff 0.05 - 0.15 -> kmerCutoff = -0.50, cutoff 0.15-0.25 -> kmerCutoff = -0.25, cutoff > 0.25 -> kmerCutoff = -0.10.\n";
69 helpString += "The ksize parameter allows you to specify the kmer size for calculating the kmer distance. The default is 7.\n";
70 helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
71 helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
72 helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed. The default is false.\n";
73 helpString += "The pairwise.seqs command should be in the following format: \n";
74 helpString += "pairwise.seqs(fasta=yourfastaFile, align=yourAlignmentMethod) \n";
75 helpString += "Example pairwise.seqs(fasta=candidate.fasta, align=blast)\n";
76
77 return helpString;
78 }
79 catch(exception& e) {
80 m->errorOut(e, "PairwiseSeqsCommand", "getHelpString");
81 exit(1);
82 }
83 }
84 //**********************************************************************************************************************
getOutputPattern(string type)85 string PairwiseSeqsCommand::getOutputPattern(string type) {
86 try {
87 string pattern = "";
88
89 if (type == "phylip") { pattern = "[filename],[outputtag],dist"; }
90 else if (type == "column") { pattern = "[filename],dist"; }
91 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
92
93 return pattern;
94 }
95 catch(exception& e) {
96 m->errorOut(e, "PairwiseSeqsCommand", "getOutputPattern");
97 exit(1);
98 }
99 }
100 //**********************************************************************************************************************
PairwiseSeqsCommand(string option)101 PairwiseSeqsCommand::PairwiseSeqsCommand(string option) : Command() {
102 try {
103 if(option == "help") { help(); abort = true; calledHelp = true; }
104 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
105 else if(option == "category") { abort = true; calledHelp = true; }
106
107 else {
108 OptionParser parser(option, setParameters());
109 map<string, string> parameters = parser.getParameters();
110
111 ValidParameters validParameter;
112
113 fastaFileName = validParameter.validFile(parameters, "fasta");
114 if (fastaFileName == "not found") {
115 fastaFileName = current->getFastaFile();
116 if (fastaFileName != "") { m->mothurOut("Using " + fastaFileName + " as input file for the fasta parameter.\n"); }
117 else { m->mothurOut("[ERROR]: You have no current fastafile and the fasta parameter is required.\n"); abort = true; }
118 }else if (fastaFileName == "not open") { abort = true; }
119 else{ current->setFastaFile(fastaFileName); }
120
121 if (outputdir == "") { outputdir += util.hasPath(fastaFileName); }
122
123 oldfastafile = validParameter.validFile(parameters, "oldfasta");
124 if (oldfastafile == "not found") { oldfastafile = ""; }
125 else if (oldfastafile == "not open") { abort = true; }
126
127 column = validParameter.validFile(parameters, "column");
128 if (column == "not found") { column = ""; }
129 else if (column == "not open") { abort = true; }
130 else { current->setColumnFile(column); }
131
132 //check for optional parameter and set defaults
133 // ...at some point should added some additional type checking...
134 string temp;
135 temp = validParameter.valid(parameters, "match"); if (temp == "not found"){ temp = "1.0"; }
136 util.mothurConvert(temp, match);
137
138 temp = validParameter.valid(parameters, "mismatch"); if (temp == "not found"){ temp = "-1.0"; }
139 util.mothurConvert(temp, misMatch);
140 if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
141
142 temp = validParameter.valid(parameters, "gapopen"); if (temp == "not found"){ temp = "-2.0"; }
143 util.mothurConvert(temp, gapOpen);
144 if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
145
146 temp = validParameter.valid(parameters, "gapextend"); if (temp == "not found"){ temp = "-1.0"; }
147 util.mothurConvert(temp, gapExtend);
148 if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
149
150 temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
151 processors = current->setProcessors(temp);
152
153 temp = validParameter.valid(parameters, "cutoff"); if(temp == "not found"){ temp = "1.0"; }
154 util.mothurConvert(temp, cutoff);
155
156 temp = validParameter.valid(parameters, "kmercutoff");
157 if(temp == "not found"){
158 if (cutoff <= 0.05) { kmerCutoff = -1.0; }
159 else if ((cutoff > 0.05) && (cutoff <= 0.15)) { kmerCutoff = -0.50; }
160 else if ((cutoff > 0.15) && (cutoff <= 0.25)) { kmerCutoff = -0.25; }
161 else { kmerCutoff = -0.1; }
162
163 }else { util.mothurConvert(temp, kmerCutoff); }
164
165 temp = validParameter.valid(parameters, "ksize"); if (temp == "not found"){ temp = "7"; }
166 util.mothurConvert(temp, kmerSize);
167
168 temp = validParameter.valid(parameters, "countends"); if(temp == "not found"){ temp = "T"; }
169 countends = util.isTrue(temp);
170
171 temp = validParameter.valid(parameters, "compress"); if(temp == "not found"){ temp = "F"; }
172 compress = util.isTrue(temp);
173
174 align = validParameter.valid(parameters, "align"); if (align == "not found"){ align = "needleman"; }
175
176 if (align == "blast") {
177 string blastlocation = "";
178 vector<string> locations = current->getLocations();
179 string path = current->getProgramPath();
180 bool foundTool = util.findBlastLocation(blastlocation, path, locations);
181
182 if (!foundTool){
183 m->mothurOut("[WARNING]: Unable to locate blast executables, cannot use blast as align method. Using needleman instead.\n"); align = "needleman";
184 }
185 }
186
187 temp = validParameter.valid(parameters, "fitcalc"); if(temp == "not found"){ temp = "F"; }
188 fitCalc = util.isTrue(temp);
189
190 output = validParameter.valid(parameters, "output"); if(output == "not found"){ output = "column"; }
191 if (output=="phylip") { output = "lt"; }
192 if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column.\n"); output = "column"; }
193
194 calc = validParameter.valid(parameters, "calc");
195 if (calc == "not found") { calc = "onegap"; }
196 else {
197 if (calc == "default") { calc = "onegap"; }
198 }
199
200 if ((calc != "nogaps") && (calc != "eachgap") && (calc != "onegap")) { m->mothurOut(calc + " is not a valid calculator for pairwise.seqs. Options are onegap, eachgap and nogaps. I will use onegap.\n"); calc = "onegap"; }
201
202 }
203
204 }
205 catch(exception& e) {
206 m->errorOut(e, "PairwiseSeqsCommand", "PairwiseSeqsCommand");
207 exit(1);
208 }
209 }
210 //**********************************************************************************************************************
PairwiseSeqsCommand(StorageDatabase * & storageDB,vector<vector<int>> kDB,vector<int> lths,string outputFileRoot,double cut,string outputformat,int proc)211 PairwiseSeqsCommand::PairwiseSeqsCommand(StorageDatabase*& storageDB, vector< vector< int > > kDB, vector< int > lths, string outputFileRoot, double cut, string outputformat, int proc) {
212 try {
213 abort = false; calledHelp = false;
214 vector<string> tempOutNames;
215 outputTypes["phylip"] = tempOutNames;
216 outputTypes["column"] = tempOutNames;
217
218 //defaults
219 calc = "onegap";
220 countends = true;
221 fitCalc = false;
222 cutoff = cut;
223 processors = proc;
224 compress = false;
225 output = outputformat;
226 match = 1.0;
227 misMatch = -1.0;
228 gapOpen = -2.0;
229 gapExtend = -1.0;
230 align = "needleman";
231 kmerSize = 7;
232 kmerDB = kDB;
233 lengths = lths;
234 if (cutoff <= 0.05) { kmerCutoff = -1.0; }
235 else if ((cutoff > 0.05) && (cutoff <= 0.15)) { kmerCutoff = -0.50; }
236 else if ((cutoff > 0.15) && (cutoff <= 0.25)) { kmerCutoff = -0.25; }
237 else { kmerCutoff = -0.1; }
238
239 longestBase = 2000; //will need to update this in driver if we find sequences with more bases. hardcoded so we don't have the pre-read user fasta file.
240 numDistsBelowCutoff = 0;
241
242 alignDB = storageDB;
243 long long numSeqs = alignDB->getNumSeqs();
244
245 if (numSeqs < 2) { m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n"); return; }
246
247 string outputFile;
248 map<string, string> variables;
249 variables["[filename]"] = outputFileRoot;
250
251 if (output == "lt") { //does the user want lower triangle phylip formatted file
252 variables["[outputtag]"] = "phylip";
253 outputFile = getOutputFileName("phylip", variables);
254 util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
255 }else if (output == "column") { //user wants column format
256 if (fitCalc) { variables["[outputtag]"] = "fit"; }
257 outputFile = getOutputFileName("column", variables);
258 outputTypes["column"].push_back(outputFile);
259 util.mothurRemove(outputFile);
260 }
261
262 time_t start, end; time(&start);
263
264 m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
265
266 createProcesses(outputFile);
267
268 time(&end);
269 m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(numSeqs) + " sequences. " + toString(numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
270
271 m->mothurOut("\nOutput File Names:\n"); m->mothurOut(outputFile+"\n\n");
272
273 }
274 catch(exception& e) {
275 m->errorOut(e, "DistanceCommand", "DistanceCommand");
276 exit(1);
277 }
278 }
279 //**********************************************************************************************************************
280
execute()281 int PairwiseSeqsCommand::execute(){
282 try {
283 if (abort) { if (calledHelp) { return 0; } return 2; }
284
285 time_t start, end; time(&start);
286
287 longestBase = 2000; //will need to update this in driver if we find sequences with more bases. hardcoded so we don't have the pre-read user fasta file.
288 numDistsBelowCutoff = 0;
289
290 ifstream inFASTA; util.openInputFile(fastaFileName, inFASTA);
291 alignDB = new SequenceDB(inFASTA, kmerSize, kmerDB, lengths);
292 inFASTA.close();
293
294 //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
295 if ((oldfastafile != "") && (column != "")) { if (!(sanityCheck())) { return 0; } }
296
297 if (m->getControl_pressed()) { delete alignDB; return 0; }
298
299 long long numSeqs = alignDB->getNumSeqs();
300
301 string outputFile = "";
302
303 map<string, string> variables;
304 variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastaFileName));
305 if ((oldfastafile != "") && (column != "")) { variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(oldfastafile)); }
306
307 if (output == "lt") { //does the user want lower triangle phylip formatted file
308 variables["[outputtag]"] = "phylip";
309 outputFile = getOutputFileName("phylip", variables);
310 util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
311 }else if (output == "column") { //user wants column format
312 if (fitCalc) { variables["[outputtag]"] = "fit"; }
313 outputFile = getOutputFileName("column", variables);
314 outputTypes["column"].push_back(outputFile);
315 util.mothurRemove(outputFile);
316 }else { //assume square
317 variables["[outputtag]"] = "square";
318 outputFile = getOutputFileName("phylip", variables);
319 util.mothurRemove(outputFile);
320 outputTypes["phylip"].push_back(outputFile);
321 }
322
323 m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
324
325 createProcesses(outputFile);
326
327 delete alignDB;
328
329 if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outputFile); return 0; }
330
331 if(util.isBlank(outputFile)) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.\n"); }
332
333 //append the old column file to the new one
334 if ((oldfastafile != "") && (column != "")) {
335 //we had to rename the column file so we didnt overwrite above, but we want to keep old name
336 if (outputFile == column) {
337 string tempcolumn = column + ".old";
338 util.appendFiles(tempcolumn, outputFile);
339 util.mothurRemove(tempcolumn);
340 }else{
341 util.appendFiles(outputFile, column);
342 util.mothurRemove(outputFile);
343 outputFile = column;
344 }
345 outputTypes["column"].clear(); outputTypes["column"].push_back(outputFile);
346 }
347
348 if (compress) {
349 m->mothurOut("Compressing...\n");
350 m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)\n");
351 system(("gzip -v " + outputFile).c_str());
352 outputNames.push_back(outputFile + ".gz");
353 }else { outputNames.push_back(outputFile); }
354
355 time(&end);
356 m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(numSeqs) + " sequences. " + toString(numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
357
358 if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outputFile); return 0; }
359
360 //set phylip file as new current phylipfile
361 string currentName = "";
362 itTypes = outputTypes.find("phylip");
363 if (itTypes != outputTypes.end()) {
364 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
365 }
366
367 //set column file as new current columnfile
368 itTypes = outputTypes.find("column");
369 if (itTypes != outputTypes.end()) {
370 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
371 }
372
373 m->mothurOut("\nOutput File Names: \n");
374 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
375
376
377 return 0;
378 }
379 catch(exception& e) {
380 m->errorOut(e, "PairwiseSeqsCommand", "execute");
381 exit(1);
382 }
383 }
384
385 /**************************************************************************************************/
386 struct pairwiseData {
387 string align, distcalcType, outputFileName;
388 unsigned long long start, end;
389 long long count;
390 float match, misMatch, gapOpen, gapExtend, cutoff, kmerCutoff;
391 int longestBase, kmerSize;
392 bool countends;
393
394 vector< vector< int > > kmerDB; //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
395 vector< vector< int > > oldkmerDB; //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
396 vector< int > lengths;
397 vector< int > oldlengths;
398
399 StorageDatabase* alignDB;
400 SequenceDB oldFastaDB;
401 OutputWriter* threadWriter;
402 MothurOut* m;
403 Utils util;
404
pairwiseDatapairwiseData405 pairwiseData(){}
pairwiseDatapairwiseData406 pairwiseData(OutputWriter* ofn) {
407 threadWriter = ofn;
408 m = MothurOut::getInstance();
409 }
410
pairwiseDatapairwiseData411 pairwiseData(string ofn) {
412 outputFileName = ofn;
413 m = MothurOut::getInstance();
414 }
415
setVariablespairwiseData416 void setVariables(string al, string di, bool co, string op, StorageDatabase* DB, SequenceDB oldDB, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, float cu, float kcut, int ksz, vector< vector< int > > kdb, vector< int > le, vector< vector< int > > okdb, vector< int > ole) {
417 align = al;
418 distcalcType = di;
419 countends = co;
420 alignDB = DB;
421 oldFastaDB = oldDB;
422 cutoff = cu;
423 start = st;
424 end = en;
425 match = ma;
426 misMatch = misMa;
427 gapOpen = gapO;
428 gapExtend = gapE;
429 longestBase = thr;
430 kmerDB = kdb;
431 oldkmerDB = okdb;
432 lengths = le;
433 oldlengths = ole;
434 kmerSize = ksz;
435 kmerCutoff = kcut;
436 count = 0;
437 }
438 };
439 /***********************************************************************/
getUniqueKmers(vector<int> seqsKmers,int i)440 vector<kmerCount> getUniqueKmers(vector<int> seqsKmers, int i){
441 vector<kmerCount> uniques;
442
443 for (int k = 0; k < seqsKmers.size(); k++) {
444 if (seqsKmers[k] != 0) {
445 kmerCount thisKmer(k, seqsKmers[k]);
446 uniques.push_back(thisKmer);
447 }
448 }
449
450 return uniques;
451 }
452
453 /**************************************************************************************************/
454 //the higher the kmercutoff the higher the aligned dist. As kmercutoff approaches 0, aligned dist aproaches 1.
driverColumn(pairwiseData * params)455 int driverColumn(pairwiseData* params){
456 try {
457 int startTime = time(NULL);
458
459 Alignment* alignment;
460 if(params->align == "gotoh") { alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase); }
461 else if(params->align == "needleman") { alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase); }
462 else if(params->align == "blast") { alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch); }
463 else if(params->align == "noalign") { alignment = new NoAlign(); }
464 else {
465 params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
466 alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
467 }
468
469 ValidCalculators validCalculator;
470 DistCalc* distCalculator;
471 if (params->countends) {
472 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
473 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
474 else if (params->distcalcType == "eachgap") { distCalculator = new eachGapDist(params->cutoff); }
475 else if (params->distcalcType == "onegap") { distCalculator = new oneGapDist(params->cutoff); }
476 //else if (params->distcalcType == "onegap") { distCalculator = new oneGapDist(1.0); }
477 }
478 }else {
479 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
480 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
481 else if (params->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(params->cutoff); }
482 else if (params->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(params->cutoff); }
483 }
484 }
485
486 KmerDist kmerDistCalculator(params->kmerSize);
487 double kmerCutoff = params->kmerCutoff;
488
489 for(int i=params->start;i<params->end;i++){
490
491 Sequence seq = params->alignDB->getSeq(i);
492 vector<kmerCount> seqA = getUniqueKmers(params->kmerDB[i], i);
493
494 for(int j=0;j<i;j++){
495
496 if (params->m->getControl_pressed()) { break; }
497
498 vector<int> seqB = params->kmerDB[j];
499
500 int length = min(params->lengths[i], params->lengths[j]);
501
502 vector<double> kmerDist = kmerDistCalculator.calcDist(seqA, seqB, length);
503
504 if (kmerDist[0] <= kmerCutoff) {
505 Sequence seqI = seq;
506 Sequence seqJ = params->alignDB->getSeq(j);
507
508 if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
509 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
510
511 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
512
513 seqI.setAligned(alignment->getSeqAAln());
514 seqJ.setAligned(alignment->getSeqBAln());
515
516 double dist = distCalculator->calcDist(seqI, seqJ);
517
518 if(dist <= params->cutoff){ params->count++; params->threadWriter->write(seqI.getName() + ' ' + seqJ.getName() + ' ' + toString(dist) + "\n"); }
519 }
520 }
521 if((i+1) % 100 == 0){ params->m->mothurOutJustToScreen(toString(i+1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
522 }
523 params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
524
525 delete alignment; delete distCalculator;
526
527 return 0;
528 }
529 catch(exception& e) {
530 params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
531 exit(1);
532 }
533 }
534 /**************************************************************************************************/
driverFitCalc(pairwiseData * params)535 int driverFitCalc(pairwiseData* params){
536 try {
537 int startTime = time(NULL);
538
539 Alignment* alignment;
540 if(params->align == "gotoh") { alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase); }
541 else if(params->align == "needleman") { alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase); }
542 else if(params->align == "blast") { alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch); }
543 else if(params->align == "noalign") { alignment = new NoAlign(); }
544 else {
545 params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
546 alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
547 }
548
549 ValidCalculators validCalculator;
550 DistCalc* distCalculator;
551 if (params->countends) {
552 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
553 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
554 else if (params->distcalcType == "eachgap") { distCalculator = new eachGapDist(params->cutoff); }
555 else if (params->distcalcType == "onegap") { distCalculator = new oneGapDist(params->cutoff); }
556 }
557 }else {
558 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
559 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(params->cutoff); }
560 else if (params->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(params->cutoff); }
561 else if (params->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(params->cutoff); }
562 }
563 }
564
565 KmerDist kmerDistCalculator(params->kmerSize);
566 double kmerCutoff = params->kmerCutoff;
567
568 for(int i=params->start;i<params->end;i++){ //for each oldDB fasta seq calc the distance to every new seq in alignDB
569
570 Sequence seq = params->oldFastaDB.getSeq(i);
571 vector<kmerCount> seqA = getUniqueKmers(params->oldkmerDB[i], i);
572
573 for(int j = 0; j < params->alignDB->getNumSeqs(); j++){
574
575 if (params->m->getControl_pressed()) { break; }
576
577 vector<int> seqB = params->kmerDB[j];
578
579 int length = min(params->oldlengths[i], params->lengths[j]);
580
581 vector<double> kmerDist = kmerDistCalculator.calcDist(seqA, seqB, length);
582
583 if (kmerDist[0] <= kmerCutoff) {
584 Sequence seqI = seq;
585 Sequence seqJ = params->alignDB->getSeq(j);
586
587 if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
588 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
589
590 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
591
592 seqI.setAligned(alignment->getSeqAAln());
593 seqJ.setAligned(alignment->getSeqBAln());
594
595 double dist = distCalculator->calcDist(seqI, seqJ);
596
597 if(dist <= params->cutoff){ params->count++; params->threadWriter->write(seqI.getName() + ' ' + seqJ.getName() + ' ' + toString(dist) + "\n"); }
598 }
599 }
600
601 if((i+1) % 100 == 0){ params->m->mothurOutJustToScreen(toString(i+1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
602 }
603 params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
604
605 delete alignment; delete distCalculator;
606
607 return 0;
608 }
609 catch(exception& e) {
610 params->m->errorOut(e, "PairwiseSeqsCommand", "driverFitCalc");
611 exit(1);
612 }
613 }
614 /**************************************************************************************************/
driverLt(pairwiseData * params)615 int driverLt(pairwiseData* params){
616 try {
617
618 int startTime = time(NULL);
619
620 Alignment* alignment;
621 if(params->align == "gotoh") { alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase); }
622 else if(params->align == "needleman") { alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase); }
623 else if(params->align == "blast") { alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch); }
624 else if(params->align == "noalign") { alignment = new NoAlign(); }
625 else {
626 params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
627 alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
628 }
629
630 ValidCalculators validCalculator;
631 DistCalc* distCalculator;
632 double cutoff = 1.0;
633 if (params->countends) {
634 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
635 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
636 else if (params->distcalcType == "eachgap") { distCalculator = new eachGapDist(cutoff); }
637 else if (params->distcalcType == "onegap") { distCalculator = new oneGapDist(cutoff); }
638 }
639 }else {
640 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
641 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
642 else if (params->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(cutoff); }
643 else if (params->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(cutoff); }
644 }
645 }
646
647 //column file
648 ofstream outFile;
649 params->util.openOutputFile(params->outputFileName, outFile);
650 outFile.setf(ios::fixed, ios::showpoint);
651 outFile << setprecision(4);
652
653 if(params->start == 0){ outFile << params->alignDB->getNumSeqs() << endl; }
654
655 for(int i=params->start;i<params->end;i++){
656
657 Sequence seq = params->alignDB->getSeq(i);
658 if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
659
660 string name = seq.getName();
661 if (name.length() < 10) { while (name.length() < 10) { name += " "; } seq.setName(name); } //pad with spaces to make compatible
662 outFile << name;
663
664 for(int j=0;j<i;j++){
665
666 if (params->m->getControl_pressed()) { break; }
667
668 Sequence seqI = seq;
669 Sequence seqJ = params->alignDB->getSeq(j);
670 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
671
672 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
673 seqI.setAligned(alignment->getSeqAAln());
674 seqJ.setAligned(alignment->getSeqBAln());
675
676 double dist = distCalculator->calcDist(seqI, seqJ);
677
678 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
679 if(dist <= params->cutoff){ params->count++; }
680 outFile << '\t' << dist;
681 }
682
683 outFile << endl;
684
685 if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
686
687 }
688 params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
689
690 outFile.close();
691 delete alignment;
692 delete distCalculator;
693
694 return 1;
695 }
696 catch(exception& e) {
697 params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
698 exit(1);
699 }
700 }
701 /**************************************************************************************************/
driverSquare(pairwiseData * params)702 int driverSquare(pairwiseData* params){
703 try {
704
705 int startTime = time(NULL);
706
707 Alignment* alignment;
708 if(params->align == "gotoh") { alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase); }
709 else if(params->align == "needleman") { alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase); }
710 else if(params->align == "blast") { alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch); }
711 else if(params->align == "noalign") { alignment = new NoAlign(); }
712 else {
713 params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
714 alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
715 }
716
717 ValidCalculators validCalculator;
718 DistCalc* distCalculator;
719 double cutoff = 1.0;
720 if (params->countends) {
721 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
722 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
723 else if (params->distcalcType == "eachgap") { distCalculator = new eachGapDist(cutoff); }
724 else if (params->distcalcType == "onegap") { distCalculator = new oneGapDist(cutoff); }
725 }
726 }else {
727 if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
728 if (params->distcalcType == "nogaps") { distCalculator = new ignoreGaps(cutoff); }
729 else if (params->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(cutoff); }
730 else if (params->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(cutoff); }
731 }
732 }
733
734 //column file
735 ofstream outFile;
736 params->util.openOutputFile(params->outputFileName, outFile);
737 outFile.setf(ios::fixed, ios::showpoint);
738 outFile << setprecision(4);
739
740 long long numSeqs = params->alignDB->getNumSeqs();
741 if(params->start == 0){ outFile << numSeqs << endl; }
742
743 for(int i=params->start;i<params->end;i++){
744
745 Sequence seq = params->alignDB->getSeq(i);
746 if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
747
748 string name = seq.getName();
749 if (name.length() < 10) { while (name.length() < 10) { name += " "; } seq.setName(name); } //pad with spaces to make compatible
750 outFile << name;
751
752 for(int j=0;j<numSeqs;j++){
753
754 if (params->m->getControl_pressed()) { break; }
755
756 Sequence seqI = seq;
757 Sequence seqJ = params->alignDB->getSeq(j);
758 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
759
760 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
761 seqI.setAligned(alignment->getSeqAAln());
762 seqJ.setAligned(alignment->getSeqBAln());
763
764 double dist = distCalculator->calcDist(seqI, seqJ);
765
766 if(dist <= params->cutoff){ params->count++; }
767 outFile << '\t' << dist;
768
769 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
770 }
771
772 outFile << endl;
773
774 if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
775
776 }
777 params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
778
779 outFile.close();
780 delete alignment;
781 delete distCalculator;
782
783 return 1;
784 }
785 catch(exception& e) {
786 params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
787 exit(1);
788 }
789 }
790
791 /**************************************************************************************************/
createProcesses(string filename)792 void PairwiseSeqsCommand::createProcesses(string filename) {
793 try {
794 vector<linePair> lines;
795 vector<std::thread*> workerThreads;
796 vector<pairwiseData*> data;
797 long long numSeqs = alignDB->getNumSeqs();
798
799 long long numDists = 0;
800 if (output == "square") { numDists = numSeqs * numSeqs; }
801 else { for(int i=0;i<numSeqs;i++){ for(int j=0;j<i;j++){ numDists++; if (numDists > processors) { break; } } } }
802 if (numDists < processors) { processors = numDists; }
803
804 for (int i = 0; i < processors; i++) {
805 linePair tempLine;
806 lines.push_back(tempLine);
807 if (output != "square") {
808 lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
809 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
810 }else{
811 lines[i].start = int ((float(i)/float(processors)) * numSeqs);
812 lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
813 }
814 }
815
816 auto synchronizedOutputFile = std::make_shared<SynchronizedOutputFile>(filename);
817 synchronizedOutputFile->setFixedShowPoint(); synchronizedOutputFile->setPrecision(4);
818
819 SequenceDB oldFastaDB; vector< int > oldlengths; vector< vector< int > > oldkmerDB;
820 if (fitCalc) {
821 ifstream inFASTA; util.openInputFile(oldfastafile, inFASTA);
822 oldFastaDB = SequenceDB(inFASTA, kmerSize, oldkmerDB, oldlengths);
823 inFASTA.close();
824
825 lines.clear();
826 if (processors > oldFastaDB.getNumSeqs()) { processors = oldFastaDB.getNumSeqs(); }
827 int remainingSeqs = oldFastaDB.getNumSeqs();
828 int startIndex = 0;
829 for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
830 int numSeqsToFit = remainingSeqs; //case for last processor
831 if (remainingProcessors != 1) { numSeqsToFit = ceil(remainingSeqs / remainingProcessors); }
832 lines.push_back(linePair(startIndex, (startIndex+numSeqsToFit))); //startIndex, endIndex
833 startIndex = startIndex + numSeqsToFit;
834 remainingSeqs -= numSeqsToFit;
835 }
836 }
837
838 //Lauch worker threads
839 for (int i = 0; i < processors-1; i++) {
840 OutputWriter* threadWriter = NULL;
841 pairwiseData* dataBundle = NULL;
842 string extension = toString(i+1) + ".temp";
843 if (output == "column") {
844 threadWriter = new OutputWriter(synchronizedOutputFile);
845 dataBundle = new pairwiseData(threadWriter);
846 }else { dataBundle = new pairwiseData(filename+extension); }
847
848 dataBundle->setVariables(align, calc, countends, output, alignDB, oldFastaDB, lines[i+1].start, lines[i+1].end, match, misMatch, gapOpen, gapExtend, longestBase, cutoff, kmerCutoff, kmerSize, kmerDB, lengths, oldkmerDB, oldlengths);
849 data.push_back(dataBundle);
850
851 std::thread* thisThread = NULL;
852 if (output == "column") {
853 if (fitCalc) { thisThread = new std::thread(driverFitCalc, dataBundle); }
854 else { thisThread = new std::thread(driverColumn, dataBundle); }
855 }
856 else if (output == "lt") { thisThread = new std::thread(driverLt, dataBundle); }
857 else { thisThread = new std::thread(driverSquare, dataBundle); }
858 workerThreads.push_back(thisThread);
859 }
860
861 OutputWriter* threadWriter = NULL;
862 pairwiseData* dataBundle = NULL;
863 if (output == "column") {
864 threadWriter = new OutputWriter(synchronizedOutputFile);
865 dataBundle = new pairwiseData(threadWriter);
866 }else { dataBundle = new pairwiseData(filename); }
867
868 dataBundle->setVariables(align, calc, countends, output, alignDB, oldFastaDB, lines[0].start, lines[0].end, match, misMatch, gapOpen, gapExtend, longestBase, cutoff, kmerCutoff, kmerSize, kmerDB, lengths, oldkmerDB, oldlengths);
869
870 if (output == "column") {
871 if (fitCalc) { driverFitCalc(dataBundle); }
872 else { driverColumn(dataBundle); }
873 delete threadWriter;
874 }
875 else if (output == "lt") { driverLt(dataBundle); }
876 else { driverSquare(dataBundle); }
877 numDistsBelowCutoff = dataBundle->count;
878
879 for (int i = 0; i < processors-1; i++) {
880 workerThreads[i]->join();
881
882 numDistsBelowCutoff += data[i]->count;
883
884 if (output == "column") { delete data[i]->threadWriter; }
885 else {
886 string extension = toString(i+1) + ".temp";
887 util.appendFiles((filename+extension), filename);
888 util.mothurRemove(filename+extension);
889 }
890 delete data[i];
891 delete workerThreads[i];
892 }
893
894 delete dataBundle;
895 }
896 catch(exception& e) {
897 m->errorOut(e, "PairwiseSeqsCommand", "createProcesses");
898 exit(1);
899 }
900 }
901 /**************************************************************************************************/
902 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
903 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
sanityCheck()904 bool PairwiseSeqsCommand::sanityCheck() {
905 try{
906 bool good = true;
907
908 //read fasta file and save names as well as adding them to the alignDB
909 set<string> namesOldFasta;
910
911 ifstream inFasta; util.openInputFile(oldfastafile, inFasta);
912
913 while (!inFasta.eof()) {
914 if (m->getControl_pressed()) { inFasta.close(); return good; }
915
916 Sequence temp(inFasta); util.gobble(inFasta);
917
918 if (temp.getName() != "") {
919 namesOldFasta.insert(temp.getName()); //save name
920 if (!fitCalc) { alignDB->push_back(temp); }//add to DB
921 }
922 }
923 inFasta.close();
924
925 //read through the column file checking names and removing distances above the cutoff
926 ifstream inDist; util.openInputFile(column, inDist);
927
928 ofstream outDist;
929 string outputFile = column + ".temp";
930 util.openOutputFile(outputFile, outDist);
931
932 string name1, name2;
933 float dist;
934 while (!inDist.eof()) {
935 if (m->getControl_pressed()) { inDist.close(); outDist.close(); util.mothurRemove(outputFile); return good; }
936
937 inDist >> name1; util.gobble(inDist); inDist >> name2; util.gobble(inDist); inDist >> dist; util.gobble(inDist);
938
939 //both names are in fasta file and distance is below cutoff
940 if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) { good = false; break; }
941 else{ if (dist <= cutoff) { numDistsBelowCutoff++; outDist << name1 << '\t' << name2 << '\t' << dist << endl; } }
942 }
943
944 inDist.close(); outDist.close();
945
946 if (good) {
947 util.mothurRemove(column);
948 rename(outputFile.c_str(), column.c_str());
949 }else{
950 util.mothurRemove(outputFile); //temp file is bad because file mismatch above
951 }
952
953 return good;
954
955 }
956 catch(exception& e) {
957 m->errorOut(e, "PairwiseSeqsCommand", "sanityCheck");
958 exit(1);
959 }
960 }
961 /**************************************************************************************************/
962
963