1 /*
2  *  pairwiseseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 10/20/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "pairwiseseqscommand.h"
11 #include "kmerdist.hpp"
12 
13 //**********************************************************************************************************************
setParameters()14 vector<string> PairwiseSeqsCommand::setParameters(){
15 	try {
16         CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn","column",false,false); parameters.push_back(pcolumn);
17         CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn","",false,false); parameters.push_back(poldfasta);
18         CommandParameter pfitcalc("fitcalc", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pfitcalc);
19 		CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","phylip-column",false,true,true); parameters.push_back(pfasta);
20 		CommandParameter palign("align", "Multiple", "needleman-gotoh-blast-noalign", "needleman", "", "", "","",false,false); parameters.push_back(palign);
21 		CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch);
22 		CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch);
23 		CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen);
24 		CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend);
25         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
26 		CommandParameter poutput("output", "Multiple", "column-lt-square-phylip", "column", "", "", "","phylip-column",false,false,true); parameters.push_back(poutput);
27 		CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "","",false,false); parameters.push_back(pcalc);
28 		CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcountends);
29         CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcompress);
30 		CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "","",false,false,true); parameters.push_back(pcutoff);
31         CommandParameter pkcutoff("kmercutoff", "Number", "", "-1.0", "", "", "","",false,false,true);
32         parameters.push_back(pkcutoff);
33         CommandParameter pksize("ksize", "Number", "", "8", "", "", "","",false,false); parameters.push_back(pksize);
34         CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
35         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
36 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
37 
38         abort = false; calledHelp = false;
39 
40         vector<string> tempOutNames;
41         outputTypes["phylip"] = tempOutNames;
42         outputTypes["column"] = tempOutNames;
43 
44 		vector<string> myArray;
45 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
46 		return myArray;
47 	}
48 	catch(exception& e) {
49 		m->errorOut(e, "PairwiseSeqsCommand", "setParameters");
50 		exit(1);
51 	}
52 }
53 //**********************************************************************************************************************
getHelpString()54 string PairwiseSeqsCommand::getHelpString(){
55 	try {
56 		string helpString = "";
57 		helpString += "The pairwise.seqs command reads a fasta file and creates distance matrix.\n";
58 		helpString += "The pairwise.seqs command parameters are fasta, align, match, mismatch, gapopen, gapextend, calc, output, cutoff, oldfasta, column, processors.\n";
59 		helpString += "The fasta parameter is required.\n";
60 		helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n";
61 		helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
62 		helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
63 		helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
64 		helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
65 		helpString += "The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n";
66 		helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n";
67 		helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
68         helpString += "The kmercutoff parameter allows you to specify maximum kmer distance. The kmercutoff is used to reduce the processing time by avoiding the aligning and distance calculations for sequences with a kmer distance above the cutoff. Kmer distance are calculated using methods described here, Edgar, R. C. (2004). Muscle: a multiple sequence alignment method with reduced time and space complexity. BMC Bioinformatics, 5:113. The defaults vary based on the cutoff selected. Cutoff <= 0.05 -> kmerCutoff = -1.0, cutoff 0.05 - 0.15 -> kmerCutoff = -0.50, cutoff 0.15-0.25 -> kmerCutoff = -0.25, cutoff > 0.25 -> kmerCutoff = -0.10.\n";
69         helpString += "The ksize parameter allows you to specify the kmer size for calculating the kmer distance.  The default is 7.\n";
70 		helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
71         helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
72 		helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n";
73 		helpString += "The pairwise.seqs command should be in the following format: \n";
74 		helpString += "pairwise.seqs(fasta=yourfastaFile, align=yourAlignmentMethod) \n";
75 		helpString += "Example pairwise.seqs(fasta=candidate.fasta, align=blast)\n";
76 
77 		return helpString;
78 	}
79 	catch(exception& e) {
80 		m->errorOut(e, "PairwiseSeqsCommand", "getHelpString");
81 		exit(1);
82 	}
83 }
84 //**********************************************************************************************************************
getOutputPattern(string type)85 string PairwiseSeqsCommand::getOutputPattern(string type) {
86     try {
87         string pattern = "";
88 
89         if (type == "phylip") {  pattern = "[filename],[outputtag],dist"; }
90         else if (type == "column") { pattern = "[filename],dist"; }
91         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
92 
93         return pattern;
94     }
95     catch(exception& e) {
96         m->errorOut(e, "PairwiseSeqsCommand", "getOutputPattern");
97         exit(1);
98     }
99 }
100 //**********************************************************************************************************************
PairwiseSeqsCommand(string option)101 PairwiseSeqsCommand::PairwiseSeqsCommand(string option) : Command()  {
102 	try {
103 		if(option == "help") { help(); abort = true; calledHelp = true; }
104 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
105         else if(option == "category") {  abort = true; calledHelp = true;  }
106 
107 		else {
108 			OptionParser parser(option, setParameters());
109 			map<string, string> parameters = parser.getParameters();
110 
111 			ValidParameters validParameter;
112 
113 			fastaFileName = validParameter.validFile(parameters, "fasta");
114 			if (fastaFileName == "not found") {
115 				fastaFileName = current->getFastaFile();
116 				if (fastaFileName != "") {  m->mothurOut("Using " + fastaFileName + " as input file for the fasta parameter.\n");  }
117 				else { 	m->mothurOut("[ERROR]: You have no current fastafile and the fasta parameter is required.\n");  abort = true; }
118             }else if (fastaFileName == "not open") { abort = true; }
119             else{ current->setFastaFile(fastaFileName); }
120 
121             if (outputdir == "") {  outputdir += util.hasPath(fastaFileName); }
122 
123             oldfastafile = validParameter.validFile(parameters, "oldfasta");
124             if (oldfastafile == "not found") { oldfastafile = ""; }
125             else if (oldfastafile == "not open") { abort = true; }
126 
127             column = validParameter.validFile(parameters, "column");
128             if (column == "not found") { column = ""; }
129             else if (column == "not open") { abort = true; }
130             else { current->setColumnFile(column); }
131 
132 			//check for optional parameter and set defaults
133 			// ...at some point should added some additional type checking...
134 			string temp;
135 			temp = validParameter.valid(parameters, "match");		if (temp == "not found"){	temp = "1.0";			}
136 			util.mothurConvert(temp, match);
137 
138 			temp = validParameter.valid(parameters, "mismatch");		if (temp == "not found"){	temp = "-1.0";			}
139 			util.mothurConvert(temp, misMatch);
140             if (misMatch > 0) { m->mothurOut("[ERROR]: mismatch must be negative.\n"); abort=true; }
141 
142 			temp = validParameter.valid(parameters, "gapopen");		if (temp == "not found"){	temp = "-2.0";			}
143 			util.mothurConvert(temp, gapOpen);
144             if (gapOpen > 0) { m->mothurOut("[ERROR]: gapopen must be negative.\n"); abort=true; }
145 
146 			temp = validParameter.valid(parameters, "gapextend");	if (temp == "not found"){	temp = "-1.0";			}
147 			util.mothurConvert(temp, gapExtend);
148             if (gapExtend > 0) { m->mothurOut("[ERROR]: gapextend must be negative.\n"); abort=true; }
149 
150 			temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){	temp = current->getProcessors();	}
151 			processors = current->setProcessors(temp);
152 
153 			temp = validParameter.valid(parameters, "cutoff");		if(temp == "not found"){	temp = "1.0"; }
154 			util.mothurConvert(temp, cutoff);
155 
156             temp = validParameter.valid(parameters, "kmercutoff");
157             if(temp == "not found"){
158                 if (cutoff <= 0.05) { kmerCutoff = -1.0; }
159                 else if ((cutoff > 0.05) && (cutoff <= 0.15)) { kmerCutoff = -0.50; }
160                 else if ((cutoff > 0.15) && (cutoff <= 0.25)) { kmerCutoff = -0.25;  }
161                 else { kmerCutoff = -0.1; }
162 
163             }else { util.mothurConvert(temp, kmerCutoff); }
164 
165             temp = validParameter.valid(parameters, "ksize");        if (temp == "not found"){ temp = "7"; }
166             util.mothurConvert(temp, kmerSize);
167 
168 			temp = validParameter.valid(parameters, "countends");	if(temp == "not found"){	temp = "T";	}
169 			countends = util.isTrue(temp);
170 
171 			temp = validParameter.valid(parameters, "compress");		if(temp == "not found"){  temp = "F"; }
172 			compress = util.isTrue(temp);
173 
174 			align = validParameter.valid(parameters, "align");		if (align == "not found"){	align = "needleman";	}
175 
176             if (align == "blast") {
177                 string blastlocation = "";
178                 vector<string> locations = current->getLocations();
179                 string path = current->getProgramPath();
180                 bool foundTool = util.findBlastLocation(blastlocation, path, locations);
181 
182                 if (!foundTool){
183                     m->mothurOut("[WARNING]: Unable to locate blast executables, cannot use blast as align method. Using needleman instead.\n"); align = "needleman";
184                 }
185             }
186 
187             temp = validParameter.valid(parameters, "fitcalc");	if(temp == "not found"){	temp = "F";	}
188             fitCalc = util.isTrue(temp);
189 
190 			output = validParameter.valid(parameters, "output");		if(output == "not found"){	output = "column"; }
191             if (output=="phylip") { output = "lt"; }
192 			if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column.\n");  output = "column"; }
193 
194 			calc = validParameter.valid(parameters, "calc");
195 			if (calc == "not found") { calc = "onegap";  }
196 			else {
197 				 if (calc == "default")  {  calc = "onegap";  }
198 			}
199 
200             if ((calc != "nogaps") && (calc != "eachgap") && (calc != "onegap")) { m->mothurOut(calc + " is not a valid calculator for pairwise.seqs. Options are onegap, eachgap and nogaps. I will use onegap.\n");  calc = "onegap"; }
201 
202 		}
203 
204 	}
205 	catch(exception& e) {
206 		m->errorOut(e, "PairwiseSeqsCommand", "PairwiseSeqsCommand");
207 		exit(1);
208 	}
209 }
210 //**********************************************************************************************************************
PairwiseSeqsCommand(StorageDatabase * & storageDB,vector<vector<int>> kDB,vector<int> lths,string outputFileRoot,double cut,string outputformat,int proc)211 PairwiseSeqsCommand::PairwiseSeqsCommand(StorageDatabase*& storageDB, vector< vector< int > > kDB, vector< int > lths,  string outputFileRoot, double cut, string outputformat, int proc) {
212     try {
213         abort = false; calledHelp = false;
214         vector<string> tempOutNames;
215         outputTypes["phylip"] = tempOutNames;
216         outputTypes["column"] = tempOutNames;
217 
218         //defaults
219         calc = "onegap";
220         countends = true;
221         fitCalc = false;
222         cutoff = cut;
223         processors = proc;
224         compress = false;
225         output = outputformat;
226         match = 1.0;
227         misMatch = -1.0;
228         gapOpen = -2.0;
229         gapExtend = -1.0;
230         align = "needleman";
231         kmerSize = 7;
232         kmerDB = kDB;
233         lengths = lths;
234         if (cutoff <= 0.05) { kmerCutoff = -1.0; }
235         else if ((cutoff > 0.05) && (cutoff <= 0.15)) { kmerCutoff = -0.50; }
236         else if ((cutoff > 0.15) && (cutoff <= 0.25)) { kmerCutoff = -0.25;  }
237         else { kmerCutoff = -0.1; }
238 
239         longestBase = 2000; //will need to update this in driver if we find sequences with more bases.  hardcoded so we don't have the pre-read user fasta file.
240         numDistsBelowCutoff = 0;
241 
242         alignDB = storageDB;
243         long long numSeqs = alignDB->getNumSeqs();
244 
245         if (numSeqs < 2) {  m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n");  return; }
246 
247         string outputFile;
248         map<string, string> variables;
249         variables["[filename]"] = outputFileRoot;
250 
251         if (output == "lt") { //does the user want lower triangle phylip formatted file
252             variables["[outputtag]"] = "phylip";
253             outputFile = getOutputFileName("phylip", variables);
254             util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
255         }else if (output == "column") { //user wants column format
256             if (fitCalc) {  variables["[outputtag]"] = "fit";  }
257             outputFile = getOutputFileName("column", variables);
258             outputTypes["column"].push_back(outputFile);
259             util.mothurRemove(outputFile);
260         }
261 
262         time_t start, end; time(&start);
263 
264         m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
265 
266         createProcesses(outputFile);
267 
268         time(&end);
269         m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(numSeqs) + " sequences. " + toString(numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
270 
271         m->mothurOut("\nOutput File Names:\n"); m->mothurOut(outputFile+"\n\n");
272 
273     }
274     catch(exception& e) {
275         m->errorOut(e, "DistanceCommand", "DistanceCommand");
276         exit(1);
277     }
278 }
279 //**********************************************************************************************************************
280 
execute()281 int PairwiseSeqsCommand::execute(){
282 	try {
283 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
284 
285         time_t start, end; time(&start);
286 
287 		longestBase = 2000; //will need to update this in driver if we find sequences with more bases.  hardcoded so we don't have the pre-read user fasta file.
288         numDistsBelowCutoff = 0;
289 
290         ifstream inFASTA; util.openInputFile(fastaFileName, inFASTA);
291         alignDB = new SequenceDB(inFASTA, kmerSize, kmerDB, lengths);
292         inFASTA.close();
293 
294         //sanity check the oldfasta and column file as well as add oldfasta sequences to alignDB
295         if ((oldfastafile != "") && (column != ""))  {	if (!(sanityCheck())) { return 0; }  }
296 
297         if (m->getControl_pressed()) { delete alignDB; return 0; }
298 
299         long long numSeqs = alignDB->getNumSeqs();
300 
301         string outputFile = "";
302 
303         map<string, string> variables;
304         variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastaFileName));
305         if ((oldfastafile != "") && (column != ""))  {  variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(oldfastafile));  }
306 
307         if (output == "lt") { //does the user want lower triangle phylip formatted file
308             variables["[outputtag]"] = "phylip";
309             outputFile = getOutputFileName("phylip", variables);
310             util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
311         }else if (output == "column") { //user wants column format
312             if (fitCalc) {  variables["[outputtag]"] = "fit";  }
313             outputFile = getOutputFileName("column", variables);
314             outputTypes["column"].push_back(outputFile);
315             util.mothurRemove(outputFile);
316         }else { //assume square
317             variables["[outputtag]"] = "square";
318             outputFile = getOutputFileName("phylip", variables);
319             util.mothurRemove(outputFile);
320             outputTypes["phylip"].push_back(outputFile);
321         }
322 
323         m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
324 
325         createProcesses(outputFile);
326 
327         delete alignDB;
328 
329         if (m->getControl_pressed()) { outputTypes.clear();   util.mothurRemove(outputFile); return 0; }
330 
331         if(util.isBlank(outputFile)) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.\n"); }
332 
333         //append the old column file to the new one
334         if ((oldfastafile != "") && (column != ""))  {
335             //we had to rename the column file so we didnt overwrite above, but we want to keep old name
336             if (outputFile == column) {
337                 string tempcolumn = column + ".old";
338                 util.appendFiles(tempcolumn, outputFile);
339                 util.mothurRemove(tempcolumn);
340             }else{
341                 util.appendFiles(outputFile, column);
342                 util.mothurRemove(outputFile);
343                 outputFile = column;
344             }
345             outputTypes["column"].clear(); outputTypes["column"].push_back(outputFile);
346         }
347 
348         if (compress) {
349             m->mothurOut("Compressing...\n");
350             m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)\n");
351             system(("gzip -v " + outputFile).c_str());
352             outputNames.push_back(outputFile + ".gz");
353         }else { outputNames.push_back(outputFile); }
354 
355         time(&end);
356         m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(numSeqs) + " sequences. " + toString(numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
357 
358         if (m->getControl_pressed()) { outputTypes.clear(); util.mothurRemove(outputFile); return 0; }
359 
360 		//set phylip file as new current phylipfile
361 		string currentName = "";
362 		itTypes = outputTypes.find("phylip");
363 		if (itTypes != outputTypes.end()) {
364 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
365 		}
366 
367 		//set column file as new current columnfile
368 		itTypes = outputTypes.find("column");
369 		if (itTypes != outputTypes.end()) {
370 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
371 		}
372 
373 		m->mothurOut("\nOutput File Names: \n");
374 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
375 
376 
377 		return 0;
378 	}
379 	catch(exception& e) {
380 		m->errorOut(e, "PairwiseSeqsCommand", "execute");
381 		exit(1);
382 	}
383 }
384 
385 /**************************************************************************************************/
386 struct pairwiseData {
387     string align, distcalcType, outputFileName;
388     unsigned long long start, end;
389     long long count;
390     float match, misMatch, gapOpen, gapExtend, cutoff, kmerCutoff;
391     int longestBase, kmerSize;
392     bool countends;
393 
394     vector< vector< int > > kmerDB; //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
395     vector< vector< int > > oldkmerDB; //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
396     vector< int > lengths;
397     vector< int > oldlengths;
398 
399     StorageDatabase* alignDB;
400     SequenceDB oldFastaDB;
401     OutputWriter* threadWriter;
402     MothurOut* m;
403     Utils util;
404 
pairwiseDatapairwiseData405     pairwiseData(){}
pairwiseDatapairwiseData406     pairwiseData(OutputWriter* ofn) {
407         threadWriter = ofn;
408         m = MothurOut::getInstance();
409     }
410 
pairwiseDatapairwiseData411     pairwiseData(string ofn) {
412         outputFileName = ofn;
413         m = MothurOut::getInstance();
414     }
415 
setVariablespairwiseData416     void setVariables(string al, string di, bool co, string op, StorageDatabase* DB, SequenceDB oldDB,  unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, float cu, float kcut, int ksz, vector< vector< int > > kdb, vector< int > le, vector< vector< int > > okdb, vector< int > ole) {
417         align = al;
418         distcalcType = di;
419         countends = co;
420         alignDB = DB;
421         oldFastaDB = oldDB;
422         cutoff = cu;
423         start = st;
424         end = en;
425         match = ma;
426         misMatch = misMa;
427         gapOpen = gapO;
428         gapExtend = gapE;
429         longestBase = thr;
430         kmerDB = kdb;
431         oldkmerDB = okdb;
432         lengths = le;
433         oldlengths = ole;
434         kmerSize = ksz;
435         kmerCutoff = kcut;
436         count = 0;
437     }
438 };
439 /***********************************************************************/
getUniqueKmers(vector<int> seqsKmers,int i)440 vector<kmerCount> getUniqueKmers(vector<int> seqsKmers, int i){
441         vector<kmerCount> uniques;
442 
443         for (int k = 0; k < seqsKmers.size(); k++) {
444             if (seqsKmers[k] != 0) {
445                 kmerCount thisKmer(k, seqsKmers[k]);
446                 uniques.push_back(thisKmer);
447             }
448         }
449 
450         return uniques;
451 }
452 
453 /**************************************************************************************************/
454 //the higher the kmercutoff the higher the aligned dist. As kmercutoff approaches 0, aligned dist aproaches 1.
driverColumn(pairwiseData * params)455 int driverColumn(pairwiseData* params){
456     try {
457         int startTime = time(NULL);
458 
459         Alignment* alignment;
460         if(params->align == "gotoh")			{	alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase);			}
461         else if(params->align == "needleman")	{	alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);				}
462         else if(params->align == "blast")		{	alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch);		}
463         else if(params->align == "noalign")		{	alignment = new NoAlign();													}
464         else {
465             params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
466             alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
467         }
468 
469         ValidCalculators validCalculator;
470         DistCalc* distCalculator;
471         if (params->countends) {
472             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
473                 if (params->distcalcType == "nogaps")			{	distCalculator = new ignoreGaps(params->cutoff);	}
474                 else if (params->distcalcType == "eachgap")	{	distCalculator = new eachGapDist(params->cutoff);	}
475                 else if (params->distcalcType == "onegap")		{	distCalculator = new oneGapDist(params->cutoff);	}
476                 //else if (params->distcalcType == "onegap")        {    distCalculator = new oneGapDist(1.0);    }
477             }
478         }else {
479             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
480                 if (params->distcalcType == "nogaps")		{	distCalculator = new ignoreGaps(params->cutoff);					}
481                 else if (params->distcalcType == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(params->cutoff);	}
482                 else if (params->distcalcType == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(params->cutoff);		}
483             }
484         }
485 
486         KmerDist kmerDistCalculator(params->kmerSize);
487         double kmerCutoff = params->kmerCutoff;
488 
489         for(int i=params->start;i<params->end;i++){
490 
491             Sequence seq = params->alignDB->getSeq(i);
492             vector<kmerCount> seqA = getUniqueKmers(params->kmerDB[i], i);
493 
494             for(int j=0;j<i;j++){
495 
496                 if (params->m->getControl_pressed()) {  break;  }
497 
498                 vector<int> seqB = params->kmerDB[j];
499 
500                 int length = min(params->lengths[i], params->lengths[j]);
501 
502                 vector<double> kmerDist = kmerDistCalculator.calcDist(seqA, seqB, length);
503 
504                 if (kmerDist[0] <= kmerCutoff) {
505                     Sequence seqI = seq;
506                     Sequence seqJ = params->alignDB->getSeq(j);
507 
508                     if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
509                     if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
510 
511                     alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
512 
513                     seqI.setAligned(alignment->getSeqAAln());
514                     seqJ.setAligned(alignment->getSeqBAln());
515 
516                     double dist = distCalculator->calcDist(seqI, seqJ);
517 
518                     if(dist <= params->cutoff){ params->count++; params->threadWriter->write(seqI.getName() + ' ' + seqJ.getName() + ' ' + toString(dist) + "\n"); }
519                 }
520             }
521             if((i+1) % 100 == 0){ params->m->mothurOutJustToScreen(toString(i+1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
522         }
523         params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
524 
525         delete alignment; delete distCalculator;
526 
527         return 0;
528     }
529     catch(exception& e) {
530         params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
531         exit(1);
532     }
533 }
534 /**************************************************************************************************/
driverFitCalc(pairwiseData * params)535 int driverFitCalc(pairwiseData* params){
536     try {
537         int startTime = time(NULL);
538 
539         Alignment* alignment;
540         if(params->align == "gotoh")			{	alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase);			}
541         else if(params->align == "needleman")	{	alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);				}
542         else if(params->align == "blast")		{	alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch);		}
543         else if(params->align == "noalign")		{	alignment = new NoAlign();													}
544         else {
545             params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
546             alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
547         }
548 
549         ValidCalculators validCalculator;
550         DistCalc* distCalculator;
551         if (params->countends) {
552             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
553                 if (params->distcalcType == "nogaps")			{	distCalculator = new ignoreGaps(params->cutoff);	}
554                 else if (params->distcalcType == "eachgap")	{	distCalculator = new eachGapDist(params->cutoff);	}
555                 else if (params->distcalcType == "onegap")		{	distCalculator = new oneGapDist(params->cutoff);	}
556             }
557         }else {
558             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
559                 if (params->distcalcType == "nogaps")		{	distCalculator = new ignoreGaps(params->cutoff);					}
560                 else if (params->distcalcType == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(params->cutoff);	}
561                 else if (params->distcalcType == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(params->cutoff);		}
562             }
563         }
564 
565         KmerDist kmerDistCalculator(params->kmerSize);
566         double kmerCutoff = params->kmerCutoff;
567 
568         for(int i=params->start;i<params->end;i++){ //for each oldDB fasta seq calc the distance to every new seq in alignDB
569 
570             Sequence seq = params->oldFastaDB.getSeq(i);
571             vector<kmerCount> seqA = getUniqueKmers(params->oldkmerDB[i], i);
572 
573             for(int j = 0; j < params->alignDB->getNumSeqs(); j++){
574 
575                 if (params->m->getControl_pressed()) {  break;  }
576 
577                 vector<int> seqB = params->kmerDB[j];
578 
579                 int length = min(params->oldlengths[i], params->lengths[j]);
580 
581                 vector<double> kmerDist = kmerDistCalculator.calcDist(seqA, seqB, length);
582 
583                 if (kmerDist[0] <= kmerCutoff) {
584                     Sequence seqI = seq;
585                     Sequence seqJ = params->alignDB->getSeq(j);
586 
587                     if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
588                     if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
589 
590                     alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
591 
592                     seqI.setAligned(alignment->getSeqAAln());
593                     seqJ.setAligned(alignment->getSeqBAln());
594 
595                     double dist = distCalculator->calcDist(seqI, seqJ);
596 
597                     if(dist <= params->cutoff){ params->count++; params->threadWriter->write(seqI.getName() + ' ' + seqJ.getName() + ' ' + toString(dist) + "\n"); }
598                 }
599             }
600 
601             if((i+1) % 100 == 0){ params->m->mothurOutJustToScreen(toString(i+1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
602         }
603         params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
604 
605         delete alignment; delete distCalculator;
606 
607         return 0;
608     }
609     catch(exception& e) {
610         params->m->errorOut(e, "PairwiseSeqsCommand", "driverFitCalc");
611         exit(1);
612     }
613 }
614 /**************************************************************************************************/
driverLt(pairwiseData * params)615 int driverLt(pairwiseData* params){
616     try {
617 
618         int startTime = time(NULL);
619 
620         Alignment* alignment;
621         if(params->align == "gotoh")			{	alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase);			}
622         else if(params->align == "needleman")	{	alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);				}
623         else if(params->align == "blast")		{	alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch);		}
624         else if(params->align == "noalign")		{	alignment = new NoAlign();													}
625         else {
626             params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
627             alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
628         }
629 
630         ValidCalculators validCalculator;
631         DistCalc* distCalculator;
632         double cutoff = 1.0;
633         if (params->countends) {
634             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
635                 if (params->distcalcType == "nogaps")			{	distCalculator = new ignoreGaps(cutoff);	}
636                 else if (params->distcalcType == "eachgap")	{	distCalculator = new eachGapDist(cutoff);	}
637                 else if (params->distcalcType == "onegap")		{	distCalculator = new oneGapDist(cutoff);	}
638             }
639         }else {
640             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
641                 if (params->distcalcType == "nogaps")		{	distCalculator = new ignoreGaps(cutoff);					}
642                 else if (params->distcalcType == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(cutoff);	}
643                 else if (params->distcalcType == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(cutoff);		}
644             }
645         }
646 
647         //column file
648         ofstream outFile;
649         params->util.openOutputFile(params->outputFileName, outFile);
650         outFile.setf(ios::fixed, ios::showpoint);
651         outFile << setprecision(4);
652 
653         if(params->start == 0){	outFile << params->alignDB->getNumSeqs() << endl;	}
654 
655         for(int i=params->start;i<params->end;i++){
656 
657             Sequence seq = params->alignDB->getSeq(i);
658             if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
659 
660             string name = seq.getName();
661             if (name.length() < 10) {  while (name.length() < 10) {  name += " ";  } seq.setName(name); } //pad with spaces to make compatible
662             outFile << name;
663 
664             for(int j=0;j<i;j++){
665 
666                 if (params->m->getControl_pressed()) { break;  }
667 
668                 Sequence seqI = seq;
669                 Sequence seqJ = params->alignDB->getSeq(j);
670                 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
671 
672                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
673                 seqI.setAligned(alignment->getSeqAAln());
674                 seqJ.setAligned(alignment->getSeqBAln());
675 
676                 double dist = distCalculator->calcDist(seqI, seqJ);
677 
678                 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
679                 if(dist <= params->cutoff){ params->count++; }
680                 outFile << '\t' << dist;
681             }
682 
683             outFile << endl;
684 
685             if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n"); }
686 
687         }
688         params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
689 
690         outFile.close();
691         delete alignment;
692         delete distCalculator;
693 
694         return 1;
695     }
696     catch(exception& e) {
697         params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
698         exit(1);
699     }
700 }
701 /**************************************************************************************************/
driverSquare(pairwiseData * params)702 int driverSquare(pairwiseData* params){
703     try {
704 
705         int startTime = time(NULL);
706 
707         Alignment* alignment;
708         if(params->align == "gotoh")			{	alignment = new GotohOverlap(params->gapOpen, params->gapExtend, params->match, params->misMatch, params->longestBase);			}
709         else if(params->align == "needleman")	{	alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);				}
710         else if(params->align == "blast")		{	alignment = new BlastAlignment(params->gapOpen, params->gapExtend, params->match, params->misMatch);		}
711         else if(params->align == "noalign")		{	alignment = new NoAlign();													}
712         else {
713             params->m->mothurOut(params->align + " is not a valid alignment option. I will run the command using needleman.\n");
714             alignment = new NeedlemanOverlap(params->gapOpen, params->match, params->misMatch, params->longestBase);
715         }
716 
717         ValidCalculators validCalculator;
718         DistCalc* distCalculator;
719         double cutoff = 1.0;
720         if (params->countends) {
721             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
722                 if (params->distcalcType == "nogaps")			{	distCalculator = new ignoreGaps(cutoff);	}
723                 else if (params->distcalcType == "eachgap")	{	distCalculator = new eachGapDist(cutoff);	}
724                 else if (params->distcalcType == "onegap")		{	distCalculator = new oneGapDist(cutoff);	}
725             }
726         }else {
727             if (validCalculator.isValidCalculator("distance", params->distcalcType) ) {
728                 if (params->distcalcType == "nogaps")		{	distCalculator = new ignoreGaps(cutoff);					}
729                 else if (params->distcalcType == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(cutoff);	}
730                 else if (params->distcalcType == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(cutoff);		}
731             }
732         }
733 
734         //column file
735         ofstream outFile;
736         params->util.openOutputFile(params->outputFileName, outFile);
737         outFile.setf(ios::fixed, ios::showpoint);
738         outFile << setprecision(4);
739 
740         long long numSeqs = params->alignDB->getNumSeqs();
741         if(params->start == 0){	outFile << numSeqs << endl;	}
742 
743         for(int i=params->start;i<params->end;i++){
744 
745             Sequence seq = params->alignDB->getSeq(i);
746             if (seq.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seq.getUnaligned().length()+1); }
747 
748             string name = seq.getName();
749             if (name.length() < 10) {  while (name.length() < 10) {  name += " ";  } seq.setName(name); } //pad with spaces to make compatible
750             outFile << name;
751 
752             for(int j=0;j<numSeqs;j++){
753 
754                 if (params->m->getControl_pressed()) { break;  }
755 
756                 Sequence seqI = seq;
757                 Sequence seqJ = params->alignDB->getSeq(j);
758                 if (seqJ.getUnaligned().length() > alignment->getnRows()) { alignment->resize(seqJ.getUnaligned().length()+1); }
759 
760                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
761                 seqI.setAligned(alignment->getSeqAAln());
762                 seqJ.setAligned(alignment->getSeqBAln());
763 
764                 double dist = distCalculator->calcDist(seqI, seqJ);
765 
766                 if(dist <= params->cutoff){ params->count++; }
767                 outFile << '\t' << dist;
768 
769                 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
770             }
771 
772             outFile << endl;
773 
774             if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");  }
775 
776         }
777         params->m->mothurOutJustToScreen(toString(params->end-1) + "\t" + toString(time(NULL) - startTime)+ "\t" + toString(params->count) +"\n");
778 
779         outFile.close();
780         delete alignment;
781         delete distCalculator;
782 
783         return 1;
784     }
785     catch(exception& e) {
786         params->m->errorOut(e, "PairwiseSeqsCommand", "driver");
787         exit(1);
788     }
789 }
790 
791 /**************************************************************************************************/
createProcesses(string filename)792 void PairwiseSeqsCommand::createProcesses(string filename) {
793 	try {
794         vector<linePair> lines;
795         vector<std::thread*> workerThreads;
796         vector<pairwiseData*> data;
797         long long numSeqs = alignDB->getNumSeqs();
798 
799         long long numDists = 0;
800         if (output == "square") { numDists = numSeqs * numSeqs; }
801         else { for(int i=0;i<numSeqs;i++){ for(int j=0;j<i;j++){ numDists++; if (numDists > processors) { break; } } } }
802         if (numDists < processors) { processors = numDists; }
803 
804         for (int i = 0; i < processors; i++) {
805             linePair tempLine;
806             lines.push_back(tempLine);
807             if (output != "square") {
808                 lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
809                 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
810             }else{
811                 lines[i].start = int ((float(i)/float(processors)) * numSeqs);
812                 lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
813             }
814         }
815 
816         auto synchronizedOutputFile = std::make_shared<SynchronizedOutputFile>(filename);
817         synchronizedOutputFile->setFixedShowPoint(); synchronizedOutputFile->setPrecision(4);
818 
819         SequenceDB oldFastaDB; vector< int > oldlengths; vector< vector< int > > oldkmerDB;
820         if (fitCalc) {
821             ifstream inFASTA; util.openInputFile(oldfastafile, inFASTA);
822             oldFastaDB = SequenceDB(inFASTA, kmerSize, oldkmerDB, oldlengths);
823             inFASTA.close();
824 
825             lines.clear();
826             if (processors > oldFastaDB.getNumSeqs()) { processors = oldFastaDB.getNumSeqs(); }
827             int remainingSeqs = oldFastaDB.getNumSeqs();
828             int startIndex = 0;
829             for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
830                 int numSeqsToFit = remainingSeqs; //case for last processor
831                 if (remainingProcessors != 1) { numSeqsToFit = ceil(remainingSeqs / remainingProcessors); }
832                 lines.push_back(linePair(startIndex, (startIndex+numSeqsToFit))); //startIndex, endIndex
833                 startIndex = startIndex + numSeqsToFit;
834                 remainingSeqs -= numSeqsToFit;
835             }
836         }
837 
838         //Lauch worker threads
839         for (int i = 0; i < processors-1; i++) {
840             OutputWriter* threadWriter = NULL;
841             pairwiseData* dataBundle = NULL;
842             string extension = toString(i+1) + ".temp";
843             if (output == "column") {
844                 threadWriter = new OutputWriter(synchronizedOutputFile);
845                 dataBundle = new pairwiseData(threadWriter);
846             }else { dataBundle = new pairwiseData(filename+extension); }
847 
848             dataBundle->setVariables(align, calc, countends, output, alignDB, oldFastaDB, lines[i+1].start, lines[i+1].end, match, misMatch, gapOpen, gapExtend, longestBase, cutoff, kmerCutoff, kmerSize, kmerDB, lengths, oldkmerDB, oldlengths);
849             data.push_back(dataBundle);
850 
851             std::thread* thisThread = NULL;
852             if (output == "column")     {
853                 if (fitCalc)    { thisThread = new std::thread(driverFitCalc, dataBundle);   }
854                 else            {  thisThread = new std::thread(driverColumn, dataBundle);   }
855             }
856             else if (output == "lt")    { thisThread = new std::thread(driverLt, dataBundle);            }
857             else                        { thisThread = new std::thread(driverSquare, dataBundle);        }
858             workerThreads.push_back(thisThread);
859         }
860 
861         OutputWriter* threadWriter = NULL;
862         pairwiseData* dataBundle = NULL;
863         if (output == "column") {
864             threadWriter = new OutputWriter(synchronizedOutputFile);
865             dataBundle = new pairwiseData(threadWriter);
866         }else { dataBundle = new pairwiseData(filename); }
867 
868         dataBundle->setVariables(align, calc, countends, output, alignDB, oldFastaDB, lines[0].start, lines[0].end, match, misMatch, gapOpen, gapExtend, longestBase, cutoff, kmerCutoff, kmerSize, kmerDB, lengths, oldkmerDB, oldlengths);
869 
870         if (output == "column")     {
871             if (fitCalc)    { driverFitCalc(dataBundle);    }
872             else            { driverColumn(dataBundle);     }
873             delete threadWriter;
874         }
875         else if (output == "lt")    { driverLt(dataBundle);            }
876         else                        { driverSquare(dataBundle);        }
877         numDistsBelowCutoff = dataBundle->count;
878 
879         for (int i = 0; i < processors-1; i++) {
880             workerThreads[i]->join();
881 
882             numDistsBelowCutoff += data[i]->count;
883 
884             if (output == "column") {  delete data[i]->threadWriter; }
885             else {
886                 string extension = toString(i+1) + ".temp";
887                 util.appendFiles((filename+extension), filename);
888                 util.mothurRemove(filename+extension);
889             }
890             delete data[i];
891             delete workerThreads[i];
892         }
893 
894         delete dataBundle;
895 	}
896 	catch(exception& e) {
897 		m->errorOut(e, "PairwiseSeqsCommand", "createProcesses");
898 		exit(1);
899 	}
900 }
901 /**************************************************************************************************/
902 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
903 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
sanityCheck()904 bool PairwiseSeqsCommand::sanityCheck() {
905     try{
906         bool good = true;
907 
908         //read fasta file and save names as well as adding them to the alignDB
909         set<string> namesOldFasta;
910 
911         ifstream inFasta; util.openInputFile(oldfastafile, inFasta);
912 
913         while (!inFasta.eof()) {
914             if (m->getControl_pressed()) {  inFasta.close(); return good;  }
915 
916             Sequence temp(inFasta);  util.gobble(inFasta);
917 
918             if (temp.getName() != "") {
919                 namesOldFasta.insert(temp.getName());  //save name
920                 if (!fitCalc) { alignDB->push_back(temp);  }//add to DB
921             }
922         }
923         inFasta.close();
924 
925         //read through the column file checking names and removing distances above the cutoff
926         ifstream inDist; util.openInputFile(column, inDist);
927 
928         ofstream outDist;
929         string outputFile = column + ".temp";
930         util.openOutputFile(outputFile, outDist);
931 
932         string name1, name2;
933         float dist;
934         while (!inDist.eof()) {
935             if (m->getControl_pressed()) {  inDist.close(); outDist.close(); util.mothurRemove(outputFile); return good;  }
936 
937             inDist >> name1; util.gobble(inDist); inDist >> name2; util.gobble(inDist); inDist >> dist; util.gobble(inDist);
938 
939             //both names are in fasta file and distance is below cutoff
940             if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
941             else{ if (dist <= cutoff) { numDistsBelowCutoff++; outDist << name1 << '\t' << name2 << '\t' << dist << endl; } }
942         }
943 
944         inDist.close(); outDist.close();
945 
946         if (good) {
947             util.mothurRemove(column);
948             rename(outputFile.c_str(), column.c_str());
949         }else{
950             util.mothurRemove(outputFile); //temp file is bad because file mismatch above
951         }
952 
953         return good;
954 
955     }
956     catch(exception& e) {
957         m->errorOut(e, "PairwiseSeqsCommand", "sanityCheck");
958         exit(1);
959     }
960 }
961 /**************************************************************************************************/
962 
963