1 /*
2  *  distancecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 5/7/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9 
10 #include "distancecommand.h"
11 
12 //**********************************************************************************************************************
setParameters()13 vector<string> DistanceCommand::setParameters(){
14 	try {
15 		CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn","column",false,false); parameters.push_back(pcolumn);
16 		CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn","",false,false); parameters.push_back(poldfasta);
17 		CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","phylip-column",false,true, true); parameters.push_back(pfasta);
18 		CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "","phylip-column",false,false, true); parameters.push_back(poutput);
19 		CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap-jtt-pmb-pam-kimura", "onegap", "", "", "","",false,false); parameters.push_back(pcalc);
20 		CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcountends);
21         CommandParameter pfitcalc("fitcalc", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pfitcalc);
22 		CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcompress);
23 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false, true); parameters.push_back(pprocessors);
24 		CommandParameter pcutoff("cutoff", "Number", "", "1.0", "", "", "","",false,false, true); parameters.push_back(pcutoff);
25 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
26         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
27 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
28 
29         abort = false; calledHelp = false;
30 
31         vector<string> tempOutNames;
32         outputTypes["phylip"] = tempOutNames;
33         outputTypes["column"] = tempOutNames;
34 
35 		vector<string> myArray;
36 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
37 		return myArray;
38 	}
39 	catch(exception& e) {
40 		m->errorOut(e, "DistanceCommand", "setParameters");
41 		exit(1);
42 	}
43 }
44 //**********************************************************************************************************************
getHelpString()45 string DistanceCommand::getHelpString(){
46 	try {
47 		string helpString = "";
48 		helpString += "The dist.seqs command reads a file containing sequences and creates a distance file.\n";
49 		helpString += "The dist.seqs command parameters are fasta, oldfasta, column, calc, countends, output, compress, cutoff and processors.  \n";
50 		helpString += "The fasta parameter is required, unless you have a valid current fasta file.\n";
51 		helpString += "The oldfasta and column parameters allow you to append the distances calculated to the column file.\n";
52 		helpString += "The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap for dna/rna sequences. If using protein sequences, your calc options are jtt, pmb, pam and kimura. The default is onegap.\n";
53 		helpString += "The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n";
54 		helpString += "The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n";
55 		helpString += "The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n";
56 		helpString += "The processors parameter allows you to specify number of processors to use.  The default is 1.\n";
57 		helpString += "The compress parameter allows you to indicate that you want the resulting distance file compressed.  The default is false.\n";
58 		helpString += "The dist.seqs command should be in the following format: \n";
59 		helpString += "dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n";
60 		helpString += "Example dist.seqs(fasta=amazon.fasta, calc=eachgap, countends=F, cutoff= 2.0, processors=3).\n";
61 		return helpString;
62 	}
63 	catch(exception& e) {
64 		m->errorOut(e, "DistanceCommand", "getHelpString");
65 		exit(1);
66 	}
67 }
68 //**********************************************************************************************************************
getOutputPattern(string type)69 string DistanceCommand::getOutputPattern(string type) {
70     try {
71         string pattern = "";
72 
73         if (type == "phylip") {  pattern = "[filename],[outputtag],dist"; }
74         else if (type == "column") { pattern = "[filename],dist-[filename],[outputtag],dist"; }
75         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
76 
77         return pattern;
78     }
79     catch(exception& e) {
80         m->errorOut(e, "DistanceCommand", "getOutputPattern");
81         exit(1);
82     }
83 }
84 //**********************************************************************************************************************
DistanceCommand(string option)85 DistanceCommand::DistanceCommand(string option) : Command() {
86 	try {
87 		//allow user to run help
88 		if(option == "help") { help(); abort = true; calledHelp = true; }
89 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
90         else if(option == "category") {  abort = true; calledHelp = true;  }
91 
92 		else {
93 			OptionParser parser(option, setParameters());
94 			map<string, string> parameters = parser.getParameters();
95 
96 			ValidParameters validParameter;
97 			fastafile = validParameter.validFile(parameters, "fasta");
98 			if (fastafile == "not found") {
99 				fastafile = current->getFastaFile();
100                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n");  }
101                 else { 	m->mothurOut("You have no current fastafile and the fasta parameter is required.\n"); abort = true; }
102 			}else if (fastafile == "not open") { abort = true; }
103 			else{ current->setFastaFile(fastafile); }
104 
105 			oldfastafile = validParameter.validFile(parameters, "oldfasta");
106 			if (oldfastafile == "not found") { oldfastafile = ""; }
107 			else if (oldfastafile == "not open") { abort = true; }
108 
109 			column = validParameter.validFile(parameters, "column");
110 			if (column == "not found") { column = ""; }
111 			else if (column == "not open") { abort = true; }
112 			else { current->setColumnFile(column); }
113 
114             if (outputdir == ""){ outputdir += util.hasPath(fastafile);  }
115 
116 			//check for optional parameter and set defaults
117 			// ...at some point should added some additional type checking...
118 			calc = validParameter.valid(parameters, "calc");
119 			if (calc == "not found") { calc = "onegap";  }
120 			else {
121 				 if (calc == "default")  {  calc = "onegap";  }
122             }
123 
124 			string temp;
125 			temp = validParameter.valid(parameters, "countends");	if(temp == "not found"){	temp = "T";	}
126             countends = util.isTrue(temp);
127 
128             temp = validParameter.valid(parameters, "fitcalc");	if(temp == "not found"){	temp = "F";	}
129             fitCalc = util.isTrue(temp);
130 
131 			temp = validParameter.valid(parameters, "cutoff");		if(temp == "not found"){	temp = "1.0"; }
132 			util.mothurConvert(temp, cutoff);
133 
134 			temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){	temp = current->getProcessors();	}
135 			processors = current->setProcessors(temp);
136 
137 			temp = validParameter.valid(parameters, "compress");		if(temp == "not found"){  temp = "F"; }
138             compress = util.isTrue(temp);
139 
140 			output = validParameter.valid(parameters, "output");		if(output == "not found"){	output = "column"; }
141             if (output == "phylip") { output = "lt";  }
142 
143 			if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both.\n");  abort=true; }
144 
145 			if ((column != "") && (oldfastafile != "") && (output != "column")) { m->mothurOut("You have provided column and oldfasta, indicating you want to append distances to your column file. Your output must be in column format to do so.\n"); abort=true; }
146 
147 			if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column.\n");  output = "column"; }
148 
149             if ((calc != "onegap") && (calc != "eachgap") && (calc != "nogaps") && (calc != "jtt") && (calc != "pmb") && (calc != "pam") && (calc != "kimura")) { m->mothurOut(calc + " is not a valid calc. Options are eachgap, onegap, nogaps, jtt, pmb, pam and kimura. I'll use onegap.\n");  calc = "onegap";  }
150 
151             prot = false; //not using protein sequences
152             if ((calc == "jtt") || (calc == "pmb") || (calc == "pam") || (calc == "kimura")) { prot = true; }
153 
154 		}
155 	}
156 	catch(exception& e) {
157 		m->errorOut(e, "DistanceCommand", "DistanceCommand");
158 		exit(1);
159 	}
160 }
161 //**********************************************************************************************************************
DistanceCommand(StorageDatabase * & storageDB,string outputFileRoot,double cut,string outputformat,int proc)162 DistanceCommand::DistanceCommand(StorageDatabase*& storageDB, string outputFileRoot, double cut, string outputformat, int proc) : Command() {
163     try {
164         abort = false; calledHelp = false;
165         vector<string> tempOutNames;
166         outputTypes["phylip"] = tempOutNames;
167         outputTypes["column"] = tempOutNames;
168 
169         calc = "onegap";
170         countends = true;
171         fitCalc = false;
172         cutoff = cut;
173         processors = proc;
174         compress = false;
175         output = outputformat;
176         prot = false; //not using protein sequences
177 
178         numDistsBelowCutoff = 0;
179         db = storageDB;
180         numNewFasta = db->getNumSeqs();
181         numSeqs = db->getNumSeqs();
182 
183         if (!db->sameLength()) {  m->mothurOut("[ERROR]: your sequences are not the same length, aborting.\n");  return; }
184         if (numSeqs < 2) {  m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n");  return; }
185 
186         string outputFile;
187         map<string, string> variables;
188         variables["[filename]"] = outputFileRoot;
189 
190         if (output == "lt") { //does the user want lower triangle phylip formatted file
191             variables["[outputtag]"] = "phylip";
192             outputFile = getOutputFileName("phylip", variables);
193             util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
194 
195             //output numSeqs to phylip formatted dist file
196         }else if (output == "column") { //user wants column format
197 
198             outputFile = getOutputFileName("column", variables);
199             outputTypes["column"].push_back(outputFile);
200             util.mothurRemove(outputFile);
201         }
202 
203 
204         m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
205 
206         createProcesses(outputFile);
207 
208         m->mothurOut("\nOutput File Names:\n"); m->mothurOut(outputFile+"\n\n");
209 
210     }
211     catch(exception& e) {
212         m->errorOut(e, "DistanceCommand", "DistanceCommand");
213         exit(1);
214     }
215 }
216 //**********************************************************************************************************************
217 
execute()218 int DistanceCommand::execute(){
219 	try {
220 
221 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
222 
223         numDistsBelowCutoff = 0;
224 
225         ifstream inFASTA; util.openInputFile(fastafile, inFASTA);
226         if (prot) { db = new ProteinDB(inFASTA);  }
227         else      { db = new SequenceDB(inFASTA); }
228         inFASTA.close();
229 
230 		//save number of new sequence
231 		numNewFasta = db->getNumSeqs();
232 
233 		//sanity check the oldfasta and column file as well as add oldfasta sequences to db
234         if ((oldfastafile != "") && (column != ""))  {	if (!(sanityCheck())) { return 0; }  }
235 
236         if (m->getControl_pressed()) { delete db; return 0; }
237 
238 		numSeqs = db->getNumSeqs();
239 
240 		if (!db->sameLength()) {  m->mothurOut("[ERROR]: your sequences are not the same length, aborting.\n");  return 0; }
241         if (numSeqs < 2) {  m->mothurOut("[ERROR]: you must have at least 2 sequences to calculate the distances, aborting.\n");  return 0; }
242 
243 
244 		string outputFile;
245         map<string, string> variables;
246         variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(fastafile));
247         if ((oldfastafile != "") && (column != ""))  { variables["[filename]"] = outputdir + util.getRootName(util.getSimpleName(oldfastafile)); }
248 
249 		if (output == "lt") { //does the user want lower triangle phylip formatted file
250             variables["[outputtag]"] = "phylip";
251 			outputFile = getOutputFileName("phylip", variables);
252 			util.mothurRemove(outputFile); outputTypes["phylip"].push_back(outputFile);
253 
254 			//output numSeqs to phylip formatted dist file
255 		}else if (output == "column") { //user wants column format
256             if (fitCalc) {  variables["[outputtag]"] = "fit";  }
257 			outputFile = getOutputFileName("column", variables);
258 			outputTypes["column"].push_back(outputFile);
259 
260 			//so we don't accidentally overwrite
261 			if (outputFile == column) {
262 				string tempcolumn = column + ".old";
263 				rename(column.c_str(), tempcolumn.c_str());
264 			}
265 
266 			util.mothurRemove(outputFile);
267 		}else { //assume square
268 			variables["[outputtag]"] = "square";
269 			outputFile = getOutputFileName("phylip", variables);
270 			util.mothurRemove(outputFile);
271 			outputTypes["phylip"].push_back(outputFile);
272 		}
273         m->mothurOut("\nSequence\tTime\tNum_Dists_Below_Cutoff\n");
274 
275         createProcesses(outputFile);
276 
277 		if (m->getControl_pressed()) { outputTypes.clear();  util.mothurRemove(outputFile); return 0; }
278 
279 		ifstream fileHandle; fileHandle.open(outputFile.c_str());
280 		if(fileHandle) {
281 			util.gobble(fileHandle);
282 			if (fileHandle.eof()) { m->mothurOut(outputFile + " is blank. This can result if there are no distances below your cutoff.\n"); }
283 		}
284 
285 		//append the old column file to the new one
286 		if ((oldfastafile != "") && (column != ""))  {
287 			//we had to rename the column file so we didnt overwrite above, but we want to keep old name
288 			if (outputFile == column) {
289 				string tempcolumn = column + ".old";
290 				util.appendFiles(tempcolumn, outputFile);
291 				util.mothurRemove(tempcolumn);
292 			}else{
293                 if (!fitCalc) {
294                     util.appendFiles(outputFile, column);
295                     util.mothurRemove(outputFile);
296                     outputFile = column;
297                 }
298 			}
299             outputTypes["column"].clear(); outputTypes["column"].push_back(outputFile);
300         }
301 
302 		if (m->getControl_pressed()) { outputTypes.clear();  util.mothurRemove(outputFile); return 0; }
303 
304 		//set phylip file as new current phylipfile
305 		string currentName = "";
306 		itTypes = outputTypes.find("phylip");
307 		if (itTypes != outputTypes.end()) {
308 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setPhylipFile(currentName); }
309 		}
310 
311 		//set column file as new current columnfile
312 		itTypes = outputTypes.find("column");
313 		if (itTypes != outputTypes.end()) {
314 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); }
315 		}
316 
317 		m->mothurOut("\nOutput File Names: \n");
318 		m->mothurOut(outputFile+"\n\n");
319 
320 		if (compress) {
321 			m->mothurOut("Compressing...\n");
322 			m->mothurOut("(Replacing " + outputFile + " with " + outputFile + ".gz)\n");
323 			system(("gzip -v " + outputFile).c_str());
324 			outputNames.push_back(outputFile + ".gz");
325 		}else { outputNames.push_back(outputFile); }
326 
327 		return 0;
328 
329 	}
330 	catch(exception& e) {
331 		m->errorOut(e, "DistanceCommand", "execute");
332 		exit(1);
333 	}
334 }
335 /**************************************************************************************************/
driverColumn(distanceData * params)336 void driverColumn(distanceData* params){
337     try {
338         ValidCalculators validCalculator;
339         DistCalc* distCalculator;
340 
341         if (!params->prot) {
342             if (params->countends) {
343                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
344                     if (params->calc == "nogaps")			{	distCalculator = new ignoreGaps(params->cutoff);	}
345                     else if (params->calc == "eachgap")	{	distCalculator = new eachGapDist(params->cutoff);	}
346                     else if (params->calc == "onegap")		{	distCalculator = new oneGapDist(params->cutoff);	}
347                 }
348             }else {
349                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
350                     if (params->calc == "nogaps")		{	distCalculator = new ignoreGaps(params->cutoff);					}
351                     else if (params->calc == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(params->cutoff);	}
352                     else if (params->calc == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(params->cutoff);		}
353                 }
354             }
355         }else {
356             if (validCalculator.isValidCalculator("protdist", params->calc) ) {
357                 if (params->calc == "jtt")        {    distCalculator = new JTT(params->cutoff);                    }
358                 else if (params->calc == "pmb")        {    distCalculator = new PMB(params->cutoff);               }
359                 else if (params->calc == "pam")        {    distCalculator = new PAM(params->cutoff);               }
360                 else if (params->calc == "kimura")        {    distCalculator = new Kimura(params->cutoff);               }
361             }
362         }
363 
364 
365         int startTime = time(NULL);
366 
367         params->count = 0;
368         string buffer = "";
369 
370         for(int i=params->startLine;i<params->endLine;i++){
371 
372             Sequence seqI; Protein seqIP; string nameI = "";
373             if (params->prot)   { seqIP = params->db->getProt(i);   nameI = seqIP.getName();    }
374             else                { seqI = params->db->getSeq(i);     nameI = seqI.getName();     }
375 
376             for(int j=0;j<i;j++){
377 
378                 if (params->m->getControl_pressed()) { break;  }
379 
380                 if ((i >= params->numNewFasta) && (j >= params->numNewFasta)) { break; }
381 
382                 double dist = 1.0; string nameJ = "";
383                 if (params->prot)   { Protein seqJP = params->db->getProt(j); nameJ = seqJP.getName(); dist = distCalculator->calcDist(seqIP, seqJP);   }
384                 else                { Sequence seqJ = params->db->getSeq(j);  nameJ = seqJ.getName(); dist = distCalculator->calcDist(seqI, seqJ);       }
385 
386 
387                 if(dist <= params->cutoff){
388                     buffer += (nameI + " " + nameJ + " " + toString(dist) + "\n");
389                     params->count++;
390                 }
391             }
392 
393             if(i % 100 == 0){ params->threadWriter->write(buffer);  buffer = ""; params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
394 
395         }
396         params->threadWriter->write(buffer);
397 
398         if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
399 
400         delete distCalculator;
401     }
402     catch(exception& e) {
403         params->m->errorOut(e, "DistanceCommand", "driverColumn");
404         exit(1);
405     }
406 }
407 /**************************************************************************************************/
driverLt(distanceData * params)408 void driverLt(distanceData* params){
409     try {
410         ValidCalculators validCalculator;
411         DistCalc* distCalculator;
412         double cutoff = 1.0;
413 
414         if (!params->prot) {
415             if (params->countends) {
416                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
417                     if (params->calc == "nogaps")			{	distCalculator = new ignoreGaps(cutoff);	}
418                     else if (params->calc == "eachgap")	{	distCalculator = new eachGapDist(cutoff);	}
419                     else if (params->calc == "onegap")		{	distCalculator = new oneGapDist(cutoff);	}
420                 }
421             }else {
422                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
423                     if (params->calc == "nogaps")		{	distCalculator = new ignoreGaps(cutoff);					}
424                     else if (params->calc == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(cutoff);	}
425                     else if (params->calc == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(cutoff);		}
426                 }
427             }
428         }else {
429             if (validCalculator.isValidCalculator("protdist", params->calc) ) {
430                 if (params->calc == "jtt")        {    distCalculator = new JTT(params->cutoff);                    }
431                 else if (params->calc == "pmb")        {    distCalculator = new PMB(params->cutoff);               }
432                 else if (params->calc == "pam")        {    distCalculator = new PAM(params->cutoff);               }
433                 else if (params->calc == "kimura")        {    distCalculator = new Kimura(params->cutoff);         }
434             }
435         }
436 
437         int startTime = time(NULL);
438         long long numSeqs = params->db->getNumSeqs();
439 
440         //column file
441         ofstream outFile; params->util.openOutputFile(params->outputFileName, outFile);
442         outFile.setf(ios::fixed, ios::showpoint); outFile << setprecision(4);
443 
444         if(params->startLine == 0){	outFile << numSeqs << endl;	}
445 
446         params->count = 0;
447         for(int i=params->startLine;i<params->endLine;i++){
448 
449             Sequence seqI; Protein seqIP; string nameI = "";
450             if (params->prot)   { seqIP = params->db->getProt(i);   nameI = seqIP.getName();    }
451             else                { seqI = params->db->getSeq(i);     nameI = seqI.getName();     }
452 
453             if (nameI.length() < 10) {  while (nameI.length() < 10) {  nameI += " ";  } }
454             outFile << nameI;
455 
456             for(int j=0;j<i;j++){
457 
458                 if (params->m->getControl_pressed()) { break;  }
459 
460                 if ((i >= params->numNewFasta) && (j >= params->numNewFasta)) { break; }
461 
462                 double dist = 1.0;
463                 if (params->prot)   { Protein seqJP = params->db->getProt(j);  dist = distCalculator->calcDist(seqIP, seqJP);   }
464                 else                { Sequence seqJ = params->db->getSeq(j);  dist = distCalculator->calcDist(seqI, seqJ);       }
465 
466                 if(dist <= params->cutoff){ params->count++; }
467                 outFile  << '\t' << dist;
468             }
469 
470             outFile << endl;
471 
472             if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
473 
474         }
475 
476         if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
477 
478         outFile.close();
479         delete distCalculator;
480     }
481     catch(exception& e) {
482         params->m->errorOut(e, "DistanceCommand", "driverLt");
483         exit(1);
484     }
485 }
486 /**************************************************************************************************/
driverSquare(distanceData * params)487 void driverSquare(distanceData* params){
488     try {
489         ValidCalculators validCalculator;
490         DistCalc* distCalculator;
491         double cutoff = 1.0;
492 
493         if (!params->prot) {
494             if (params->countends) {
495                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
496                     if (params->calc == "nogaps")			{	distCalculator = new ignoreGaps(cutoff);	}
497                     else if (params->calc == "eachgap")	{	distCalculator = new eachGapDist(cutoff);	    }
498                     else if (params->calc == "onegap")		{	distCalculator = new oneGapDist(cutoff);	}
499                 }
500             }else {
501                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
502                     if (params->calc == "nogaps")		{	distCalculator = new ignoreGaps(cutoff);					}
503                     else if (params->calc == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(cutoff);	}
504                     else if (params->calc == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(cutoff);		}
505                 }
506             }
507         }else {
508             if (validCalculator.isValidCalculator("protdist", params->calc) ) {
509                 if (params->calc == "jtt")        {    distCalculator = new JTT(params->cutoff);                    }
510                 else if (params->calc == "pmb")        {    distCalculator = new PMB(params->cutoff);               }
511                 else if (params->calc == "pam")        {    distCalculator = new PAM(params->cutoff);               }
512                 else if (params->calc == "kimura")        {    distCalculator = new Kimura(params->cutoff);         }
513             }
514         }
515 
516         int startTime = time(NULL);
517 
518         //column file
519         ofstream outFile;
520         params->util.openOutputFile(params->outputFileName, outFile);
521         outFile.setf(ios::fixed, ios::showpoint); outFile << setprecision(4);
522 
523         long long numSeqs = params->db->getNumSeqs();
524         if(params->startLine == 0){	outFile << numSeqs << endl;	}
525 
526         params->count = 0;
527         for(int i=params->startLine;i<params->endLine;i++){
528 
529             Sequence seqI; Protein seqIP; string nameI = "";
530             if (params->prot)   { seqIP = params->db->getProt(i);   nameI = seqIP.getName();    }
531             else                { seqI = params->db->getSeq(i);     nameI = seqI.getName();     }
532 
533             if (nameI.length() < 10) {  while (nameI.length() < 10) {  nameI += " ";  } }
534             outFile << nameI << '\t';
535 
536             for(int j=0;j<numSeqs;j++){
537 
538                 if (params->m->getControl_pressed()) { break; }
539 
540                 double dist = 1.0;
541                 if (i == j) { dist = 0.0000; }
542                 else {
543                     if (params->prot)   { Protein seqJP = params->db->getProt(j);  dist = distCalculator->calcDist(seqIP, seqJP);   }
544                     else                { Sequence seqJ = params->db->getSeq(j);  dist = distCalculator->calcDist(seqI, seqJ);       }
545                 }
546 
547                 if(dist <= params->cutoff){ params->count++; }
548 
549                 outFile << dist << '\t';
550             }
551 
552             outFile << endl;
553 
554             if(i % 100 == 0){ params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n");  }
555         }
556 
557         if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
558 
559         outFile.close();
560         delete distCalculator;
561     }
562     catch(exception& e) {
563         params->m->errorOut(e, "DistanceCommand", "driverSquare");
564         exit(1);
565     }
566 }
567 /**************************************************************************************************/
driverFitCalc(distanceData * params)568 void driverFitCalc(distanceData* params){
569     try {
570         ValidCalculators validCalculator;
571         DistCalc* distCalculator;
572 
573         if (!params->prot) {
574             if (params->countends) {
575                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
576                     if (params->calc == "nogaps")			{	distCalculator = new ignoreGaps(params->cutoff);	}
577                     else if (params->calc == "eachgap")	{	distCalculator = new eachGapDist(params->cutoff);	}
578                     else if (params->calc == "onegap")		{	distCalculator = new oneGapDist(params->cutoff);	}
579                 }
580             }else {
581                 if (validCalculator.isValidCalculator("distance", params->calc) ) {
582                     if (params->calc == "nogaps")		{	distCalculator = new ignoreGaps(params->cutoff);					}
583                     else if (params->calc == "eachgap"){	distCalculator = new eachGapIgnoreTermGapDist(params->cutoff);	}
584                     else if (params->calc == "onegap")	{	distCalculator = new oneGapIgnoreTermGapDist(params->cutoff);		}
585                 }
586             }
587         }else {
588             if (validCalculator.isValidCalculator("protdist", params->calc) ) {
589                 if (params->calc == "jtt")        {    distCalculator = new JTT(params->cutoff);                    }
590                 else if (params->calc == "pmb")        {    distCalculator = new PMB(params->cutoff);               }
591                 else if (params->calc == "pam")        {    distCalculator = new PAM(params->cutoff);               }
592                 else if (params->calc == "kimura")        {    distCalculator = new Kimura(params->cutoff);         }
593             }
594         }
595 
596         int startTime = time(NULL);
597         params->count = 0;
598         string buffer = "";
599         for(int i=params->startLine;i<params->endLine;i++){
600 
601             Sequence seqI; Protein seqIP; string nameI = "";
602             if (params->prot)   { seqIP = params->oldFastaDB->getProt(i);   nameI = seqIP.getName();    }
603             else                { seqI = params->oldFastaDB->getSeq(i);     nameI = seqI.getName();     }
604 
605 
606             for(int j = 0; j < params->db->getNumSeqs(); j++){
607 
608                 if (params->m->getControl_pressed()) { break;  }
609 
610                 double dist = 1.0; string nameJ = "";
611 
612                 if (params->prot)   { Protein seqJP = params->db->getProt(j); nameJ = seqJP.getName(); dist = distCalculator->calcDist(seqIP, seqJP);   }
613                 else                { Sequence seqJ = params->db->getSeq(j);  nameJ = seqJ.getName(); dist = distCalculator->calcDist(seqI, seqJ);       }
614 
615                 if(dist <= params->cutoff){
616                     buffer += nameI + " " + nameJ + " " + toString(dist) + "\n";
617                     params->count++;
618                 }
619             }
620 
621             if(i % 100 == 0){ params->threadWriter->write(buffer);  buffer = ""; params->m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
622 
623         }
624         params->threadWriter->write(buffer);
625 
626         if((params->endLine-1) % 100 != 0){ params->m->mothurOutJustToScreen(toString(params->endLine-1) + "\t" + toString(time(NULL) - startTime) + "\t" + toString(params->count) +"\n"); }
627 
628         delete distCalculator;
629 
630     }
631     catch(exception& e) {
632         params->m->errorOut(e, "DistanceCommand", "driverFitCalc");
633         exit(1);
634     }
635 }
636 /**************************************************************************************************/
createProcesses(string filename)637 void DistanceCommand::createProcesses(string filename) {
638     try {
639         long long num = db->getNumSeqs();
640         long long distsBelowCutoff = 0;
641         time_t start, end;
642         time(&start);
643 
644         //create array of worker threads
645         vector<std::thread*> workerThreads;
646         vector<distanceData*> data;
647 
648         double numDists = 0;
649 
650         if (output == "square") { numDists = numSeqs; }
651         else { for(int i=0;i<numSeqs;i++){ for(int j=0;j<i;j++){ numDists++; if (numDists > processors) { break; } } } }
652         if (numDists < processors) { processors = numDists; }
653 
654         vector<linePair> lines;
655         for (int i = 0; i < processors; i++) {
656             linePair tempLine;
657             lines.push_back(tempLine);
658             if (output != "square") {
659                 lines[i].start = int (sqrt(float(i)/float(processors)) * numSeqs);
660                 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numSeqs);
661             }else{
662                 lines[i].start = int ((float(i)/float(processors)) * numSeqs);
663                 lines[i].end = int ((float(i+1)/float(processors)) * numSeqs);
664             }
665         }
666 
667         auto synchronizedOutputFile = std::make_shared<SynchronizedOutputFile>(filename);
668         synchronizedOutputFile->setFixedShowPoint(); synchronizedOutputFile->setPrecision(4);
669 
670         StorageDatabase* oldFastaDB;
671         if (fitCalc) {
672             ifstream inFASTA; util.openInputFile(oldfastafile, inFASTA);
673             if (!prot) { oldFastaDB = new SequenceDB(inFASTA); }
674             else                    { oldFastaDB = new ProteinDB(inFASTA); }
675             inFASTA.close();
676 
677             lines.clear();
678             if (processors > oldFastaDB->getNumSeqs()) { processors = oldFastaDB->getNumSeqs(); }
679             int remainingSeqs = oldFastaDB->getNumSeqs();
680             int startIndex = 0;
681             for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
682                 int numSeqsToFit = remainingSeqs; //case for last processor
683                 if (remainingProcessors != 1) { numSeqsToFit = ceil(remainingSeqs / remainingProcessors); }
684                 lines.push_back(linePair(startIndex, (startIndex+numSeqsToFit))); //startIndex, endIndex
685                 startIndex = startIndex + numSeqsToFit;
686                 remainingSeqs -= numSeqsToFit;
687             }
688         }
689 
690         //Lauch worker threads
691         for (int i = 0; i < processors-1; i++) {
692             OutputWriter* threadWriter = NULL;
693             distanceData* dataBundle = NULL;
694             string extension = toString(i+1) + ".temp";
695             if (output == "column") {
696                 threadWriter = new OutputWriter(synchronizedOutputFile);
697                 dataBundle = new distanceData(threadWriter);
698             }else { dataBundle = new distanceData(filename+extension); }
699             dataBundle->setVariables(lines[i+1].start, lines[i+1].end, cutoff, db, oldFastaDB, calc, prot, numNewFasta, countends);
700             data.push_back(dataBundle);
701 
702             std::thread* thisThread = NULL;
703             if (output == "column")     {
704                 if (fitCalc)    { thisThread = new std::thread(driverFitCalc, dataBundle);   }
705                 else            {  thisThread = new std::thread(driverColumn, dataBundle);   }
706             }
707             else if (output == "lt")    { thisThread = new std::thread(driverLt, dataBundle);            }
708             else                        { thisThread = new std::thread(driverSquare, dataBundle);        }
709             workerThreads.push_back(thisThread);
710         }
711 
712         OutputWriter* threadWriter = NULL;
713         distanceData* dataBundle = NULL;
714         if (output == "column") {
715             threadWriter = new OutputWriter(synchronizedOutputFile);
716             dataBundle = new distanceData(threadWriter);
717         }else { dataBundle = new distanceData(filename); }
718         dataBundle->setVariables(lines[0].start, lines[0].end, cutoff, db, oldFastaDB, calc, prot, numNewFasta, countends);
719 
720         if (output == "column")     {
721             if (fitCalc)    { driverFitCalc(dataBundle);    }
722             else            { driverColumn(dataBundle);     }
723         }
724         else if (output == "lt")    { driverLt(dataBundle);            }
725         else                        { driverSquare(dataBundle);        }
726         distsBelowCutoff = dataBundle->count;
727 
728 
729         for (int i = 0; i < processors-1; i++) {
730             workerThreads[i]->join();
731 
732             distsBelowCutoff += data[i]->count;
733             if (output == "column") {  delete data[i]->threadWriter; }
734             else {
735                 string extension = toString(i+1) + ".temp";
736                 util.appendFiles((filename+extension), filename);
737                 util.mothurRemove(filename+extension);
738             }
739             delete data[i];
740             delete workerThreads[i];
741         }
742         if (output == "column")     { synchronizedOutputFile->close(); delete threadWriter; }
743         delete dataBundle;
744 
745         time(&end);
746         m->mothurOut("\nIt took " + toString(difftime(end, start)) + " secs to find distances for " + toString(num) + " sequences. " + toString(distsBelowCutoff+numDistsBelowCutoff) + " distances below cutoff " + toString(cutoff) + ".\n\n");
747 
748 	}
749 	catch(exception& e) {
750 		m->errorOut(e, "DistanceCommand", "createProcesses");
751 		exit(1);
752 	}
753 }
754 
755 /**************************************************************************************************/
756 //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
757 //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
758 //also check to make sure the 2 files have the same alignment length.
sanityCheck()759 bool DistanceCommand::sanityCheck() {
760 	try{
761 		bool good = true;
762 
763 		//make sure the 2 fasta files have the same alignment length
764 		ifstream in; util.openInputFile(fastafile, in);
765 		int fastaAlignLength = 0;
766 
767 		if (in) {
768             if (!prot) {
769                 Sequence tempIn(in);
770                 fastaAlignLength = tempIn.getAligned().length();
771             }else {
772                 Protein tempIn(in);
773                 fastaAlignLength = tempIn.getAligned().size();
774             }
775 		}
776 		in.close();
777 
778 		ifstream in2; util.openInputFile(oldfastafile, in2);
779 		int oldfastaAlignLength = 0;
780 		if (in2) {
781             if (!prot) {
782                 Sequence tempIn(in2);
783                 oldfastaAlignLength = tempIn.getAligned().length();
784             }else {
785                 Protein tempIn(in2);
786                 oldfastaAlignLength = tempIn.getAligned().size();
787             }
788 		}
789 		in2.close();
790 
791 		if (fastaAlignLength != oldfastaAlignLength) { m->mothurOut("fasta files do not have the same alignment length.\n");  return false;  }
792 
793         //read fasta file and save names as well as adding them to the alignDB
794         set<string> namesOldFasta;
795 
796         ifstream inFasta; util.openInputFile(oldfastafile, inFasta);
797 
798         while (!inFasta.eof()) {
799             if (m->getControl_pressed()) {  inFasta.close(); return good;  }
800 
801             if (!prot) {
802                 Sequence temp(inFasta);  util.gobble(inFasta);
803                 if (temp.getName() != "") {
804                     namesOldFasta.insert(temp.getName());  //save name
805                     if (!fitCalc) { db->push_back(temp);  }//add to DB
806                 }
807             }else {
808                 Protein temp(inFasta);  util.gobble(inFasta);
809                 if (temp.getName() != "") {
810                     namesOldFasta.insert(temp.getName());  //save name
811                     if (!fitCalc) { db->push_back(temp);  }//add to DB
812                 }
813             }
814         }
815         inFasta.close();
816 
817 		//read through the column file checking names and removing distances above the cutoff
818 		ifstream inDist;
819 		util.openInputFile(column, inDist);
820 
821 		ofstream outDist;
822 		string outputFile = column + ".temp";
823 		util.openOutputFile(outputFile, outDist);
824 
825 		string name1, name2;
826 		float dist;
827 		while (!inDist.eof()) {
828 			if (m->getControl_pressed()) {  inDist.close(); outDist.close(); util.mothurRemove(outputFile); return good;  }
829 
830             inDist >> name1; util.gobble(inDist); inDist >> name2; util.gobble(inDist); inDist >> dist; util.gobble(inDist);
831 
832 			//both names are in fasta file and distance is below cutoff
833 			if ((namesOldFasta.count(name1) == 0) || (namesOldFasta.count(name2) == 0)) {  good = false; break;  }
834             else{ if (dist <= cutoff) { numDistsBelowCutoff++; outDist << name1 << '\t' << name2 << '\t' << dist << endl; } }
835 		}
836 
837 		inDist.close();
838 		outDist.close();
839 
840 		if (good) {
841 			util.mothurRemove(column);
842 			rename(outputFile.c_str(), column.c_str());
843 		}else{
844 			util.mothurRemove(outputFile); //temp file is bad because file mismatch above
845 		}
846 
847 		return good;
848 
849 	}
850 	catch(exception& e) {
851 		m->errorOut(e, "DistanceCommand", "sanityCheck");
852 		exit(1);
853 	}
854 }
855 /**************************************************************************************************/
856 
857 
858 
859 
860