1 /*
2  *  clustercommand.cpp
3  *  Dotur
4  *
5  *  Created by Sarah Westcott on 1/2/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9 
10 #include "clustercommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "sequence.hpp"
15 #include "systemcommand.h"
16 #include "sensspeccommand.h"
17 #include "mcc.hpp"
18 #include "sensitivity.hpp"
19 #include "specificity.hpp"
20 #include "fdr.hpp"
21 #include "npv.hpp"
22 #include "ppv.hpp"
23 #include "f1score.hpp"
24 #include "tp.hpp"
25 #include "fp.hpp"
26 #include "fpfn.hpp"
27 #include "tptn.hpp"
28 #include "tn.hpp"
29 #include "fn.hpp"
30 #include "accuracy.hpp"
31 
32 
33 
34 //**********************************************************************************************************************
setParameters()35 vector<string> ClusterCommand::setParameters(){
36 	try {
37         CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
38         CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
39         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
40         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
41         CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
42 		CommandParameter pcutoff("cutoff", "Number", "", "0.03", "", "", "","",false,false,true); parameters.push_back(pcutoff);
43 		CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
44 		CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti-unique", "opti", "", "", "","",false,false,true); parameters.push_back(pmethod);
45         CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
46         CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
47         CommandParameter pmetriccutoff("delta", "Number", "", "0.0001", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
48         CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
49 		CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
50 		CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
51 		CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim);
52         CommandParameter pvsearchlocation("vsearch", "String", "", "", "", "", "","",false,false); parameters.push_back(pvsearchlocation);
53         CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
54         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
55         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
56 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
57 
58         abort = false; calledHelp = false;
59 
60         vector<string> tempOutNames;
61         outputTypes["list"] = tempOutNames;
62         outputTypes["sensspec"] = tempOutNames;
63         outputTypes["rabund"] = tempOutNames;
64         outputTypes["sabund"] = tempOutNames;
65         outputTypes["steps"] = tempOutNames;
66 
67 		vector<string> myArray;
68 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
69 		return myArray;
70 	}
71 	catch(exception& e) {
72 		m->errorOut(e, "ClusterCommand", "setParameters");
73 		exit(1);
74 	}
75 }
76 //**********************************************************************************************************************
getHelpString()77 string ClusterCommand::getHelpString(){
78 	try {
79 		string helpString = "";
80 		helpString += "The cluster command parameter options are phylip, column, name, count, method, cutoff, precision, sim, showabund, timing, metric, iters, initialize. Fasta or Phylip or column and name are required.\n";
81         helpString += "The phylip and column parameter allow you to enter your distance file. \n";
82         helpString += "The fasta parameter allows you to enter your fasta file for use with the agc or dgc methods. \n";
83         helpString += "The name parameter allows you to enter your name file. \n";
84         helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format.\n";
85         helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
86         helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
87         helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its own OTU, or oneotu meaning all sequences are assigned to one otu. Default=singleton.\n";
88         helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method (delta=0.0001). \n";
89         helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc, unique and opti. Default=opti.  The agc and dgc methods require a fasta file.";
90         helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
91          helpString += "The vsearch parameter allows you to specify the name and location of your vsearch executable if using agc or dgc clustering methods. By default mothur will look in your path, mothur's executable and mothur tools locations.  You can set the vsearch location as follows, vsearch=/usr/bin/vsearch.\n";
92        helpString += "The cluster command should be in the following format: \n";
93 		helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
94 		return helpString;
95 	}
96 	catch(exception& e) {
97 		m->errorOut(e, "ClusterCommand", "getHelpString");
98 		exit(1);
99 	}
100 }
101 //**********************************************************************************************************************
getOutputPattern(string type)102 string ClusterCommand::getOutputPattern(string type) {
103     try {
104         string pattern = "";
105 
106         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
107         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
108         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
109         else if (type == "sensspec") {  pattern = "[filename],[clustertag],sensspec"; }
110         else if (type == "steps") {  pattern = "[filename],[clustertag],steps"; }
111         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
112 
113         return pattern;
114     }
115     catch(exception& e) {
116         m->errorOut(e, "ClusterCommand", "getOutputPattern");
117         exit(1);
118     }
119 }
120 //**********************************************************************************************************************
121 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
ClusterCommand(string option)122 ClusterCommand::ClusterCommand(string option) : Command()  {
123 	try{
124 		//allow user to run help
125 		if(option == "help") { help(); abort = true; calledHelp = true; }
126 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
127         else if(option == "category") {  abort = true; calledHelp = true;  }
128 
129 		else {
130 			OptionParser parser(option, setParameters());
131 			map<string,string> parameters = parser.getParameters();
132 
133 			ValidParameters validParameter;
134 
135 
136 			//check for required parameters
137 			phylipfile = validParameter.validFile(parameters, "phylip");
138 			if (phylipfile == "not open") { phylipfile = ""; abort = true; }
139 			else if (phylipfile == "not found") { phylipfile = ""; }
140 			else {  distfile = phylipfile;  format = "phylip"; 	current->setPhylipFile(phylipfile); }
141 
142 			columnfile = validParameter.validFile(parameters, "column");
143 			if (columnfile == "not open") { columnfile = ""; abort = true; }
144 			else if (columnfile == "not found") { columnfile = ""; }
145 			else {  distfile = columnfile; format = "column"; current->setColumnFile(columnfile);	}
146 
147             fastafile = validParameter.validFile(parameters, "fasta");
148             if (fastafile == "not open") { abort = true; }
149             else if (fastafile == "not found") { fastafile = ""; }
150             else { distfile = fastafile;  format = "fasta"; current->setFastaFile(fastafile); }
151 
152 			namefile = validParameter.validFile(parameters, "name");
153 			if (namefile == "not open") { abort = true; }
154 			else if (namefile == "not found") { namefile = ""; }
155 			else { current->setNameFile(namefile); }
156 
157             countfile = validParameter.validFile(parameters, "count");
158 			if (countfile == "not open") { abort = true; countfile = ""; }
159 			else if (countfile == "not found") { countfile = ""; }
160 			else { current->setCountFile(countfile); }
161 
162             method = validParameter.valid(parameters, "method");
163             if (method == "not found") {  method = "opti";}
164 
165             vector<string> versionOutputs;
166             bool foundTool = false;
167             string path = current->getProgramPath();
168             string programName = "vsearch"; programName += EXECUTABLE_EXT;
169 
170             vsearchLocation = validParameter.validPath(parameters, "vsearch");
171             if (vsearchLocation == "not found") {
172                 vsearchLocation = "";
173                 if ((method == "agc") || (method == "dgc")) {
174                     foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
175                 }
176             }
177             else {
178                 if ((method == "agc") || (method == "dgc")) {
179                     //test to make sure vsearch exists
180                     ifstream in;
181                     vsearchLocation = util.getFullPathName(vsearchLocation);
182                     bool ableToOpen = util.openInputFile(vsearchLocation, in, "no error"); in.close();
183                     if(!ableToOpen) {
184                         m->mothurOut(vsearchLocation + " file does not exist or cannot be opened, ignoring.\n"); vsearchLocation = "";
185                         programName = util.getSimpleName(vsearchLocation); vsearchLocation = "";
186                         foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
187                     }
188                 }
189             }
190 
191             if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc") || (method == "opti") || (method == "unique")) { }
192             else { m->mothurOut("[ERROR]: Not a valid clustering method.  Valid clustering algorithms are furthest, nearest, average, weighted, agc, dgc, unique and opti.\n");  abort = true; }
193 
194             if (method != "unique") {
195                 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
196                     //is there are current file available for either of these?
197                     //give priority to column, then phylip
198                     columnfile = current->getColumnFile();
199                     if (columnfile != "") {  distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter.\n");  }
200                     else {
201                         phylipfile = current->getPhylipFile();
202                         if (phylipfile != "") { distfile = phylipfile;  format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter.\n");  }
203                         else {
204                             fastafile = current->getFastaFile();
205                             if (fastafile != "") {  distfile = fastafile; format = "fasta"; m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n");  }
206                             else {
207                                 m->mothurOut("No valid current files. You must provide a phylip, column or fasta file before you can use the cluster command, unless using the unique method.\n");
208                                 abort = true;
209                             }
210                         }
211                     }
212                 }
213                 else if (((phylipfile != "") && (columnfile != "")) || ((phylipfile != "") && (fastafile != "")) || ((fastafile != "") && (columnfile != "")))  { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip, column or fasta.\n");  abort = true; }
214 
215                 if (columnfile != "") {
216                     if ((namefile == "") && (countfile == "")){
217                         namefile = current->getNameFile();
218                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter.\n");  }
219                         else {
220                             countfile = current->getCountFile();
221                             if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter.\n");  }
222                             else {
223                                 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format.\n");
224                                 abort = true;
225                             }
226                         }
227                     }
228                 }
229                 if ((method != "agc") && (method != "dgc")) {
230                     if ((columnfile == "") && (phylipfile == "")) {
231                         m->mothurOut("[ERROR]: You must provide a distance file unless you are using the agc, dgc or unique clustering methods, aborting\n."); abort = true;
232                     }
233                 }
234             }else {
235                 if ((countfile == "") && (namefile == "")) {
236                     countfile = current->getCountFile();
237                     if (countfile != "") { distfile = countfile;  format = "count"; m->mothurOut("Using " + countfile + " as input file for the count parameter.\n");  }
238                     else {
239                         namefile = current->getNameFile();
240                         if (namefile != "") {  distfile = namefile; format = "name"; m->mothurOut("Using " + namefile + " as input file for the name parameter.\n");  }
241                         else {
242                             m->mothurOut("No valid current files. You must provide a count or name file before you can use the cluster command with the unique method.\n");
243                             abort = true;
244                         }
245                     }
246 
247                 }
248                 else if(countfile != "")    { format = "count"; }
249                 else if(namefile != "")     { format = "name";  }
250             }
251             if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: count or name.\n");  abort = true; }
252 
253 			//check for optional parameter and set defaults
254 			// ...at some point should added some additional type checking...
255 			//get user cutoff and precision or use defaults
256 			string temp;
257 			temp = validParameter.valid(parameters, "precision");
258 			if (temp == "not found") { temp = "100"; }
259 			//saves precision legnth for formatting below
260 			length = temp.length();
261 			util.mothurConvert(temp, precision);
262 
263 			temp = validParameter.valid(parameters, "sim");				if (temp == "not found") { temp = "F"; }
264 			sim = util.isTrue(temp);
265 
266             temp = validParameter.valid(parameters, "delta");		if (temp == "not found")  { temp = "0.0001"; }
267             util.mothurConvert(temp, stableMetric);
268 
269             metricName = validParameter.valid(parameters, "metric");		if (metricName == "not found") { metricName = "mcc"; }
270 
271             if ((metricName == "mcc") || (metricName == "sens") || (metricName == "spec") || (metricName == "tptn") || (metricName == "tp") || (metricName == "tn") || (metricName == "fp") || (metricName == "fn") || (metricName == "f1score") || (metricName == "accuracy") || (metricName == "ppv") || (metricName == "npv") || (metricName == "fdr") || (metricName == "fpfn") ){ }
272             else { m->mothurOut("[ERROR]: Not a valid metric.  Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr.\n");  abort = true; }
273 
274             initialize = validParameter.valid(parameters, "initialize");		if (initialize == "not found") { initialize = "singleton"; }
275 
276             if ((initialize == "singleton") || (initialize == "oneotu")){ }
277             else { m->mothurOut("[ERROR]: Not a valid initialization.  Valid initializations are singleton and oneotu.\n");  abort = true; }
278 
279             temp = validParameter.valid(parameters, "iters");		if (temp == "not found")  { temp = "100"; }
280             util.mothurConvert(temp, maxIters);
281 
282             adjust=-1.0;
283 
284             bool setProcessors = true;
285             temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){ setProcessors=false;	temp = current->getProcessors();	}
286             processors = current->setProcessors(temp);
287 
288             if ((method == "agc") || (method == "dgc")) {
289                 if (fastafile == "") { m->mothurOut("[ERROR]: You must provide a fasta file when using the agc or dgc clustering methods, aborting\n."); abort = true;}
290             }else if (setProcessors) {
291                 m->mothurOut("[WARNING]: You can only use the processors option when using the agc or dgc clustering methods. Using 1 processor.\n.");
292             }
293 
294             cutOffSet = false;
295             temp = validParameter.valid(parameters, "cutoff");
296             if (temp == "not found") { if ((method == "opti") || (method == "agc") || (method == "dgc")) { temp = "0.03"; }else { temp = "0.15"; } }
297             else { cutOffSet = true; }
298             int pos = temp.find('-');
299             if (pos != string::npos) { //multiple cutoffs given
300                 if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) {
301                     m->mothurOut("[WARNING]: Multiple cutoffs can only be specified when using the agc, dgc or opti method. Using 0.15. \n.");
302                     cutOffSet = false; temp = "0.15";
303                 }else { util.splitAtDash(temp, cutoffs);  temp = *cutoffs.begin(); }
304             }else {     cutoffs.insert(temp);  }
305             util.mothurConvert(temp, cutoff);
306 
307 			showabund = validParameter.valid(parameters, "showabund");
308 			if (showabund == "not found") { showabund = "T"; }
309 
310 			timing = validParameter.valid(parameters, "timing");
311 			if (timing == "not found") { timing = "F"; }
312 		}
313 	}
314 	catch(exception& e) {
315 		m->errorOut(e, "ClusterCommand", "ClusterCommand");
316 		exit(1);
317 	}
318 }
319 //**********************************************************************************************************************
~ClusterCommand()320 ClusterCommand::~ClusterCommand(){}
321 //**********************************************************************************************************************
322 
execute()323 int ClusterCommand::execute(){
324 	try {
325 
326 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
327 
328 		//phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster
329 		if ((format == "phylip") && (!cutOffSet) && (method != "opti")) {
330 			m->mothurOut("\nYou are using a phylip file and no cutoff.  I will run cluster.classic to save memory and time.\n");
331 
332 			//run unique.seqs for deconvolute results
333 			string inputString = "phylip=" + distfile;
334 			if (namefile != "") { inputString += ", name=" + namefile; }
335             else if (countfile != "") { inputString += ", count=" + countfile; }
336 			inputString += ", precision=" + toString(precision);
337 			inputString += ", method=" + method;
338             if (sim)	{ inputString += ", sim=T";		}
339 			else		{ inputString += ", sim=F";		}
340 
341 			m->mothurOut("\n/------------------------------------------------------------/\n");
342 			m->mothurOut("Running command: cluster.classic(" + inputString + ")\n");
343 
344 			Command* clusterClassicCommand = new ClusterDoturCommand(inputString);
345 			clusterClassicCommand->execute();
346 			delete clusterClassicCommand;
347 
348 			m->mothurOut("/------------------------------------------------------------/\n");
349 
350 			return 0;
351 		}
352 
353         time_t estart = time(NULL);
354 
355         if (format == "fasta")          {   runVsearchCluster();    }
356         else if (method == "opti")      {   runOptiCluster();       }
357         else if (method == "unique")    {   runUniqueCluster();     }
358         else                            {   runMothurCluster();     }
359 
360 		if (m->getControl_pressed()) { 	for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); }  return 0; }
361 
362         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster\n");
363 
364 		//set list file as new current listfile
365 		string currentName = "";
366 		itTypes = outputTypes.find("list");
367 		if (itTypes != outputTypes.end()) {
368 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
369 		}
370 
371 		//set rabund file as new current rabundfile
372 		itTypes = outputTypes.find("rabund");
373 		if (itTypes != outputTypes.end()) {
374 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setRabundFile(currentName); }
375 		}
376 
377 		//set sabund file as new current sabundfile
378 		itTypes = outputTypes.find("sabund");
379 		if (itTypes != outputTypes.end()) {
380 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setSabundFile(currentName); }
381 		}
382 
383 		m->mothurOut("\nOutput File Names: \n");
384 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
385 
386 		return 0;
387 	}
388 	catch(exception& e) {
389 		m->errorOut(e, "ClusterCommand", "execute");
390 		exit(1);
391 	}
392 }
393 //**********************************************************************************************************************
394 
runVsearchCluster()395 int ClusterCommand::runVsearchCluster(){
396     try {
397         string vsearchFastafile = ""; VsearchFileParser* vParse;
398         if ((namefile == "") && (countfile == ""))  { vParse = new VsearchFileParser(fastafile);                        }
399         else if (namefile != "")                    { vParse = new VsearchFileParser(fastafile, namefile, "name");      }
400         else if (countfile != "")                   { vParse = new VsearchFileParser(fastafile, countfile, "count");    }
401         else                                        { m->mothurOut("[ERROR]: Opps, should never get here. ClusterCommand::runVsearchCluster() \n"); m->setControl_pressed(true); return 0; }
402 
403         if (m->getControl_pressed()) {  delete vParse; return 0; }
404 
405         vsearchFastafile = vParse->getVsearchFile();
406 
407         if (cutoff > 1.0) {  m->mothurOut("You did not set a cutoff, using 0.03.\n"); cutoff = 0.03; }
408 
409         map<string, int> counts;
410         map<string, string> variables;
411         if (outputdir == "") { outputdir += util.hasPath(distfile); }
412         fileroot = outputdir + util.getRootName(util.getSimpleName(distfile)); tag = method;
413 
414         variables["[filename]"] = fileroot;
415         variables["[clustertag]"] = tag;
416         string listFileName = getOutputFileName("list", variables);
417         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
418 
419         ofstream out;
420         util.openOutputFile(listFileName,	out);
421         bool printHeaders = true;
422 
423         for (set<string>::iterator it = cutoffs.begin(); it != cutoffs.end(); it++) {
424 
425             m->mothurOut("\n" + *it + "\n");
426             util.mothurConvert(*it, cutoff);
427 
428             //Run vsearch
429             string ucVsearchFile = util.getSimpleName(vsearchFastafile) + ".clustered.uc";
430             string logfile = util.getSimpleName(vsearchFastafile) + ".clustered.log";
431             vsearchDriver(vsearchFastafile, ucVsearchFile, logfile);
432 
433             if (m->getControl_pressed()) { break; }
434 
435             //Convert outputted *.uc file into a list file
436             ListVector list = vParse->createListFile(ucVsearchFile, vParse->getNumBins(logfile), toString(1.0-cutoff), counts);
437 
438             if (printHeaders) {
439                 printHeaders = false;
440             }else {  list.setPrintedLabels(printHeaders);  }
441 
442             if (countfile != "") { list.print(out, counts); }
443             else { list.print(out); }
444 
445             //remove temp files
446             util.mothurRemove(ucVsearchFile); util.mothurRemove(logfile);
447 
448         }
449         out.close();
450         util.mothurRemove(vsearchFastafile); delete vParse;
451         if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
452 
453         return 0;
454     }
455     catch(exception& e) {
456         m->errorOut(e, "ClusterCommand", "runVsearchCluster");
457         exit(1);
458     }
459 }
460 //**********************************************************************************************************************
461 
vsearchDriver(string inputFile,string ucClusteredFile,string logfile)462 int ClusterCommand::vsearchDriver(string inputFile, string ucClusteredFile, string logfile){
463     try {
464 
465         //vsearch --maxaccepts 16 --usersort --id 0.97 --minseqlength 30 --wordlength 8 --uc $ROOT.clustered.uc --cluster_smallmem $ROOT.sorted.fna --maxrejects 64 --strand both --log $ROOT.clustered.log --sizeorder
466 
467 
468         //no sizeorder for dgc
469 
470         ucClusteredFile = util.getFullPathName(ucClusteredFile);
471         inputFile = util.getFullPathName(inputFile);
472         logfile = util.getFullPathName(logfile);
473 
474         //to allow for spaces in the path
475         ucClusteredFile = "\"" + ucClusteredFile + "\"";
476         inputFile = "\"" + inputFile + "\"";
477         logfile = "\"" + logfile + "\"";
478 
479         vector<char*> cPara;
480 
481         string vsearchCommand = vsearchLocation;
482         vsearchCommand = "\"" + vsearchCommand + "\" ";
483 
484         vector<char*> vsearchParameters;
485         vsearchParameters.push_back(util.mothurConvert(vsearchCommand));
486 
487         //--maxaccepts=16
488         vsearchParameters.push_back(util.mothurConvert("--maxaccepts=16"));
489 
490         //--threads=1
491         string processorsString = "--threads=" + toString(processors);
492         vsearchParameters.push_back(util.mothurConvert(processorsString));
493 
494         //--usersort
495         vsearchParameters.push_back(util.mothurConvert("--usersort"));
496 
497         //--id=0.97
498         cutoff = abs(1.0 - cutoff); string cutoffString = toString(cutoff);
499         if (cutoffString.length() > 4) {  cutoffString = cutoffString.substr(0, 4);  }
500         else if (cutoffString.length() < 4)  {  for (int i = cutoffString.length(); i < 4; i++)  { cutoffString += "0";  } }
501 
502         cutoffString = "--id=" +  cutoffString;
503         vsearchParameters.push_back(util.mothurConvert(cutoffString));
504 
505         //--minseqlength=30
506         vsearchParameters.push_back(util.mothurConvert("--minseqlength=30"));
507 
508         //--wordlength=8
509         vsearchParameters.push_back(util.mothurConvert("--wordlength=8"));
510 
511         //--uc=$ROOT.clustered.uc
512         string tempIn = "--uc=" + ucClusteredFile;
513         vsearchParameters.push_back(util.mothurConvert(tempIn));
514 
515         //--cluster_smallmem $ROOT.sorted.fna
516         string tempSorted = "--cluster_smallmem=" + inputFile;
517         vsearchParameters.push_back(util.mothurConvert(tempSorted));
518 
519         //--maxrejects=64
520         vsearchParameters.push_back(util.mothurConvert("--maxrejects=64"));
521 
522         //--strand=both
523         vsearchParameters.push_back(util.mothurConvert("--strand=both"));
524 
525         //--log=$ROOT.clustered.log
526         string tempLog = "--log=" + logfile;
527         vsearchParameters.push_back(util.mothurConvert(tempLog));
528 
529         if (method == "agc") { //--sizeorder
530             vsearchParameters.push_back(util.mothurConvert("--sizeorder"));
531          }
532 
533         if (m->getDebug()) {  m->mothurOut("[DEBUG]: "); for(int i = 0; i < vsearchParameters.size(); i++)  { m->mothurOut(toString(vsearchParameters[i]) + "\t"); } m->mothurOut("\n");  }
534 
535         string commandString = "";
536         for (int i = 0; i < vsearchParameters.size(); i++) {    commandString += toString(vsearchParameters[i]) + " "; }
537 
538 #if defined NON_WINDOWS
539 #else
540         commandString = "\"" + commandString + "\"";
541 #endif
542         if (m->getDebug()) { m->mothurOut("[DEBUG]: vsearch cluster command = " + commandString + ".\n"); }
543         system(commandString.c_str());
544 
545         //free memory
546         for(int i = 0; i < vsearchParameters.size(); i++)  {  delete vsearchParameters[i];  }
547 
548         //remove "" from filenames
549         ucClusteredFile = ucClusteredFile.substr(1, ucClusteredFile.length()-2);
550         inputFile = inputFile.substr(1, inputFile.length()-2);
551         logfile = logfile.substr(1, logfile.length()-2);
552 
553         return 0;
554     }
555     catch(exception& e) {
556         m->errorOut(e, "ClusterCommand", "vsearchDriver");
557         exit(1);
558     }
559 }
560 //**********************************************************************************************************************
561 
runMothurCluster()562 int ClusterCommand::runMothurCluster(){
563     try {
564 
565         ReadMatrix* read;
566         if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); }	//sim indicates whether its a similarity matrix
567         else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
568         else { m->setControl_pressed(true); return 0; }
569 
570         read->setCutoff(cutoff);
571 
572         NameAssignment* nameMap = NULL;
573         CountTable* ct = NULL;
574         map<string, int> counts;
575         if(namefile != ""){
576             nameMap = new NameAssignment(namefile);
577             nameMap->readMap();
578             read->read(nameMap);
579         }else if (countfile != "") {
580             ct = new CountTable();
581             ct->readTable(countfile, false, false);
582             read->read(ct);
583             counts = ct->getNameMap();
584         }else { read->read(nameMap); }
585 
586         list = read->getListVector();
587         matrix = read->getDMatrix();
588 
589         if(countfile != "") {
590             rabund = new RAbundVector();
591             createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
592             delete ct;
593         }else { rabund = new RAbundVector(list->getRAbundVector()); }
594         delete read;
595 
596         if (m->getControl_pressed()) { //clean up
597             delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close();  util.mothurRemove((fileroot+ tag + ".rabund")); util.mothurRemove((fileroot+ tag + ".sabund")); }
598             listFile.close(); util.mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
599         }
600 
601         //create cluster
602         if (method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
603         else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
604         else if(method == "average"){	cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);	}
605         else if(method == "weighted"){	cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust);	}
606         tag = cluster->getTag();
607 
608         if (outputdir == "") { outputdir += util.hasPath(distfile); }
609         fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
610 
611         map<string, string> variables;
612         variables["[filename]"] = fileroot;
613         variables["[clustertag]"] = tag;
614         string sabundFileName = getOutputFileName("sabund", variables);
615         string rabundFileName = getOutputFileName("rabund", variables);
616         //if (countfile != "") { variables["[tag2]"] = "unique_list"; }
617         string listFileName = getOutputFileName("list", variables);
618 
619         if (countfile == "") {
620             util.openOutputFile(sabundFileName,	sabundFile);
621             util.openOutputFile(rabundFileName,	rabundFile);
622             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
623             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
624 
625         }
626         util.openOutputFile(listFileName,	listFile);
627         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
628 
629         float previousDist = 0.00000;
630         float rndPreviousDist = 0.00000;
631         oldRAbund = *rabund;
632         oldList = *list;
633 
634         print_start = true;
635         start = time(NULL);
636         loops = 0;
637         double saveCutoff = cutoff;
638         bool printHeaders = true;
639 
640         while ((matrix->getSmallDist() <= cutoff) && (matrix->getNNodes() > 0)){
641 
642             if (m->getControl_pressed()) { //clean up
643                 delete list; delete matrix; delete rabund; delete cluster;
644                 if(countfile == "") {rabundFile.close(); sabundFile.close();  util.mothurRemove((fileroot+ tag + ".rabund")); util.mothurRemove((fileroot+ tag + ".sabund")); }
645                 listFile.close(); util.mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
646             }
647 
648             if (print_start && util.isTrue(timing)) {
649                 m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/"
650                              + toString(util.roundDist(matrix->getSmallDist(), precision))
651                              + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
652                 cout.flush();
653                 print_start = false;
654             }
655 
656             cluster->update(cutoff);
657 
658             float dist = matrix->getSmallDist();
659             float rndDist = util.ceilDist(dist, precision);
660 
661             if(previousDist <= 0.0000 && !util.isEqual(dist, previousDist))  {  printData("unique", counts, printHeaders);                               }
662             else if(!util.isEqual(rndDist, rndPreviousDist))                 { printData(toString(rndPreviousDist,  length-1), counts, printHeaders);    }
663 
664             previousDist = dist;
665             rndPreviousDist = rndDist;
666             oldRAbund = *rabund;
667             oldList = *list;
668         }
669 
670         if (print_start && util.isTrue(timing)) {
671             m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
672                          + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
673             cout.flush();
674             print_start = false;
675         }
676 
677         if(previousDist <= 0.0000)          { printData("unique", counts, printHeaders);                            }
678         else if(rndPreviousDist<cutoff)     { printData(toString(rndPreviousDist, length-1), counts, printHeaders); }
679 
680         delete matrix;
681         delete list;
682         delete rabund;
683         delete cluster;
684         if (countfile == "") {
685             sabundFile.close();
686             rabundFile.close();
687         }
688         listFile.close();
689 
690         if (!util.isEqual(saveCutoff, cutoff)) {
691             saveCutoff = util.ceilDist(saveCutoff, precision);
692             m->mothurOut("changed cutoff to " + toString(cutoff)+"\n");
693         }
694 
695         return 0;
696     }
697     catch(exception& e) {
698         m->errorOut(e, "ClusterCommand", "runMothurCluster");
699         exit(1);
700     }
701 }
702 //**********************************************************************************************************************
703 
printData(string label,map<string,int> & counts,bool & ph)704 void ClusterCommand::printData(string label, map<string, int>& counts, bool& ph){
705 	try {
706         oldList.setPrintedLabels(ph); ph = false;
707 
708 		if (util.isTrue(timing)) {
709 			m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins())
710 		     + "\tclusters. Updates: " + toString(loops)+"\n");
711 		}
712 		print_start = true;
713 		loops = 0;
714 		start = time(NULL);
715 
716         oldRAbund.setLabel(label);
717         if (countfile == "") {
718             oldRAbund.print(rabundFile);
719             oldRAbund.getSAbundVector().print(sabundFile);
720         }
721 
722         if (util.isTrue(showabund)) {
723             oldRAbund.getSAbundVector().print(cout);
724         }
725 
726 		oldList.setLabel(label);
727         if(countfile != "") {
728             oldList.print(listFile, counts);
729         }else {
730             oldList.print(listFile);
731         }
732 
733 	}
734 	catch(exception& e) {
735 		m->errorOut(e, "ClusterCommand", "printData");
736 		exit(1);
737 	}
738 
739 
740 }
741 //**********************************************************************************************************************
742 
createRabund(CountTable * & ct,ListVector * & list,RAbundVector * & rabund)743 int ClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
744     try {
745         rabund->setLabel(list->getLabel());
746         for(int i = 0; i < list->getNumBins(); i++) {
747             if (m->getControl_pressed()) { break; }
748             vector<string> binNames;
749             string bin = list->get(i);
750             util.splitAtComma(bin, binNames);
751             int total = 0;
752             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
753             rabund->push_back(total);
754         }
755         return 0;
756     }
757     catch(exception& e) {
758 		m->errorOut(e, "ClusterCommand", "createRabund");
759 		exit(1);
760 	}
761 
762 }
763 //**********************************************************************************************************************
764 
runOptiCluster()765 int ClusterCommand::runOptiCluster(){
766     try {
767         if (!cutOffSet) {  m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); cutoff = 0.03;  }
768 
769         m->mothurOut("\nClustering " + distfile+"\n");
770 
771         ClusterMetric* metric = NULL;
772         if (metricName == "mcc")             { metric = new MCC();              }
773         else if (metricName == "sens")       { metric = new Sensitivity();      }
774         else if (metricName == "spec")       { metric = new Specificity();      }
775         else if (metricName == "tptn")       { metric = new TPTN();             }
776         else if (metricName == "tp")         { metric = new TP();               }
777         else if (metricName == "tn")         { metric = new TN();               }
778         else if (metricName == "fp")         { metric = new FP();               }
779         else if (metricName == "fn")         { metric = new FN();               }
780         else if (metricName == "f1score")    { metric = new F1Score();          }
781         else if (metricName == "accuracy")   { metric = new Accuracy();         }
782         else if (metricName == "ppv")        { metric = new PPV();              }
783         else if (metricName == "npv")        { metric = new NPV();              }
784         else if (metricName == "fdr")        { metric = new FDR();              }
785         else if (metricName == "fpfn")       { metric = new FPFN();             }
786         else { return 0; }
787 
788         string nameOrCount = "";
789         string thisNamefile = "";
790         map<string, int> counts;
791         if (countfile != "") { nameOrCount = "count"; thisNamefile = countfile; CountTable ct; ct.readTable(countfile, false, false); counts = ct.getNameMap(); }
792         else if (namefile != "") { nameOrCount = "name"; thisNamefile = namefile; }
793 
794         string distfile = columnfile;
795         if (format == "phylip") { distfile = phylipfile; }
796 
797         if (outputdir == "") { outputdir += util.hasPath(distfile); }
798         fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
799         tag = "opti_" + metric->getName();
800 
801         string listFileName = fileroot+ tag + ".list";
802 
803         ofstream listFile;
804         util.openOutputFile(listFileName,	listFile);
805         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
806 
807         map<string, string> variables;
808         variables["[filename]"] = fileroot;
809         variables["[clustertag]"] = tag;
810         string outputName = getOutputFileName("steps", variables);
811         outputNames.push_back(outputName); outputTypes["steps"].push_back(outputName);
812         ofstream outStep;
813         util.openOutputFile(outputName, outStep);
814 
815         string sensspecFilename = fileroot+ tag + ".sensspec";
816         ofstream sensFile;
817         util.openOutputFile(sensspecFilename,	sensFile);
818         outputNames.push_back(sensspecFilename); outputTypes["sensspec"].push_back(sensspecFilename);
819 
820         sensFile << "label\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
821         m->mothurOut("\n\niter\ttime\tlabel\tnum_otus\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n");
822         outStep << "iter\ttime\tlabel\tnum_otus\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
823 
824         bool printHeaders = true;
825         for (set<string>::iterator it = cutoffs.begin(); it != cutoffs.end(); it++) {
826 
827             m->mothurOut("\n" + *it + "\n");
828             util.mothurConvert(*it, cutoff);
829 
830             OptiData* matrix = new OptiMatrix(distfile, thisNamefile, nameOrCount, format, cutoff, false);
831 
832             OptiCluster cluster(matrix, metric, 0);
833 
834             int iters = 0;
835             double listVectorMetric = 0; //worst state
836             double delta = 1;
837 
838             cluster.initialize(listVectorMetric, true, initialize);
839 
840             long long numBins = cluster.getNumBins();
841             double tp, tn, fp, fn;
842             vector<double> results = cluster.getStats(tp, tn, fp, fn);
843             m->mothurOut("0\t0\t" + toString(cutoff) + "\t" + toString(numBins) + "\t"+ toString(cutoff) + "\t" + toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
844             outStep << "0\t0\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t" << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
845             for (int i = 0; i < results.size(); i++) { m->mothurOut(toString(results[i]) + "\t"); outStep << results[i] << "\t"; }
846             m->mothurOutEndLine();
847             outStep << endl;
848 
849             while ((delta > stableMetric) && (iters < maxIters)) {
850 
851                 long start = time(NULL);
852 
853                 if (m->getControl_pressed()) { break; }
854                 double oldMetric = listVectorMetric;
855 
856                 cluster.update(listVectorMetric);
857 
858                 delta = abs(oldMetric - listVectorMetric);
859                 iters++;
860 
861                 results = cluster.getStats(tp, tn, fp, fn);
862                 numBins = cluster.getNumBins();
863 
864                 m->mothurOut(toString(iters) + "\t" + toString(time(NULL) - start) + "\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t"+ toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
865                 outStep << (toString(iters) + "\t" + toString(time(NULL) - start) + "\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t") << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
866                 for (int i = 0; i < results.size(); i++) { m->mothurOut(toString(results[i]) + "\t"); outStep << results[i] << "\t"; }
867                 m->mothurOutEndLine(); outStep << endl;
868             }
869             m->mothurOutEndLine(); m->mothurOutEndLine();
870 
871             if (m->getControl_pressed()) { delete matrix; delete metric; metric = NULL; return 0; }
872 
873             ListVector* list = cluster.getList();
874             list->setLabel(toString(cutoff));
875 
876             if (printHeaders) { //only print headers the first time
877                 printHeaders = false;
878             }else {  list->setPrintedLabels(printHeaders);  }
879 
880             if(countfile != "") { list->print(listFile, counts); }
881             else { list->print(listFile); }
882             delete list;
883 
884             results = cluster.getStats(tp, tn, fp, fn);
885 
886             sensFile << cutoff << '\t' << cutoff << '\t' << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
887             for (int i = 0; i < results.size(); i++) {  sensFile << results[i] << '\t'; } sensFile << '\n';
888 
889 
890             delete matrix;
891         }
892 
893         listFile.close();
894         sensFile.close();
895         outStep.close();
896 
897         return 0;
898 
899     }
900     catch(exception& e) {
901         m->errorOut(e, "ClusterCommand", "runOptiCluster");
902         exit(1);
903     }
904 
905 }
906 //**********************************************************************************************************************
907 
runUniqueCluster()908 int ClusterCommand::runUniqueCluster(){
909     try {
910         if (countfile != "") {  distfile = countfile;   }
911         else if (namefile != "") {  distfile = namefile;  }
912 
913         m->mothurOut("\nClustering " + distfile+"\n");
914 
915         ListVector list; list.setLabel("ASV");
916 
917         map<string, int> counts;
918         if (countfile != "") {
919             CountTable ct; ct.readTable(countfile, false, false); counts = ct.getNameMap();
920             for (map<string, int>::iterator it = counts.begin(); it != counts.end(); it++) {
921                 if (m->getControl_pressed()) { return 0; }
922                 list.push_back(it->first);
923             }
924         }else {
925             map<string, string> nameMap;
926             util.readNames(namefile, nameMap);
927             for (map<string, string>::iterator it = nameMap.begin(); it != nameMap.end(); it++) {
928                 if (m->getControl_pressed()) { return 0; }
929                 list.push_back(it->second);
930             }
931         }
932 
933         if (outputdir == "") { outputdir += util.hasPath(distfile); }
934         fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
935 
936         tag = "unique";
937         string listFileName = fileroot+ tag + ".list";
938 
939         ofstream listFile;
940         util.openOutputFile(listFileName,	listFile);
941         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
942 
943         if(countfile != "") { list.print(listFile, counts); }
944         else { list.print(listFile); }
945         listFile.close();
946 
947         map<string, string> variables;
948         variables["[filename]"] = fileroot;
949         variables["[clustertag]"] = tag;
950         string sabundFileName = getOutputFileName("sabund", variables);
951         string rabundFileName = getOutputFileName("rabund", variables);
952 
953         if (countfile == "") {
954             util.openOutputFile(sabundFileName,	sabundFile);
955             util.openOutputFile(rabundFileName,	rabundFile);
956             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
957             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
958 
959             SAbundVector sabund = list.getSAbundVector();
960             sabund.print(sabundFile);
961             sabundFile.close();
962 
963             RAbundVector rabund = list.getRAbundVector();
964             rabund.print(rabundFile);
965             rabundFile.close();
966         }
967 
968         return 0;
969     }
970     catch(exception& e) {
971         m->errorOut(e, "ClusterCommand", "runUniqueCluster");
972         exit(1);
973     }
974 
975 }
976 //**********************************************************************************************************************
977