1 /*
2 * clustercommand.cpp
3 * Dotur
4 *
5 * Created by Sarah Westcott on 1/2/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7 *
8 */
9
10 #include "clustercommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "sequence.hpp"
15 #include "systemcommand.h"
16 #include "sensspeccommand.h"
17 #include "mcc.hpp"
18 #include "sensitivity.hpp"
19 #include "specificity.hpp"
20 #include "fdr.hpp"
21 #include "npv.hpp"
22 #include "ppv.hpp"
23 #include "f1score.hpp"
24 #include "tp.hpp"
25 #include "fp.hpp"
26 #include "fpfn.hpp"
27 #include "tptn.hpp"
28 #include "tn.hpp"
29 #include "fn.hpp"
30 #include "accuracy.hpp"
31
32
33
34 //**********************************************************************************************************************
setParameters()35 vector<string> ClusterCommand::setParameters(){
36 try {
37 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
38 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
39 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
40 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
41 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
42 CommandParameter pcutoff("cutoff", "Number", "", "0.03", "", "", "","",false,false,true); parameters.push_back(pcutoff);
43 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
44 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti-unique", "opti", "", "", "","",false,false,true); parameters.push_back(pmethod);
45 CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
46 CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
47 CommandParameter pmetriccutoff("delta", "Number", "", "0.0001", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
48 CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
49 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
50 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
51 CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim);
52 CommandParameter pvsearchlocation("vsearch", "String", "", "", "", "", "","",false,false); parameters.push_back(pvsearchlocation);
53 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
54 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
55 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
56 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
57
58 abort = false; calledHelp = false;
59
60 vector<string> tempOutNames;
61 outputTypes["list"] = tempOutNames;
62 outputTypes["sensspec"] = tempOutNames;
63 outputTypes["rabund"] = tempOutNames;
64 outputTypes["sabund"] = tempOutNames;
65 outputTypes["steps"] = tempOutNames;
66
67 vector<string> myArray;
68 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
69 return myArray;
70 }
71 catch(exception& e) {
72 m->errorOut(e, "ClusterCommand", "setParameters");
73 exit(1);
74 }
75 }
76 //**********************************************************************************************************************
getHelpString()77 string ClusterCommand::getHelpString(){
78 try {
79 string helpString = "";
80 helpString += "The cluster command parameter options are phylip, column, name, count, method, cutoff, precision, sim, showabund, timing, metric, iters, initialize. Fasta or Phylip or column and name are required.\n";
81 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
82 helpString += "The fasta parameter allows you to enter your fasta file for use with the agc or dgc methods. \n";
83 helpString += "The name parameter allows you to enter your name file. \n";
84 helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format.\n";
85 helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
86 helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
87 helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its own OTU, or oneotu meaning all sequences are assigned to one otu. Default=singleton.\n";
88 helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method (delta=0.0001). \n";
89 helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc, unique and opti. Default=opti. The agc and dgc methods require a fasta file.";
90 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
91 helpString += "The vsearch parameter allows you to specify the name and location of your vsearch executable if using agc or dgc clustering methods. By default mothur will look in your path, mothur's executable and mothur tools locations. You can set the vsearch location as follows, vsearch=/usr/bin/vsearch.\n";
92 helpString += "The cluster command should be in the following format: \n";
93 helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
94 return helpString;
95 }
96 catch(exception& e) {
97 m->errorOut(e, "ClusterCommand", "getHelpString");
98 exit(1);
99 }
100 }
101 //**********************************************************************************************************************
getOutputPattern(string type)102 string ClusterCommand::getOutputPattern(string type) {
103 try {
104 string pattern = "";
105
106 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
107 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
108 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
109 else if (type == "sensspec") { pattern = "[filename],[clustertag],sensspec"; }
110 else if (type == "steps") { pattern = "[filename],[clustertag],steps"; }
111 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
112
113 return pattern;
114 }
115 catch(exception& e) {
116 m->errorOut(e, "ClusterCommand", "getOutputPattern");
117 exit(1);
118 }
119 }
120 //**********************************************************************************************************************
121 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
ClusterCommand(string option)122 ClusterCommand::ClusterCommand(string option) : Command() {
123 try{
124 //allow user to run help
125 if(option == "help") { help(); abort = true; calledHelp = true; }
126 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
127 else if(option == "category") { abort = true; calledHelp = true; }
128
129 else {
130 OptionParser parser(option, setParameters());
131 map<string,string> parameters = parser.getParameters();
132
133 ValidParameters validParameter;
134
135
136 //check for required parameters
137 phylipfile = validParameter.validFile(parameters, "phylip");
138 if (phylipfile == "not open") { phylipfile = ""; abort = true; }
139 else if (phylipfile == "not found") { phylipfile = ""; }
140 else { distfile = phylipfile; format = "phylip"; current->setPhylipFile(phylipfile); }
141
142 columnfile = validParameter.validFile(parameters, "column");
143 if (columnfile == "not open") { columnfile = ""; abort = true; }
144 else if (columnfile == "not found") { columnfile = ""; }
145 else { distfile = columnfile; format = "column"; current->setColumnFile(columnfile); }
146
147 fastafile = validParameter.validFile(parameters, "fasta");
148 if (fastafile == "not open") { abort = true; }
149 else if (fastafile == "not found") { fastafile = ""; }
150 else { distfile = fastafile; format = "fasta"; current->setFastaFile(fastafile); }
151
152 namefile = validParameter.validFile(parameters, "name");
153 if (namefile == "not open") { abort = true; }
154 else if (namefile == "not found") { namefile = ""; }
155 else { current->setNameFile(namefile); }
156
157 countfile = validParameter.validFile(parameters, "count");
158 if (countfile == "not open") { abort = true; countfile = ""; }
159 else if (countfile == "not found") { countfile = ""; }
160 else { current->setCountFile(countfile); }
161
162 method = validParameter.valid(parameters, "method");
163 if (method == "not found") { method = "opti";}
164
165 vector<string> versionOutputs;
166 bool foundTool = false;
167 string path = current->getProgramPath();
168 string programName = "vsearch"; programName += EXECUTABLE_EXT;
169
170 vsearchLocation = validParameter.validPath(parameters, "vsearch");
171 if (vsearchLocation == "not found") {
172 vsearchLocation = "";
173 if ((method == "agc") || (method == "dgc")) {
174 foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
175 }
176 }
177 else {
178 if ((method == "agc") || (method == "dgc")) {
179 //test to make sure vsearch exists
180 ifstream in;
181 vsearchLocation = util.getFullPathName(vsearchLocation);
182 bool ableToOpen = util.openInputFile(vsearchLocation, in, "no error"); in.close();
183 if(!ableToOpen) {
184 m->mothurOut(vsearchLocation + " file does not exist or cannot be opened, ignoring.\n"); vsearchLocation = "";
185 programName = util.getSimpleName(vsearchLocation); vsearchLocation = "";
186 foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
187 }
188 }
189 }
190
191 if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc") || (method == "opti") || (method == "unique")) { }
192 else { m->mothurOut("[ERROR]: Not a valid clustering method. Valid clustering algorithms are furthest, nearest, average, weighted, agc, dgc, unique and opti.\n"); abort = true; }
193
194 if (method != "unique") {
195 if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
196 //is there are current file available for either of these?
197 //give priority to column, then phylip
198 columnfile = current->getColumnFile();
199 if (columnfile != "") { distfile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter.\n"); }
200 else {
201 phylipfile = current->getPhylipFile();
202 if (phylipfile != "") { distfile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter.\n"); }
203 else {
204 fastafile = current->getFastaFile();
205 if (fastafile != "") { distfile = fastafile; format = "fasta"; m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
206 else {
207 m->mothurOut("No valid current files. You must provide a phylip, column or fasta file before you can use the cluster command, unless using the unique method.\n");
208 abort = true;
209 }
210 }
211 }
212 }
213 else if (((phylipfile != "") && (columnfile != "")) || ((phylipfile != "") && (fastafile != "")) || ((fastafile != "") && (columnfile != ""))) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip, column or fasta.\n"); abort = true; }
214
215 if (columnfile != "") {
216 if ((namefile == "") && (countfile == "")){
217 namefile = current->getNameFile();
218 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter.\n"); }
219 else {
220 countfile = current->getCountFile();
221 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); }
222 else {
223 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format.\n");
224 abort = true;
225 }
226 }
227 }
228 }
229 if ((method != "agc") && (method != "dgc")) {
230 if ((columnfile == "") && (phylipfile == "")) {
231 m->mothurOut("[ERROR]: You must provide a distance file unless you are using the agc, dgc or unique clustering methods, aborting\n."); abort = true;
232 }
233 }
234 }else {
235 if ((countfile == "") && (namefile == "")) {
236 countfile = current->getCountFile();
237 if (countfile != "") { distfile = countfile; format = "count"; m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); }
238 else {
239 namefile = current->getNameFile();
240 if (namefile != "") { distfile = namefile; format = "name"; m->mothurOut("Using " + namefile + " as input file for the name parameter.\n"); }
241 else {
242 m->mothurOut("No valid current files. You must provide a count or name file before you can use the cluster command with the unique method.\n");
243 abort = true;
244 }
245 }
246
247 }
248 else if(countfile != "") { format = "count"; }
249 else if(namefile != "") { format = "name"; }
250 }
251 if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster command you must enter ONLY ONE of the following: count or name.\n"); abort = true; }
252
253 //check for optional parameter and set defaults
254 // ...at some point should added some additional type checking...
255 //get user cutoff and precision or use defaults
256 string temp;
257 temp = validParameter.valid(parameters, "precision");
258 if (temp == "not found") { temp = "100"; }
259 //saves precision legnth for formatting below
260 length = temp.length();
261 util.mothurConvert(temp, precision);
262
263 temp = validParameter.valid(parameters, "sim"); if (temp == "not found") { temp = "F"; }
264 sim = util.isTrue(temp);
265
266 temp = validParameter.valid(parameters, "delta"); if (temp == "not found") { temp = "0.0001"; }
267 util.mothurConvert(temp, stableMetric);
268
269 metricName = validParameter.valid(parameters, "metric"); if (metricName == "not found") { metricName = "mcc"; }
270
271 if ((metricName == "mcc") || (metricName == "sens") || (metricName == "spec") || (metricName == "tptn") || (metricName == "tp") || (metricName == "tn") || (metricName == "fp") || (metricName == "fn") || (metricName == "f1score") || (metricName == "accuracy") || (metricName == "ppv") || (metricName == "npv") || (metricName == "fdr") || (metricName == "fpfn") ){ }
272 else { m->mothurOut("[ERROR]: Not a valid metric. Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr.\n"); abort = true; }
273
274 initialize = validParameter.valid(parameters, "initialize"); if (initialize == "not found") { initialize = "singleton"; }
275
276 if ((initialize == "singleton") || (initialize == "oneotu")){ }
277 else { m->mothurOut("[ERROR]: Not a valid initialization. Valid initializations are singleton and oneotu.\n"); abort = true; }
278
279 temp = validParameter.valid(parameters, "iters"); if (temp == "not found") { temp = "100"; }
280 util.mothurConvert(temp, maxIters);
281
282 adjust=-1.0;
283
284 bool setProcessors = true;
285 temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ setProcessors=false; temp = current->getProcessors(); }
286 processors = current->setProcessors(temp);
287
288 if ((method == "agc") || (method == "dgc")) {
289 if (fastafile == "") { m->mothurOut("[ERROR]: You must provide a fasta file when using the agc or dgc clustering methods, aborting\n."); abort = true;}
290 }else if (setProcessors) {
291 m->mothurOut("[WARNING]: You can only use the processors option when using the agc or dgc clustering methods. Using 1 processor.\n.");
292 }
293
294 cutOffSet = false;
295 temp = validParameter.valid(parameters, "cutoff");
296 if (temp == "not found") { if ((method == "opti") || (method == "agc") || (method == "dgc")) { temp = "0.03"; }else { temp = "0.15"; } }
297 else { cutOffSet = true; }
298 int pos = temp.find('-');
299 if (pos != string::npos) { //multiple cutoffs given
300 if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted")) {
301 m->mothurOut("[WARNING]: Multiple cutoffs can only be specified when using the agc, dgc or opti method. Using 0.15. \n.");
302 cutOffSet = false; temp = "0.15";
303 }else { util.splitAtDash(temp, cutoffs); temp = *cutoffs.begin(); }
304 }else { cutoffs.insert(temp); }
305 util.mothurConvert(temp, cutoff);
306
307 showabund = validParameter.valid(parameters, "showabund");
308 if (showabund == "not found") { showabund = "T"; }
309
310 timing = validParameter.valid(parameters, "timing");
311 if (timing == "not found") { timing = "F"; }
312 }
313 }
314 catch(exception& e) {
315 m->errorOut(e, "ClusterCommand", "ClusterCommand");
316 exit(1);
317 }
318 }
319 //**********************************************************************************************************************
~ClusterCommand()320 ClusterCommand::~ClusterCommand(){}
321 //**********************************************************************************************************************
322
execute()323 int ClusterCommand::execute(){
324 try {
325
326 if (abort) { if (calledHelp) { return 0; } return 2; }
327
328 //phylip file given and cutoff not given - use cluster.classic because it uses less memory and is faster
329 if ((format == "phylip") && (!cutOffSet) && (method != "opti")) {
330 m->mothurOut("\nYou are using a phylip file and no cutoff. I will run cluster.classic to save memory and time.\n");
331
332 //run unique.seqs for deconvolute results
333 string inputString = "phylip=" + distfile;
334 if (namefile != "") { inputString += ", name=" + namefile; }
335 else if (countfile != "") { inputString += ", count=" + countfile; }
336 inputString += ", precision=" + toString(precision);
337 inputString += ", method=" + method;
338 if (sim) { inputString += ", sim=T"; }
339 else { inputString += ", sim=F"; }
340
341 m->mothurOut("\n/------------------------------------------------------------/\n");
342 m->mothurOut("Running command: cluster.classic(" + inputString + ")\n");
343
344 Command* clusterClassicCommand = new ClusterDoturCommand(inputString);
345 clusterClassicCommand->execute();
346 delete clusterClassicCommand;
347
348 m->mothurOut("/------------------------------------------------------------/\n");
349
350 return 0;
351 }
352
353 time_t estart = time(NULL);
354
355 if (format == "fasta") { runVsearchCluster(); }
356 else if (method == "opti") { runOptiCluster(); }
357 else if (method == "unique") { runUniqueCluster(); }
358 else { runMothurCluster(); }
359
360 if (m->getControl_pressed()) { for (int j = 0; j < outputNames.size(); j++) { util.mothurRemove(outputNames[j]); } return 0; }
361
362 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster\n");
363
364 //set list file as new current listfile
365 string currentName = "";
366 itTypes = outputTypes.find("list");
367 if (itTypes != outputTypes.end()) {
368 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
369 }
370
371 //set rabund file as new current rabundfile
372 itTypes = outputTypes.find("rabund");
373 if (itTypes != outputTypes.end()) {
374 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setRabundFile(currentName); }
375 }
376
377 //set sabund file as new current sabundfile
378 itTypes = outputTypes.find("sabund");
379 if (itTypes != outputTypes.end()) {
380 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setSabundFile(currentName); }
381 }
382
383 m->mothurOut("\nOutput File Names: \n");
384 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
385
386 return 0;
387 }
388 catch(exception& e) {
389 m->errorOut(e, "ClusterCommand", "execute");
390 exit(1);
391 }
392 }
393 //**********************************************************************************************************************
394
runVsearchCluster()395 int ClusterCommand::runVsearchCluster(){
396 try {
397 string vsearchFastafile = ""; VsearchFileParser* vParse;
398 if ((namefile == "") && (countfile == "")) { vParse = new VsearchFileParser(fastafile); }
399 else if (namefile != "") { vParse = new VsearchFileParser(fastafile, namefile, "name"); }
400 else if (countfile != "") { vParse = new VsearchFileParser(fastafile, countfile, "count"); }
401 else { m->mothurOut("[ERROR]: Opps, should never get here. ClusterCommand::runVsearchCluster() \n"); m->setControl_pressed(true); return 0; }
402
403 if (m->getControl_pressed()) { delete vParse; return 0; }
404
405 vsearchFastafile = vParse->getVsearchFile();
406
407 if (cutoff > 1.0) { m->mothurOut("You did not set a cutoff, using 0.03.\n"); cutoff = 0.03; }
408
409 map<string, int> counts;
410 map<string, string> variables;
411 if (outputdir == "") { outputdir += util.hasPath(distfile); }
412 fileroot = outputdir + util.getRootName(util.getSimpleName(distfile)); tag = method;
413
414 variables["[filename]"] = fileroot;
415 variables["[clustertag]"] = tag;
416 string listFileName = getOutputFileName("list", variables);
417 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
418
419 ofstream out;
420 util.openOutputFile(listFileName, out);
421 bool printHeaders = true;
422
423 for (set<string>::iterator it = cutoffs.begin(); it != cutoffs.end(); it++) {
424
425 m->mothurOut("\n" + *it + "\n");
426 util.mothurConvert(*it, cutoff);
427
428 //Run vsearch
429 string ucVsearchFile = util.getSimpleName(vsearchFastafile) + ".clustered.uc";
430 string logfile = util.getSimpleName(vsearchFastafile) + ".clustered.log";
431 vsearchDriver(vsearchFastafile, ucVsearchFile, logfile);
432
433 if (m->getControl_pressed()) { break; }
434
435 //Convert outputted *.uc file into a list file
436 ListVector list = vParse->createListFile(ucVsearchFile, vParse->getNumBins(logfile), toString(1.0-cutoff), counts);
437
438 if (printHeaders) {
439 printHeaders = false;
440 }else { list.setPrintedLabels(printHeaders); }
441
442 if (countfile != "") { list.print(out, counts); }
443 else { list.print(out); }
444
445 //remove temp files
446 util.mothurRemove(ucVsearchFile); util.mothurRemove(logfile);
447
448 }
449 out.close();
450 util.mothurRemove(vsearchFastafile); delete vParse;
451 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
452
453 return 0;
454 }
455 catch(exception& e) {
456 m->errorOut(e, "ClusterCommand", "runVsearchCluster");
457 exit(1);
458 }
459 }
460 //**********************************************************************************************************************
461
vsearchDriver(string inputFile,string ucClusteredFile,string logfile)462 int ClusterCommand::vsearchDriver(string inputFile, string ucClusteredFile, string logfile){
463 try {
464
465 //vsearch --maxaccepts 16 --usersort --id 0.97 --minseqlength 30 --wordlength 8 --uc $ROOT.clustered.uc --cluster_smallmem $ROOT.sorted.fna --maxrejects 64 --strand both --log $ROOT.clustered.log --sizeorder
466
467
468 //no sizeorder for dgc
469
470 ucClusteredFile = util.getFullPathName(ucClusteredFile);
471 inputFile = util.getFullPathName(inputFile);
472 logfile = util.getFullPathName(logfile);
473
474 //to allow for spaces in the path
475 ucClusteredFile = "\"" + ucClusteredFile + "\"";
476 inputFile = "\"" + inputFile + "\"";
477 logfile = "\"" + logfile + "\"";
478
479 vector<char*> cPara;
480
481 string vsearchCommand = vsearchLocation;
482 vsearchCommand = "\"" + vsearchCommand + "\" ";
483
484 vector<char*> vsearchParameters;
485 vsearchParameters.push_back(util.mothurConvert(vsearchCommand));
486
487 //--maxaccepts=16
488 vsearchParameters.push_back(util.mothurConvert("--maxaccepts=16"));
489
490 //--threads=1
491 string processorsString = "--threads=" + toString(processors);
492 vsearchParameters.push_back(util.mothurConvert(processorsString));
493
494 //--usersort
495 vsearchParameters.push_back(util.mothurConvert("--usersort"));
496
497 //--id=0.97
498 cutoff = abs(1.0 - cutoff); string cutoffString = toString(cutoff);
499 if (cutoffString.length() > 4) { cutoffString = cutoffString.substr(0, 4); }
500 else if (cutoffString.length() < 4) { for (int i = cutoffString.length(); i < 4; i++) { cutoffString += "0"; } }
501
502 cutoffString = "--id=" + cutoffString;
503 vsearchParameters.push_back(util.mothurConvert(cutoffString));
504
505 //--minseqlength=30
506 vsearchParameters.push_back(util.mothurConvert("--minseqlength=30"));
507
508 //--wordlength=8
509 vsearchParameters.push_back(util.mothurConvert("--wordlength=8"));
510
511 //--uc=$ROOT.clustered.uc
512 string tempIn = "--uc=" + ucClusteredFile;
513 vsearchParameters.push_back(util.mothurConvert(tempIn));
514
515 //--cluster_smallmem $ROOT.sorted.fna
516 string tempSorted = "--cluster_smallmem=" + inputFile;
517 vsearchParameters.push_back(util.mothurConvert(tempSorted));
518
519 //--maxrejects=64
520 vsearchParameters.push_back(util.mothurConvert("--maxrejects=64"));
521
522 //--strand=both
523 vsearchParameters.push_back(util.mothurConvert("--strand=both"));
524
525 //--log=$ROOT.clustered.log
526 string tempLog = "--log=" + logfile;
527 vsearchParameters.push_back(util.mothurConvert(tempLog));
528
529 if (method == "agc") { //--sizeorder
530 vsearchParameters.push_back(util.mothurConvert("--sizeorder"));
531 }
532
533 if (m->getDebug()) { m->mothurOut("[DEBUG]: "); for(int i = 0; i < vsearchParameters.size(); i++) { m->mothurOut(toString(vsearchParameters[i]) + "\t"); } m->mothurOut("\n"); }
534
535 string commandString = "";
536 for (int i = 0; i < vsearchParameters.size(); i++) { commandString += toString(vsearchParameters[i]) + " "; }
537
538 #if defined NON_WINDOWS
539 #else
540 commandString = "\"" + commandString + "\"";
541 #endif
542 if (m->getDebug()) { m->mothurOut("[DEBUG]: vsearch cluster command = " + commandString + ".\n"); }
543 system(commandString.c_str());
544
545 //free memory
546 for(int i = 0; i < vsearchParameters.size(); i++) { delete vsearchParameters[i]; }
547
548 //remove "" from filenames
549 ucClusteredFile = ucClusteredFile.substr(1, ucClusteredFile.length()-2);
550 inputFile = inputFile.substr(1, inputFile.length()-2);
551 logfile = logfile.substr(1, logfile.length()-2);
552
553 return 0;
554 }
555 catch(exception& e) {
556 m->errorOut(e, "ClusterCommand", "vsearchDriver");
557 exit(1);
558 }
559 }
560 //**********************************************************************************************************************
561
runMothurCluster()562 int ClusterCommand::runMothurCluster(){
563 try {
564
565 ReadMatrix* read;
566 if (format == "column") { read = new ReadColumnMatrix(columnfile, sim); } //sim indicates whether its a similarity matrix
567 else if (format == "phylip") { read = new ReadPhylipMatrix(phylipfile, sim); }
568 else { m->setControl_pressed(true); return 0; }
569
570 read->setCutoff(cutoff);
571
572 NameAssignment* nameMap = NULL;
573 CountTable* ct = NULL;
574 map<string, int> counts;
575 if(namefile != ""){
576 nameMap = new NameAssignment(namefile);
577 nameMap->readMap();
578 read->read(nameMap);
579 }else if (countfile != "") {
580 ct = new CountTable();
581 ct->readTable(countfile, false, false);
582 read->read(ct);
583 counts = ct->getNameMap();
584 }else { read->read(nameMap); }
585
586 list = read->getListVector();
587 matrix = read->getDMatrix();
588
589 if(countfile != "") {
590 rabund = new RAbundVector();
591 createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
592 delete ct;
593 }else { rabund = new RAbundVector(list->getRAbundVector()); }
594 delete read;
595
596 if (m->getControl_pressed()) { //clean up
597 delete list; delete matrix; delete rabund; if(countfile == ""){rabundFile.close(); sabundFile.close(); util.mothurRemove((fileroot+ tag + ".rabund")); util.mothurRemove((fileroot+ tag + ".sabund")); }
598 listFile.close(); util.mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
599 }
600
601 //create cluster
602 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
603 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
604 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
605 else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust); }
606 tag = cluster->getTag();
607
608 if (outputdir == "") { outputdir += util.hasPath(distfile); }
609 fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
610
611 map<string, string> variables;
612 variables["[filename]"] = fileroot;
613 variables["[clustertag]"] = tag;
614 string sabundFileName = getOutputFileName("sabund", variables);
615 string rabundFileName = getOutputFileName("rabund", variables);
616 //if (countfile != "") { variables["[tag2]"] = "unique_list"; }
617 string listFileName = getOutputFileName("list", variables);
618
619 if (countfile == "") {
620 util.openOutputFile(sabundFileName, sabundFile);
621 util.openOutputFile(rabundFileName, rabundFile);
622 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
623 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
624
625 }
626 util.openOutputFile(listFileName, listFile);
627 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
628
629 float previousDist = 0.00000;
630 float rndPreviousDist = 0.00000;
631 oldRAbund = *rabund;
632 oldList = *list;
633
634 print_start = true;
635 start = time(NULL);
636 loops = 0;
637 double saveCutoff = cutoff;
638 bool printHeaders = true;
639
640 while ((matrix->getSmallDist() <= cutoff) && (matrix->getNNodes() > 0)){
641
642 if (m->getControl_pressed()) { //clean up
643 delete list; delete matrix; delete rabund; delete cluster;
644 if(countfile == "") {rabundFile.close(); sabundFile.close(); util.mothurRemove((fileroot+ tag + ".rabund")); util.mothurRemove((fileroot+ tag + ".sabund")); }
645 listFile.close(); util.mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0;
646 }
647
648 if (print_start && util.isTrue(timing)) {
649 m->mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/"
650 + toString(util.roundDist(matrix->getSmallDist(), precision))
651 + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
652 cout.flush();
653 print_start = false;
654 }
655
656 cluster->update(cutoff);
657
658 float dist = matrix->getSmallDist();
659 float rndDist = util.ceilDist(dist, precision);
660
661 if(previousDist <= 0.0000 && !util.isEqual(dist, previousDist)) { printData("unique", counts, printHeaders); }
662 else if(!util.isEqual(rndDist, rndPreviousDist)) { printData(toString(rndPreviousDist, length-1), counts, printHeaders); }
663
664 previousDist = dist;
665 rndPreviousDist = rndDist;
666 oldRAbund = *rabund;
667 oldList = *list;
668 }
669
670 if (print_start && util.isTrue(timing)) {
671 m->mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
672 + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
673 cout.flush();
674 print_start = false;
675 }
676
677 if(previousDist <= 0.0000) { printData("unique", counts, printHeaders); }
678 else if(rndPreviousDist<cutoff) { printData(toString(rndPreviousDist, length-1), counts, printHeaders); }
679
680 delete matrix;
681 delete list;
682 delete rabund;
683 delete cluster;
684 if (countfile == "") {
685 sabundFile.close();
686 rabundFile.close();
687 }
688 listFile.close();
689
690 if (!util.isEqual(saveCutoff, cutoff)) {
691 saveCutoff = util.ceilDist(saveCutoff, precision);
692 m->mothurOut("changed cutoff to " + toString(cutoff)+"\n");
693 }
694
695 return 0;
696 }
697 catch(exception& e) {
698 m->errorOut(e, "ClusterCommand", "runMothurCluster");
699 exit(1);
700 }
701 }
702 //**********************************************************************************************************************
703
printData(string label,map<string,int> & counts,bool & ph)704 void ClusterCommand::printData(string label, map<string, int>& counts, bool& ph){
705 try {
706 oldList.setPrintedLabels(ph); ph = false;
707
708 if (util.isTrue(timing)) {
709 m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins())
710 + "\tclusters. Updates: " + toString(loops)+"\n");
711 }
712 print_start = true;
713 loops = 0;
714 start = time(NULL);
715
716 oldRAbund.setLabel(label);
717 if (countfile == "") {
718 oldRAbund.print(rabundFile);
719 oldRAbund.getSAbundVector().print(sabundFile);
720 }
721
722 if (util.isTrue(showabund)) {
723 oldRAbund.getSAbundVector().print(cout);
724 }
725
726 oldList.setLabel(label);
727 if(countfile != "") {
728 oldList.print(listFile, counts);
729 }else {
730 oldList.print(listFile);
731 }
732
733 }
734 catch(exception& e) {
735 m->errorOut(e, "ClusterCommand", "printData");
736 exit(1);
737 }
738
739
740 }
741 //**********************************************************************************************************************
742
createRabund(CountTable * & ct,ListVector * & list,RAbundVector * & rabund)743 int ClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
744 try {
745 rabund->setLabel(list->getLabel());
746 for(int i = 0; i < list->getNumBins(); i++) {
747 if (m->getControl_pressed()) { break; }
748 vector<string> binNames;
749 string bin = list->get(i);
750 util.splitAtComma(bin, binNames);
751 int total = 0;
752 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
753 rabund->push_back(total);
754 }
755 return 0;
756 }
757 catch(exception& e) {
758 m->errorOut(e, "ClusterCommand", "createRabund");
759 exit(1);
760 }
761
762 }
763 //**********************************************************************************************************************
764
runOptiCluster()765 int ClusterCommand::runOptiCluster(){
766 try {
767 if (!cutOffSet) { m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); cutoff = 0.03; }
768
769 m->mothurOut("\nClustering " + distfile+"\n");
770
771 ClusterMetric* metric = NULL;
772 if (metricName == "mcc") { metric = new MCC(); }
773 else if (metricName == "sens") { metric = new Sensitivity(); }
774 else if (metricName == "spec") { metric = new Specificity(); }
775 else if (metricName == "tptn") { metric = new TPTN(); }
776 else if (metricName == "tp") { metric = new TP(); }
777 else if (metricName == "tn") { metric = new TN(); }
778 else if (metricName == "fp") { metric = new FP(); }
779 else if (metricName == "fn") { metric = new FN(); }
780 else if (metricName == "f1score") { metric = new F1Score(); }
781 else if (metricName == "accuracy") { metric = new Accuracy(); }
782 else if (metricName == "ppv") { metric = new PPV(); }
783 else if (metricName == "npv") { metric = new NPV(); }
784 else if (metricName == "fdr") { metric = new FDR(); }
785 else if (metricName == "fpfn") { metric = new FPFN(); }
786 else { return 0; }
787
788 string nameOrCount = "";
789 string thisNamefile = "";
790 map<string, int> counts;
791 if (countfile != "") { nameOrCount = "count"; thisNamefile = countfile; CountTable ct; ct.readTable(countfile, false, false); counts = ct.getNameMap(); }
792 else if (namefile != "") { nameOrCount = "name"; thisNamefile = namefile; }
793
794 string distfile = columnfile;
795 if (format == "phylip") { distfile = phylipfile; }
796
797 if (outputdir == "") { outputdir += util.hasPath(distfile); }
798 fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
799 tag = "opti_" + metric->getName();
800
801 string listFileName = fileroot+ tag + ".list";
802
803 ofstream listFile;
804 util.openOutputFile(listFileName, listFile);
805 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
806
807 map<string, string> variables;
808 variables["[filename]"] = fileroot;
809 variables["[clustertag]"] = tag;
810 string outputName = getOutputFileName("steps", variables);
811 outputNames.push_back(outputName); outputTypes["steps"].push_back(outputName);
812 ofstream outStep;
813 util.openOutputFile(outputName, outStep);
814
815 string sensspecFilename = fileroot+ tag + ".sensspec";
816 ofstream sensFile;
817 util.openOutputFile(sensspecFilename, sensFile);
818 outputNames.push_back(sensspecFilename); outputTypes["sensspec"].push_back(sensspecFilename);
819
820 sensFile << "label\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
821 m->mothurOut("\n\niter\ttime\tlabel\tnum_otus\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n");
822 outStep << "iter\ttime\tlabel\tnum_otus\tcutoff\ttp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n";
823
824 bool printHeaders = true;
825 for (set<string>::iterator it = cutoffs.begin(); it != cutoffs.end(); it++) {
826
827 m->mothurOut("\n" + *it + "\n");
828 util.mothurConvert(*it, cutoff);
829
830 OptiData* matrix = new OptiMatrix(distfile, thisNamefile, nameOrCount, format, cutoff, false);
831
832 OptiCluster cluster(matrix, metric, 0);
833
834 int iters = 0;
835 double listVectorMetric = 0; //worst state
836 double delta = 1;
837
838 cluster.initialize(listVectorMetric, true, initialize);
839
840 long long numBins = cluster.getNumBins();
841 double tp, tn, fp, fn;
842 vector<double> results = cluster.getStats(tp, tn, fp, fn);
843 m->mothurOut("0\t0\t" + toString(cutoff) + "\t" + toString(numBins) + "\t"+ toString(cutoff) + "\t" + toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
844 outStep << "0\t0\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t" << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
845 for (int i = 0; i < results.size(); i++) { m->mothurOut(toString(results[i]) + "\t"); outStep << results[i] << "\t"; }
846 m->mothurOutEndLine();
847 outStep << endl;
848
849 while ((delta > stableMetric) && (iters < maxIters)) {
850
851 long start = time(NULL);
852
853 if (m->getControl_pressed()) { break; }
854 double oldMetric = listVectorMetric;
855
856 cluster.update(listVectorMetric);
857
858 delta = abs(oldMetric - listVectorMetric);
859 iters++;
860
861 results = cluster.getStats(tp, tn, fp, fn);
862 numBins = cluster.getNumBins();
863
864 m->mothurOut(toString(iters) + "\t" + toString(time(NULL) - start) + "\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t"+ toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
865 outStep << (toString(iters) + "\t" + toString(time(NULL) - start) + "\t" + toString(cutoff) + "\t" + toString(numBins) + "\t" + toString(cutoff) + "\t") << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
866 for (int i = 0; i < results.size(); i++) { m->mothurOut(toString(results[i]) + "\t"); outStep << results[i] << "\t"; }
867 m->mothurOutEndLine(); outStep << endl;
868 }
869 m->mothurOutEndLine(); m->mothurOutEndLine();
870
871 if (m->getControl_pressed()) { delete matrix; delete metric; metric = NULL; return 0; }
872
873 ListVector* list = cluster.getList();
874 list->setLabel(toString(cutoff));
875
876 if (printHeaders) { //only print headers the first time
877 printHeaders = false;
878 }else { list->setPrintedLabels(printHeaders); }
879
880 if(countfile != "") { list->print(listFile, counts); }
881 else { list->print(listFile); }
882 delete list;
883
884 results = cluster.getStats(tp, tn, fp, fn);
885
886 sensFile << cutoff << '\t' << cutoff << '\t' << tp << '\t' << tn << '\t' << fp << '\t' << fn << '\t';
887 for (int i = 0; i < results.size(); i++) { sensFile << results[i] << '\t'; } sensFile << '\n';
888
889
890 delete matrix;
891 }
892
893 listFile.close();
894 sensFile.close();
895 outStep.close();
896
897 return 0;
898
899 }
900 catch(exception& e) {
901 m->errorOut(e, "ClusterCommand", "runOptiCluster");
902 exit(1);
903 }
904
905 }
906 //**********************************************************************************************************************
907
runUniqueCluster()908 int ClusterCommand::runUniqueCluster(){
909 try {
910 if (countfile != "") { distfile = countfile; }
911 else if (namefile != "") { distfile = namefile; }
912
913 m->mothurOut("\nClustering " + distfile+"\n");
914
915 ListVector list; list.setLabel("ASV");
916
917 map<string, int> counts;
918 if (countfile != "") {
919 CountTable ct; ct.readTable(countfile, false, false); counts = ct.getNameMap();
920 for (map<string, int>::iterator it = counts.begin(); it != counts.end(); it++) {
921 if (m->getControl_pressed()) { return 0; }
922 list.push_back(it->first);
923 }
924 }else {
925 map<string, string> nameMap;
926 util.readNames(namefile, nameMap);
927 for (map<string, string>::iterator it = nameMap.begin(); it != nameMap.end(); it++) {
928 if (m->getControl_pressed()) { return 0; }
929 list.push_back(it->second);
930 }
931 }
932
933 if (outputdir == "") { outputdir += util.hasPath(distfile); }
934 fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
935
936 tag = "unique";
937 string listFileName = fileroot+ tag + ".list";
938
939 ofstream listFile;
940 util.openOutputFile(listFileName, listFile);
941 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
942
943 if(countfile != "") { list.print(listFile, counts); }
944 else { list.print(listFile); }
945 listFile.close();
946
947 map<string, string> variables;
948 variables["[filename]"] = fileroot;
949 variables["[clustertag]"] = tag;
950 string sabundFileName = getOutputFileName("sabund", variables);
951 string rabundFileName = getOutputFileName("rabund", variables);
952
953 if (countfile == "") {
954 util.openOutputFile(sabundFileName, sabundFile);
955 util.openOutputFile(rabundFileName, rabundFile);
956 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
957 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
958
959 SAbundVector sabund = list.getSAbundVector();
960 sabund.print(sabundFile);
961 sabundFile.close();
962
963 RAbundVector rabund = list.getRAbundVector();
964 rabund.print(rabundFile);
965 rabundFile.close();
966 }
967
968 return 0;
969 }
970 catch(exception& e) {
971 m->errorOut(e, "ClusterCommand", "runUniqueCluster");
972 exit(1);
973 }
974
975 }
976 //**********************************************************************************************************************
977