1 /*
2  *  clustersplitcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/19/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "clustersplitcommand.h"
11 #include "systemcommand.h"
12 #include "sensspeccommand.h"
13 #include "mcc.hpp"
14 #include "sensitivity.hpp"
15 #include "specificity.hpp"
16 #include "fdr.hpp"
17 #include "npv.hpp"
18 #include "ppv.hpp"
19 #include "f1score.hpp"
20 #include "tp.hpp"
21 #include "fp.hpp"
22 #include "fpfn.hpp"
23 #include "tptn.hpp"
24 #include "tn.hpp"
25 #include "fn.hpp"
26 #include "accuracy.hpp"
27 
28 //**********************************************************************************************************************
setParameters()29 vector<string> ClusterSplitCommand::setParameters(){
30 	try {
31         CommandParameter pfile("file", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","",false,false,true); parameters.push_back(pfile);
32 		CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
33 		CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
34 		CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
35         CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
36 		CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
37 		CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
38         CommandParameter prunspenspec("runsensspec", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prunspenspec);
39         CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
40 		CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
41 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
42 		CommandParameter pcutoff("cutoff", "Number", "", "0.03", "", "", "","",false,false,true); parameters.push_back(pcutoff);
43         CommandParameter pmetriccutoff("delta", "Number", "", "0.0001", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
44         CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
45         CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
46         CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
47         CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti", "opti", "", "", "","",false,false,true); parameters.push_back(pmethod);
48         CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
49        CommandParameter pdist("dist", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdist);
50         CommandParameter pislist("islist", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pislist);
51         CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
52         CommandParameter pvsearchlocation("vsearch", "String", "", "", "", "", "","",false,false); parameters.push_back(pvsearchlocation);
53 		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
54         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
55 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
56 
57         abort = false; calledHelp = false;
58 
59         vector<string> tempOutNames;
60         outputTypes["list"] = tempOutNames;
61         outputTypes["rabund"] = tempOutNames;
62         outputTypes["sabund"] = tempOutNames;
63         outputTypes["column"] = tempOutNames;
64         outputTypes["name"] = tempOutNames;
65         outputTypes["file"] = tempOutNames;
66         outputTypes["sensspec"] = tempOutNames;
67 
68 		vector<string> myArray;
69 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
70 		return myArray;
71 	}
72 	catch(exception& e) {
73 		m->errorOut(e, "ClusterSplitCommand", "setParameters");
74 		exit(1);
75 	}
76 }
77 //**********************************************************************************************************************
getHelpString()78 string ClusterSplitCommand::getHelpString(){
79 	try {
80 		string helpString = "";
81 		helpString += "The cluster.split command parameter options are file, fasta, name, count, cutoff, precision, method, taxonomy, taxlevel, showabund, timing, cluster, iters, delta, initialize, dist, processors, runsensspec. Fasta or file are required.\n";
82 		helpString += "The cluster.split command splits your files by classification using a fasta file to generate distance matrices for each taxonomic group. \n";
83         helpString += "The file option allows you to enter your file containing your list of column and names/count files as well as the singleton file.  This file is mothur generated, when you run cluster.split() with the cluster=f parameter.  This can be helpful when you have a large dataset that you may be able to use all your processors for the splitting step, but have to reduce them for the cluster step due to RAM constraints. For example: cluster.split(fasta=yourFasta, taxonomy=yourTax, count=yourCount, taxlevel=3, cluster=f, processors=8) then cluster.split(file=yourFile, processors=4).  This allows your to maximize your processors during the splitting step.  Also, if you are unsure if the cluster step will have RAM issue with multiple processors, you can avoid running the first part of the command multiple times.\n";
84 		helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
85 		helpString += "The name parameter allows you to enter your name file. \n";
86         helpString += "The count parameter allows you to enter your count file.\n";
87         helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences. This is required unless you are running the command with the file option. \n";
88         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the dataset into taxanomic matrices, default=t";
89         helpString += "The dist parameter allows you to indicate whether you want a column formatted distance matrix outputted along with the list file. Default=F.";
90 		helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.03. \n";
91 		helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
92         helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
93         helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
94         helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method. Default=0.0001\n";
95         helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its own OTU, or oneotu meaning all sequences are assigned to one otu. Default=singleton.\n";
96         helpString += "The runsensspec parameter allows to run the sens.spec command on the completed list file. Default=true.\n";
97 		helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc and opti. Default=opti.  The agc and dgc methods require a fasta file.";
98         helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the dataset, default=3.\n";
99         helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. Default=f.\n";
100         helpString += "The processors parameter allows you to specify the number of processors to use. The default is all available.\n";
101          helpString += "The vsearch parameter allows you to specify the name and location of your vsearch executable if using agc or dgc clustering methods. By default mothur will look in your path, mothur's executable and mothur tools locations.  You can set the vsearch location as follows, vsearch=/usr/bin/vsearch.\n";
102 		helpString += "The cluster.split command should be in the following format: \n";
103 		helpString += "cluster.split(fasta=yourFastaFile, count=yourCountFile, method=yourMethod, cutoff=yourCutoff, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
104 		return helpString;
105 	}
106 	catch(exception& e) {
107 		m->errorOut(e, "ClusterSplitCommand", "getHelpString");
108 		exit(1);
109 	}
110 }
111 //**********************************************************************************************************************
getOutputPattern(string type)112 string ClusterSplitCommand::getOutputPattern(string type) {
113     try {
114         string pattern = "";
115 
116         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
117         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; }
118         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
119         else if (type == "sensspec") {  pattern = "[filename],[clustertag],sensspec"; }
120         else if (type == "column") {  pattern = "[filename],dist"; }
121         else if (type == "file")   {  pattern = "[filename],file"; }
122         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
123 
124         return pattern;
125     }
126     catch(exception& e) {
127         m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
128         exit(1);
129     }
130 }
131 //**********************************************************************************************************************
getCommonQuestions()132 string ClusterSplitCommand::getCommonQuestions(){
133     try {
134         vector<string> questions, issues, qanswers, ianswers, howtos, hanswers;
135 
136         string issue = "Cluster.split crashes after merging individual list files. What do I do?"; issues.push_back(issue);
137         string ianswer = "\tAfter merging the split list files, mothur runs the sens.spec command on the entire dataset. The entire dataset's distance matrix may be too large to fit in memory, which causes the crash. You can skip this step by setting the runsensspec parameter to false. Skipping the sens.spec analysis does not effect the OTU assignment, and you can run the sens.spec analysis separately using the sens.spec command. \n"; ianswers.push_back(ianswer);
138 
139         issue = "Cluster.split crashes while reading the split distance matrices. What should I do?"; issues.push_back(issue);
140         ianswer = "\tThe command is crashing because the distance matrices are too large to fit into memory. Why do I have such a large distance matrix? This is most often caused by poor overlap of your reads. When reads have poor overlap, it greatly increases your error rate. Also, sequences that should cluster together don't because the errors appear to be genetic differences when in fact they are not. The quality of the data you are processing can not be overstressed. Error filled reads produce error filled results. To take a step back, if you look through our MiSeq SOP, you’ll see that we go to great pains to only work with the unique sequences to limit the number of sequences we have to align, screen for chimeras, classify, etc. We all know that 20 million reads will never make it through the pipeline without setting your computer on fire. Returning to the question at hand, you can imagine that if the reads do not fully overlap then any error in the 5’ end of the first read will be uncorrected by the 3’ end of the second read. If we assume for now that the errors are random, then every error will generate a new unique sequence. Granted, this happens less than 1% of the time, but multiply that by 20 million reads at whatever length you choose and you’ve got a big number. Viola, a bunch of unique reads and a ginormous distance matrix. \n"; ianswers.push_back(ianswer);
141 
142         string howto = "How do I cluster my sequences into OTUs at distance 0.03?"; howtos.push_back(howto);
143         string hanswer = "\tBy default the cluster.split command will use the opti method to cluster to 0.03. To find OTUs at a different distance set the cutoff parameter. ie. cutoff=0.01 will assemble OTUs for distance 0.01.\n"; hanswers.push_back(hanswer);
144 
145         string commonQuestions = util.getFormattedHelp(questions, qanswers, issues, ianswers, howtos, hanswers);
146 
147         return commonQuestions;
148     }
149     catch(exception& e) {
150         m->errorOut(e, "ClusterSplitCommand", "getCommonQuestions");
151         exit(1);
152     }
153 }
154 //**********************************************************************************************************************
155 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
ClusterSplitCommand(string option)156 ClusterSplitCommand::ClusterSplitCommand(string option) : Command()  {
157 	try{
158 		format = "";
159 
160 		//allow user to run help
161 		if(option == "help") { help(); abort = true; calledHelp = true; }
162 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
163         else if(option == "category") {  abort = true; calledHelp = true;  }
164 
165 		else {
166 			OptionParser parser(option, setParameters());
167 			map<string,string> parameters = parser.getParameters();
168 
169 			ValidParameters validParameter;
170 
171 			//check for required parameters
172 			file = validParameter.validFile(parameters, "file");
173 			if (file == "not open") { file = ""; abort = true; }
174 			else if (file == "not found") { file = ""; }
175             else { distfile = file; type = ""; }
176 
177 			namefile = validParameter.validFile(parameters, "name");
178 			if (namefile == "not open") { abort = true; namefile = "";}
179 			else if (namefile == "not found") { namefile = "";  }
180             else { current->setNameFile(namefile); type = "name"; }
181 
182             countfile = validParameter.validFile(parameters, "count");
183 			if (countfile == "not open") { abort = true; countfile = "";}
184 			else if (countfile == "not found") { countfile = "";  }
185             else { current->setCountFile(countfile); type = "count"; }
186 
187 			fastafile = validParameter.validFile(parameters, "fasta");
188 			if (fastafile == "not open") { abort = true; }
189 			else if (fastafile == "not found") { fastafile = ""; }
190 			else { distfile = fastafile;  current->setFastaFile(fastafile); }
191 
192 			taxFile = validParameter.validFile(parameters, "taxonomy");
193 			if (taxFile == "not open") { taxFile = ""; abort = true; }
194 			else if (taxFile == "not found") { taxFile = ""; }
195 			else { current->setTaxonomyFile(taxFile); }
196 
197             if ((fastafile == "") && (file == "")) {
198                 fastafile = current->getFastaFile();
199                 if (fastafile != "") {   m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n");  }
200                 else {
201                     m->mothurOut("[ERROR]: No valid current files. When executing a cluster.split command you must enter a file file or fastafile.\n"); abort = true;
202                 }
203             }
204 			else if ((fastafile != "") && (file != "")) { m->mothurOut("[ERROR]: When executing a cluster.split command you must enter ONLY ONE of the following: file or fasta.\n");  abort = true; }
205 
206             if ((countfile != "") && (namefile != "")) { m->mothurOut("[ERROR]: When executing a cluster.split command you must enter ONLY ONE of the following: count or name.\n");  abort = true; }
207 
208             if (file != "") {
209                 if ((namefile == "") && (countfile == "")) {
210                     m->mothurOut("\n[WARNING]: When using the file option, it is recommended you include the name or count file. Doing so will ensure the OTUs are printed by OTU size reflecting the redundant reads, instead of just the unique reads.\n");
211                 }
212             }
213 
214 			if (fastafile != "") {
215 				if (taxFile == "") {
216 					taxFile = current->getTaxonomyFile();
217 					if (taxFile != "") {  m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter.\n");  }
218 					else {
219 						m->mothurOut("[ERROR]: You need to provide a taxonomy file if you are if you are using a fasta file to generate the split.\n");
220 						abort = true;
221 					}
222 				}
223 
224 				if ((namefile == "") && (countfile == "")) {
225 					namefile = current->getNameFile();
226                     if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter.\n");  type = "name"; }
227 					else {
228 						countfile = current->getCountFile();
229                         if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); type = "count"; }
230                         else {  m->mothurOut("[ERROR]: You need to provide a namefile or countfile.\n");   abort = true;
231                         }
232 					}
233 				}
234 			}
235 
236 
237 			string temp;
238 			temp = validParameter.valid(parameters, "precision");
239 			if (temp == "not found") { temp = "100"; }
240 			//saves precision legnth for formatting below
241 			length = temp.length();
242 			util.mothurConvert(temp, precision);
243 
244 			temp = validParameter.valid(parameters, "processors");	if (temp == "not found"){	temp = current->getProcessors();	}
245 			processors = current->setProcessors(temp);
246 
247             temp = validParameter.valid(parameters, "classic");			if (temp == "not found") { temp = "F"; }
248 			classic = util.isTrue(temp);
249 
250             temp = validParameter.valid(parameters, "runsensspec");			if (temp == "not found") { temp = "T"; }
251             runsensSpec = util.isTrue(temp);
252 
253 			temp = validParameter.valid(parameters, "taxlevel");		if (temp == "not found")  { temp = "3"; }
254 			util.mothurConvert(temp, taxLevelCutoff);
255 
256             temp = validParameter.valid(parameters, "iters");		if (temp == "not found")  { temp = "100"; }
257             util.mothurConvert(temp, maxIters);
258 
259             temp = validParameter.valid(parameters, "delta");		if (temp == "not found")  { temp = "0.0001"; }
260             util.mothurConvert(temp, stableMetric);
261 
262             metricName = validParameter.valid(parameters, "metric");		if (metricName == "not found") { metricName = "mcc"; }
263 
264             if ((metricName == "mcc") || (metricName == "sens") || (metricName == "spec") || (metricName == "tptn") || (metricName == "tp") || (metricName == "tn") || (metricName == "fp") || (metricName == "fn") || (metricName == "f1score") || (metricName == "accuracy") || (metricName == "ppv") || (metricName == "npv") || (metricName == "fdr") || (metricName == "fpfn") ){ }
265             else { m->mothurOut("[ERROR]: Not a valid metric.  Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr.\n");  abort = true; }
266 
267             initialize = validParameter.valid(parameters, "initialize");		if (initialize == "not found") { initialize = "singleton"; }
268 
269             if ((initialize == "singleton") || (initialize == "oneotu")){ }
270             else { m->mothurOut("[ERROR]: Not a valid initialization.  Valid initializations are singleton and oneotu.\n");  abort = true; }
271 
272 			method = validParameter.valid(parameters, "method");		if (method == "not found") { method = "opti";  }
273 
274             vector<string> versionOutputs;
275             bool foundTool = false;
276             string path = current->getProgramPath();
277             string programName = "vsearch"; programName += EXECUTABLE_EXT;
278 
279             vsearchLocation = validParameter.validPath(parameters, "vsearch");
280             if (vsearchLocation == "not found") {
281                 vsearchLocation = "";
282                 if ((method == "agc") || (method == "dgc")) {
283                     foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
284                 }
285             }
286             else {
287                 if ((method == "agc") || (method == "dgc")) {
288                     //test to make sure vsearch exists
289                     ifstream in;
290                     vsearchLocation = util.getFullPathName(vsearchLocation);
291                     bool ableToOpen = util.openInputFile(vsearchLocation, in, "no error"); in.close();
292                     if(!ableToOpen) {
293                         m->mothurOut(vsearchLocation + " file does not exist or cannot be opened, ignoring.\n"); vsearchLocation = "";
294                         programName = util.getSimpleName(vsearchLocation); vsearchLocation = "";
295                         foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
296                     }
297                 }
298             }
299 
300             if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc") || (method == "opti")) { }
301             else { m->mothurOut("[ERROR]: Not a valid clustering method.  Valid clustering algorithms are furthest, nearest, average, weighted, agc, dgc and opti.\n");  abort = true; }
302 
303             if ((method == "agc") || (method == "dgc")) {
304                 if (fastafile == "") { m->mothurOut("[ERROR]: You must provide a fasta file when using the agc or dgc clustering methods, aborting\n."); abort = true;}
305                 if (classic) { m->mothurOut("[ERROR]: You cannot use cluster.classic with the agc or dgc clustering methods, aborting\n."); abort = true; }
306                 if (!foundTool) { abort = true; }
307             }
308 
309             cutoffNotSet = false;
310             temp = validParameter.valid(parameters, "cutoff");
311             if (temp == "not found") { cutoffNotSet = true; if ((method == "opti") || (method == "agc") || (method == "dgc")) { temp = "0.03"; }else { temp = "0.15"; } }
312             util.mothurConvert(temp, cutoff);
313 
314 			temp = validParameter.valid(parameters, "showabund");
315 			if (temp == "not found") { temp = "T"; }
316             showabund = util.isTrue(temp);
317 
318             temp = validParameter.valid(parameters, "cluster");  if (temp == "not found") { temp = "T"; }
319             runCluster = util.isTrue(temp);
320 
321             temp = validParameter.valid(parameters, "islist");  if (temp == "not found") { temp = "F"; }
322             isList = util.isTrue(temp);
323 
324             temp = validParameter.valid(parameters, "dist");  if (temp == "not found") { temp = "F"; }
325             makeDist = util.isTrue(temp);
326             if (method == "opti") { makeDist = runsensSpec;  }
327 
328             if (classic && makeDist) { m->mothurOut("[ERROR]: You cannot use the dist parameter with the classic parameter. Mothur will ignore the dist parameter.\n"); makeDist = false; }
329 
330 			timing = validParameter.valid(parameters, "timing");
331 			if (timing == "not found") { timing = "F"; }
332 		}
333 	}
334 	catch(exception& e) {
335 		m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
336 		exit(1);
337 	}
338 }
339 
340 //**********************************************************************************************************************
341 
execute()342 int ClusterSplitCommand::execute(){
343 	try {
344 
345 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
346 
347         time_t estart;
348         vector<string> listFileNames;
349         vector< map<string, string> > distName;
350         set<string> labels;
351         string singletonName = "";
352 
353         double saveCutoff = cutoff;
354 
355         if (file != "") {
356             deleteFiles = false; estart = time(NULL);
357             singletonName = readFile(distName);
358 
359             if (isList) {
360 
361                 //set list file as new current listfile
362                 string currentName = "";
363                 itTypes = outputTypes.find("list");
364                 if (itTypes != outputTypes.end()) {
365                     if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
366                 }
367 
368                 m->mothurOut("\nOutput File Names: \n");
369                 for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] + "\n");	} m->mothurOutEndLine();
370 
371                 return 0;
372             }
373 
374         }else {
375             //splitting
376             estart = time(NULL); bool usingVsearchToCLuster = false;
377             if ((method == "agc") || (method == "dgc")) { usingVsearchToCLuster = true; if (cutoffNotSet) {  m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); cutoff = 0.03; } }
378 
379             m->mothurOut("Splitting the file...\n");
380             current->setMothurCalling(true);
381 
382             //split matrix into non-overlapping groups
383             SplitMatrix* split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff,  processors, classic, outputdir, usingVsearchToCLuster);
384 
385             if (fastafile != "") {  current->setFastaFile(fastafile);  }
386 
387             if (m->getControl_pressed()) { delete split; return 0; }
388 
389             singletonName = split->getSingletonNames();
390             distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
391             delete split;
392             current->setMothurCalling(false);
393 
394             if (m->getDebug()) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
395 
396             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file.\n");
397 
398             //output a merged distance file
399             if (makeDist)		{ createMergedDistanceFile(distName); }
400 
401             if (m->getControl_pressed()) { return 0; }
402 
403             estart = time(NULL);
404 
405             if (!runCluster) {
406                 string filename = printFile(singletonName, distName);
407 
408                 m->mothurOutEndLine();
409                 m->mothurOut("Output File Names:\n\n"); m->mothurOut(filename); m->mothurOutEndLine();
410                 for (int i = 0; i < distName.size(); i++) {	m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();	}
411                 m->mothurOutEndLine();
412 
413                 return 0;
414             }
415             deleteFiles = true;
416         }
417 		//****************** break up files between processes and cluster each file set ******************************//
418 
419         listFileNames = createProcesses(distName, labels);
420 
421         if (deleteFiles) {
422             //delete the temp files now that we are done
423             for (int i = 0; i < distName.size(); i++) {
424                 string thisNamefile = distName[i].begin()->second;
425                 string thisDistFile = distName[i].begin()->first;
426                 util.mothurRemove(thisNamefile);
427                 util.mothurRemove(thisDistFile);
428             }
429         }
430 
431 		if (m->getControl_pressed()) { for (int i = 0; i < listFileNames.size(); i++) { util.mothurRemove(listFileNames[i]); } return 0; }
432 
433 		if (!util.isEqual(saveCutoff, cutoff)) { m->mothurOut("\nCutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
434 
435 		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster\n");
436 
437 		//****************** merge list file and create rabund and sabund files ******************************//
438 		estart = time(NULL);
439 		m->mothurOut("Merging the clustered files...\n");
440 
441 		ListVector* listSingle;
442 		map<double, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
443 
444 		if (m->getControl_pressed()) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
445 
446 		mergeLists(listFileNames, labelBins, listSingle);
447 
448 		if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
449 
450         //delete after all are complete incase a crash happens
451         if (!deleteFiles) { for (int i = 0; i < distName.size(); i++) {	util.mothurRemove(distName[i].begin()->first); util.mothurRemove(distName[i].begin()->second); 	} }
452 
453 		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge.\n");
454 
455         if ((method == "opti") && (runsensSpec)) { runSensSpec();  }
456 
457         if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
458 
459 		//set list file as new current listfile
460 		string currentName = "";
461 		itTypes = outputTypes.find("list");
462 		if (itTypes != outputTypes.end()) {
463 			if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
464 		}
465 
466 		//set rabund file as new current rabundfile
467 		itTypes = outputTypes.find("rabund");
468 		if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setRabundFile(currentName); } }
469 
470 		//set sabund file as new current sabundfile
471 		itTypes = outputTypes.find("sabund");
472 		if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setSabundFile(currentName); } }
473 
474         //set sabund file as new current sabundfile
475         itTypes = outputTypes.find("column");
476         if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); } }
477 
478 		m->mothurOut("\nOutput File Names: \n");
479 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
480 
481 		return 0;
482 	}
483 	catch(exception& e) {
484 		m->errorOut(e, "ClusterSplitCommand", "execute");
485 		exit(1);
486 	}
487 }
488 //**********************************************************************************************************************
completeListFile(vector<string> listNames,string singleton,set<string> & userLabels,ListVector * & listSingle)489 map<double, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
490 	try {
491 		map<double, int> labelBin;
492 		vector<double> orderFloat;
493 		int numSingleBins;
494 
495 		//read in singletons
496 		if (singleton != "none") {
497 
498             ifstream in;
499             util.openInputFile(singleton, in);
500 
501 			string firstCol, secondCol;
502 			listSingle = new ListVector();
503 
504             if (type == "count") { util.getline(in); util.gobble(in); }
505 
506 			while (!in.eof()) {
507 				in >> firstCol >> secondCol;
508                 util.getline(in);
509                 if (type == "name")  { listSingle->push_back(secondCol); }
510                 else { listSingle->push_back(firstCol); }
511                 util.gobble(in);
512 			}
513 
514 			in.close();
515 			util.mothurRemove(singleton);
516 
517 			numSingleBins = listSingle->getNumBins();
518         }else{  listSingle = NULL; numSingleBins = 0;  }
519 
520         //go through users set and make them floats so we can sort them
521         for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
522             double temp = -10.0;
523 
524             if ((*it != "unique") && (convertTestFloat(*it, temp) ))	{	util.mothurConvert(*it, temp);	}
525             else if (*it == "unique")										{	temp = -1.0;		}
526 
527             if ((temp < cutoff) || util.isEqual(cutoff, temp)) {
528                 orderFloat.push_back(temp);
529                 labelBin[temp] = numSingleBins; //initialize numbins
530             }
531         }
532 
533 		//sort order
534 		sort(orderFloat.begin(), orderFloat.end());
535 		userLabels.clear();
536 
537 		//get the list info from each file
538 		for (int k = 0; k < listNames.size(); k++) {
539 
540 			if (m->getControl_pressed()) {
541 				if (listSingle != NULL) { delete listSingle; listSingle = NULL; util.mothurRemove(singleton);  }
542 				for (int i = 0; i < listNames.size(); i++) {   util.mothurRemove(listNames[i]);  }
543 				return labelBin;
544 			}
545 
546 			InputData* input = new InputData(listNames[k], "list", nullVector);
547 			ListVector* list = input->getListVector();
548 			string lastLabel = list->getLabel();
549 
550 			string filledInList = listNames[k] + "filledInTemp";
551 			ofstream outFilled;
552 			util.openOutputFile(filledInList, outFilled);
553             bool printHeaders = true;
554 
555 
556 			//for each label needed
557 			for(int l = 0; l < orderFloat.size(); l++){
558 
559 				string thisLabel;
560 				if (util.isEqual(orderFloat[l],-1)) { thisLabel = "unique"; }
561 				else { thisLabel = toString(orderFloat[l],  length-1);  }
562 
563 				//this file has reached the end
564 				if (list == NULL) {
565 					list = input->getListVector(lastLabel, true);
566 				}else{	//do you have the distance, or do you need to fill in
567 
568 					float labelFloat;
569 					if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
570 					else { convert(list->getLabel(), labelFloat); }
571 
572 					//check for missing labels
573 					if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
574 						//if its bigger get last label, otherwise keep it
575 						delete list;
576 						list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
577 					}
578 					lastLabel = list->getLabel();
579 				}
580 
581 				//print to new file
582 				list->setLabel(thisLabel);
583                 list->setPrintedLabels(printHeaders);
584                 list->print(outFilled, true); printHeaders = false;
585 
586 				//update labelBin
587 				labelBin[orderFloat[l]] += list->getNumBins();
588 
589 				delete list;
590 
591 				list = input->getListVector();
592 			}
593 
594 			if (list != NULL) { delete list; }
595 			delete input;
596 
597 			outFilled.close();
598 			util.mothurRemove(listNames[k]);
599 			rename(filledInList.c_str(), listNames[k].c_str());
600 		}
601 
602 		return labelBin;
603 	}
604 	catch(exception& e) {
605 		m->errorOut(e, "ClusterSplitCommand", "completeListFile");
606 		exit(1);
607 	}
608 }
609 //**********************************************************************************************************************
mergeLists(vector<string> listNames,map<double,int> userLabels,ListVector * listSingle)610 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<double, int> userLabels, ListVector* listSingle){
611 	try {
612 		if (outputdir == "") { outputdir += util.hasPath(distfile); }
613 		fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
614 
615         map<string, string> variables;
616         variables["[filename]"] = fileroot;
617         variables["[clustertag]"] = tag;
618         string sabundFileName = getOutputFileName("sabund", variables);
619         string rabundFileName = getOutputFileName("rabund", variables);
620         string listFileName = getOutputFileName("list", variables);
621 
622         map<string, int> counts;
623         ofstream outList, outRabund, outSabund;
624         if (countfile == "") {
625             util.openOutputFile(sabundFileName,	outSabund);
626             util.openOutputFile(rabundFileName,	outRabund);
627             outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
628             outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
629 
630         }else {
631             CountTable ct;
632             ct.readTable(countfile, false, false);
633             counts = ct.getNameMap();
634         }
635 
636 		util.openOutputFile(listFileName,	outList);
637         outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
638         bool printHeaders = true;
639 
640 		//for each label needed
641 		for(map<double, int>::iterator itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
642 
643 			string thisLabel;
644 			if (util.isEqual(itLabel->first,-1)) { thisLabel = "unique"; }
645 			else { thisLabel = toString(itLabel->first,  length-1);  }
646 
647 			//outList << thisLabel << '\t' << itLabel->second << '\t';
648 
649             RAbundVector* rabund = NULL;
650             ListVector completeList;
651             completeList.setLabel(thisLabel);
652 
653             if (countfile == "") {
654                 rabund = new RAbundVector();
655                 rabund->setLabel(thisLabel);
656             }
657 
658 			//add in singletons
659 			if (listSingle != NULL) {
660 				for (int j = 0; j < listSingle->getNumBins(); j++) {
661 					//outList << listSingle->get(j) << '\t';
662                     completeList.push_back(listSingle->get(j));
663 					if (countfile == "") { rabund->push_back(util.getNumNames(listSingle->get(j))); }
664 				}
665 			}
666 
667 			//get the list info from each file
668 			for (int k = 0; k < listNames.size(); k++) {
669 
670 				if (m->getControl_pressed()) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { util.mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
671 
672 				InputData* input = new InputData(listNames[k], "list", nullVector);
673 				ListVector* list = input->getListVector(thisLabel);
674 
675 				//this file has reached the end
676 				if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }
677 				else {
678 					for (int j = 0; j < list->getNumBins(); j++) {
679                         completeList.push_back(list->get(j));
680 						if (countfile == "") { rabund->push_back(util.getNumNames(list->get(j))); }
681 					}
682 					delete list;
683 				}
684 				delete input;
685 			}
686 
687             if (countfile == "") {
688                 SAbundVector sabund = rabund->getSAbundVector();
689                 sabund.print(outSabund);
690                 rabund->print(outRabund);
691             }
692 
693             completeList.setPrintedLabels(printHeaders);
694             if (countfile != "") { completeList.print(outList, counts);  printHeaders = false; }
695             else { completeList.print(outList);  printHeaders = false; }
696 
697 			if (rabund != NULL) { delete rabund; }
698 		}
699 
700 		outList.close();
701         if (countfile == "") {
702             outRabund.close();
703             outSabund.close();
704 		}
705 		if (listSingle != NULL) { delete listSingle;  }
706 
707 		for (int i = 0; i < listNames.size(); i++) {  util.mothurRemove(listNames[i]);  }
708 
709 		return 0;
710 	}
711 	catch(exception& e) {
712 		m->errorOut(e, "ClusterSplitCommand", "mergeLists");
713 		exit(1);
714 	}
715 }
716 /**************************************************************************************************/
717 struct clusterData {
718     MothurOut* m;
719     Utils util;
720     int count, precision, length, maxIters; //numSingletons,
721     bool showabund, classic, useName, useCount, deleteFiles, cutoffNotSet;
722     double cutoff, stableMetric;
723     ofstream outList, outRabund, outSabund;
724     string tag, method,  vsearchLocation, metricName, initialize, outputDir, type;
725     vector< map<string, string> > distNames;
726     set<string> labels;
727     vector<string> listFileNames;
728 
clusterDataclusterData729     clusterData(){}
clusterDataclusterData730     clusterData(bool showab, bool cla, bool df, vector< map<string, string> > dN, bool cns, double cu, int prec, int len, string meth, string opd, string vl, string ty) {
731         showabund = showab;
732         distNames = dN;
733         cutoff = cu;
734         classic = cla;
735         method = meth;
736         precision = prec;
737         length = len;
738         outputDir = opd;
739         vsearchLocation = vl;
740         deleteFiles = df;
741         cutoffNotSet = cns;
742         m = MothurOut::getInstance();
743         count = 0;
744         type = ty;
745         useName = false;
746         useCount = false;
747         //numSingletons = 0;
748     }
setOptiOptionsclusterData749     void setOptiOptions(string metn, double stabMet, string init, int mxi ) {
750         metricName = metn;
751         stableMetric = stabMet;
752         maxIters = mxi;
753         initialize = init;
754     }
setNamesCountclusterData755     void setNamesCount(string cnf) {
756         useName = false;
757         useCount = false;
758         if (type == "name") { useName = true;  }
759         if (type == "count") { useCount = true; }
760     }
761 };
762 //**********************************************************************************************************************
createRabund(CountTable * & ct,ListVector * & list,RAbundVector * & rabund,clusterData * params)763 int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund, clusterData* params){
764     try {
765         rabund->setLabel(list->getLabel());
766         for(int i = 0; i < list->getNumBins(); i++) {
767             if (params->m->getControl_pressed()) { break; }
768             vector<string> binNames;
769             string bin = list->get(i);
770             params->util.splitAtComma(bin, binNames);
771             int total = 0;
772             for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
773             rabund->push_back(total);
774         }
775         return 0;
776     }
777     catch(exception& e) {
778         params->m->errorOut(e, "ClusterSplitCommand", "createRabund");
779         exit(1);
780     }
781 
782 }
783 //**********************************************************************************************************************
clusterClassicFile(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)784 string clusterClassicFile(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
785     try {
786         string listFileName = "";
787 
788         ListVector* list = NULL;
789         ListVector oldList;
790         RAbundVector* rabund = NULL;
791 
792         params->m->mothurOut("\nReading " + thisDistFile + "\n");
793 
794         //reads phylip file storing data in 2D vector, also fills list and rabund
795         bool sim = false;
796         ClusterClassic cluster(params->cutoff, params->method, sim);
797 
798         NameAssignment* nameMap = NULL;
799         CountTable* ct = NULL;
800         if(params->useName){
801             nameMap = new NameAssignment(thisNamefile);
802             nameMap->readMap();
803             cluster.readPhylipFile(thisDistFile, nameMap);
804         }else if (params->useCount) {
805             ct = new CountTable();
806             ct->readTable(thisNamefile, false, false);
807             cluster.readPhylipFile(thisDistFile, ct);
808         }
809         params->tag = cluster.getTag();
810 
811         if (params->m->getControl_pressed()) { if(params->useName){	delete nameMap; }
812             else if (params->useCount) { delete ct; } return listFileName; }
813 
814         list = cluster.getListVector();
815         rabund = cluster.getRAbundVector();
816 
817         string thisOutputDir = params->outputDir;
818         if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
819         string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
820         listFileName = fileroot+ params->tag + ".list";
821 
822         ofstream listFile;
823         params->util.openOutputFile(fileroot+ params->tag + ".list",	listFile);
824 
825         float previousDist = 0.00000;
826         float rndPreviousDist = 0.00000;
827         bool printHeaders = true;
828         oldList = *list;
829 
830         params->m->mothurOut("\nClustering " + thisDistFile + "\n");
831 
832         while ((cluster.getSmallDist() < params->cutoff) && (cluster.getNSeqs() > 1)){
833             if (params->m->getControl_pressed()) {  delete list; delete rabund; listFile.close();  if(params->useName){	delete nameMap; }
834                 else if (params->useCount) { delete ct; } return listFileName;  }
835 
836             cluster.update(params->cutoff);
837 
838             float dist = cluster.getSmallDist();
839             float rndDist = params->util.ceilDist(dist, params->precision);
840 
841             if(previousDist <= 0.0000 && !params->util.isEqual(dist, previousDist)){
842                 oldList.setLabel("unique");
843                 oldList.setPrintedLabels(printHeaders);
844                 oldList.print(listFile); printHeaders = false;
845                 if (params->labels.count("unique") == 0) {  params->labels.insert("unique");  }
846             }
847             else if(!params->util.isEqual(rndDist, rndPreviousDist)){
848                 oldList.setLabel(toString(rndPreviousDist,  params->length-1));
849                 oldList.setPrintedLabels(printHeaders);
850                 oldList.print(listFile); printHeaders = false;
851                 if (params->labels.count(toString(rndPreviousDist,  params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist,  params->length-1)); }
852             }
853 
854 
855             previousDist = dist;
856             rndPreviousDist = rndDist;
857             oldList = *list;
858         }
859 
860         if(previousDist <= 0.0000){
861             oldList.setLabel("unique");
862             oldList.setPrintedLabels(printHeaders);
863             oldList.print(listFile); printHeaders = false;
864             if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
865         }
866         else if(rndPreviousDist<params->cutoff){
867             oldList.setLabel(toString(rndPreviousDist,  params->length-1));
868             oldList.setPrintedLabels(printHeaders);
869             oldList.print(listFile); printHeaders = false;
870             if (params->labels.count(toString(rndPreviousDist,  params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist,  params->length-1)); }
871         }
872         listFile.close();
873 
874         delete list; delete rabund;
875         if(params->useName)         {	delete nameMap; }
876         else if (params->useCount)  {   delete ct;      }
877 
878         if (params->deleteFiles) {
879             params->util.mothurRemove(thisDistFile);
880             params->util.mothurRemove(thisNamefile);
881         }
882         return listFileName;
883     }
884     catch(exception& e) {
885         params->m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
886         exit(1);
887     }
888 }
889 //**********************************************************************************************************************
runOptiCluster(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)890 string runOptiCluster(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
891     try {
892         if (params->cutoffNotSet) {  params->m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); params->cutoff = 0.03;  }
893 
894         string nameOrCount = params->type;
895 
896         OptiMatrix matrix(thisDistFile, thisNamefile, nameOrCount, "column", params->cutoff, false);
897 
898         ClusterMetric* metric = NULL;
899         if (params->metricName == "mcc")             { metric = new MCC();              }
900         else if (params->metricName == "sens")       { metric = new Sensitivity();      }
901         else if (params->metricName == "spec")       { metric = new Specificity();      }
902         else if (params->metricName == "tptn")       { metric = new TPTN();             }
903         else if (params->metricName == "tp")         { metric = new TP();               }
904         else if (params->metricName == "tn")         { metric = new TN();               }
905         else if (params->metricName == "fp")         { metric = new FP();               }
906         else if (params->metricName == "fn")         { metric = new FN();               }
907         else if (params->metricName == "f1score")    { metric = new F1Score();          }
908         else if (params->metricName == "accuracy")   { metric = new Accuracy();         }
909         else if (params->metricName == "ppv")        { metric = new PPV();              }
910         else if (params->metricName == "npv")        { metric = new NPV();              }
911         else if (params->metricName == "fdr")        { metric = new FDR();              }
912         else if (params->metricName == "fpfn")       { metric = new FPFN();             }
913 
914         OptiCluster cluster(&matrix, metric, 0);
915         params->tag = cluster.getTag();
916 
917         params->m->mothurOut("\nClustering " + thisDistFile + "\n");
918 
919         string thisOutputDir = params->outputDir;
920         if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
921         string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
922         string listFileName = fileroot+ params->tag + ".list";
923 
924         int iters = 0;
925         double listVectorMetric = 0; //worst state
926         double delta = 1;
927 
928         cluster.initialize(listVectorMetric, true, params->initialize);
929 
930         while ((delta > params->stableMetric) && (iters < params->maxIters)) {
931 
932             if (params->m->getControl_pressed()) { if (params->deleteFiles) { params->util.mothurRemove(thisDistFile);  params->util.mothurRemove(thisNamefile); } return listFileName; }
933             double oldMetric = listVectorMetric;
934             cluster.update(listVectorMetric);
935 
936             delta = abs(oldMetric - listVectorMetric);
937             iters++;
938         }
939 
940         if (params->m->getControl_pressed()) { delete metric; metric = NULL; return 0; }
941 
942         ListVector* list = cluster.getList();
943         list->setLabel(toString(smallestCutoff));
944         //params->cutoff = params->util.ceilDist(params->cutoff, params->precision);
945         params->labels.insert(toString(smallestCutoff));
946 
947         ofstream listFile;
948         params->util.openOutputFile(listFileName,	listFile);
949         list->print(listFile);
950         listFile.close();
951 
952         if (params->deleteFiles) {
953             params->util.mothurRemove(thisDistFile);
954             params->util.mothurRemove(thisNamefile);
955         }
956 
957         double tp, tn, fp, fn;
958         params->m->mothurOut("\ntp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n");
959         vector<double> results = cluster.getStats(tp, tn, fp, fn);
960         params->m->mothurOut(toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
961         for (int i = 0; i < results.size(); i++) { params->m->mothurOut(toString(results[i]) + "\t");  }
962         params->m->mothurOut("\n\n");
963         return listFileName;
964 
965     }
966     catch(exception& e) {
967         params->m->errorOut(e, "ClusterSplitCommand", "runOptiCluster");
968         exit(1);
969     }
970 }
971 //**********************************************************************************************************************
972 
vsearchDriver(string inputFile,string ucClusteredFile,string logfile,double cutoff,clusterData * params)973 int vsearchDriver(string inputFile, string ucClusteredFile, string logfile, double cutoff, clusterData* params){
974     try {
975 
976         //vsearch --maxaccepts 16 --usersort --id 0.97 --minseqlength 30 --wordlength 8 --uc $ROOT.clustered.uc --cluster_smallmem $ROOT.sorted.fna --maxrejects 64 --strand both --log $ROOT.clustered.log --sizeorder
977 
978 
979         ucClusteredFile = params->util.getFullPathName(ucClusteredFile);
980         inputFile = params->util.getFullPathName(inputFile);
981         logfile = params->util.getFullPathName(logfile);
982 
983         //to allow for spaces in the path
984         ucClusteredFile = "\"" + ucClusteredFile + "\"";
985         inputFile = "\"" + inputFile + "\"";
986         logfile = "\"" + logfile + "\"";
987 
988         vector<char*> cPara;
989 
990         string vsearchCommand = params->vsearchLocation;
991         vsearchCommand = "\"" + vsearchCommand + "\" ";
992 
993         vector<char*> vsearchParameters;
994         vsearchParameters.push_back(params->util.mothurConvert(vsearchCommand));
995 
996         //--maxaccepts=16
997         vsearchParameters.push_back(params->util.mothurConvert("--maxaccepts=16"));
998 
999         //--threads=1
1000         string processorsString = "--threads=1";
1001         vsearchParameters.push_back(params->util.mothurConvert(processorsString));
1002 
1003         //--usersort
1004         vsearchParameters.push_back(params->util.mothurConvert("--usersort"));
1005 
1006         //--id=0.97
1007         cutoff = abs(1.0 - cutoff); string cutoffString = toString(cutoff);
1008         if (cutoffString.length() > 4) {  cutoffString = cutoffString.substr(0, 4);  }
1009         else if (cutoffString.length() < 4)  {  for (int i = cutoffString.length(); i < 4; i++)  { cutoffString += "0";  } }
1010 
1011         cutoffString = "--id=" +  cutoffString;
1012         vsearchParameters.push_back(params->util.mothurConvert(cutoffString));
1013 
1014         //--minseqlength=30
1015         vsearchParameters.push_back(params->util.mothurConvert("--minseqlength=30"));
1016 
1017         //--wordlength=8
1018         vsearchParameters.push_back(params->util.mothurConvert("--wordlength=8"));
1019 
1020         //--uc=$ROOT.clustered.uc
1021         string tempIn = "--uc=" + ucClusteredFile;
1022         vsearchParameters.push_back(params->util.mothurConvert(tempIn));
1023 
1024         //--cluster_smallmem $ROOT.sorted.fna
1025         string tempSorted = "--cluster_smallmem=" + inputFile;
1026         vsearchParameters.push_back(params->util.mothurConvert(tempSorted));
1027 
1028         //--maxrejects=64
1029         vsearchParameters.push_back(params->util.mothurConvert("--maxrejects=64"));
1030 
1031         //--strand=both
1032         vsearchParameters.push_back(params->util.mothurConvert("--strand=both"));
1033 
1034         //--log=$ROOT.clustered.log
1035         string tempLog = "--log=" + logfile;
1036         vsearchParameters.push_back(params->util.mothurConvert(tempLog));
1037 
1038         if (params->method == "agc") { //--sizeorder
1039             vsearchParameters.push_back(params->util.mothurConvert("--sizeorder"));
1040          }
1041 
1042         if (params->m->getDebug()) {  params->m->mothurOut("[DEBUG]: "); for(int i = 0; i < vsearchParameters.size(); i++)  { params->m->mothurOut(toString(vsearchParameters[i]) + "\t"); } params->m->mothurOut("\n");  }
1043 
1044         string commandString = "";
1045         for (int i = 0; i < vsearchParameters.size(); i++) {    commandString += toString(vsearchParameters[i]) + " "; }
1046 
1047 #if defined NON_WINDOWS
1048 #else
1049         commandString = "\"" + commandString + "\"";
1050 #endif
1051         if (params->m->getDebug()) {  params->m->mothurOut("[DEBUG]: vsearch cluster command = " + commandString + ".\n"); }
1052 
1053         system(commandString.c_str());
1054 
1055         //free memory
1056         for(int i = 0; i < vsearchParameters.size(); i++)  {  delete vsearchParameters[i];  }
1057 
1058         //remove "" from filenames
1059         ucClusteredFile = ucClusteredFile.substr(1, ucClusteredFile.length()-2);
1060         inputFile = inputFile.substr(1, inputFile.length()-2);
1061         logfile = logfile.substr(1, logfile.length()-2);
1062 
1063         return 0;
1064 
1065     }
1066     catch(exception& e) {
1067         params->m->errorOut(e, "ClusterSplitCommand", "vsearchDriver");
1068         exit(1);
1069     }
1070 }
1071 //**********************************************************************************************************************
runVsearchCluster(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)1072 string runVsearchCluster(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
1073     try {
1074 
1075         params->m->mothurOut("\nClustering " + thisDistFile + "\n");
1076 
1077         string vsearchFastafile = ""; VsearchFileParser* vParse;
1078         if (params->useName)                    { vParse = new VsearchFileParser(thisDistFile, thisNamefile, "name");       }
1079         else if (params->useCount)              { vParse = new VsearchFileParser(thisDistFile, thisNamefile, "count");      }
1080         else                                    { params->m->mothurOut("[ERROR]: Opps, should never get here. ClusterSplitCommand::runVsearchCluster() \n"); params->m->setControl_pressed(true);  return ""; }
1081 
1082         if (params->m->getControl_pressed()) {  delete vParse; return ""; }
1083 
1084         vsearchFastafile = vParse->getVsearchFile();
1085 
1086         if (params->cutoff > 1.0) {  params->m->mothurOut("You did not set a cutoff, using 0.03.\n"); params->cutoff = 0.03; }
1087 
1088         //Run vsearch
1089         string ucVsearchFile = params->util.getSimpleName(vsearchFastafile) + ".clustered.uc";
1090         string logfile = params->util.getSimpleName(vsearchFastafile) + ".clustered.log";
1091         vsearchDriver(vsearchFastafile, ucVsearchFile, logfile, smallestCutoff, params);
1092 
1093         if (params->m->getControl_pressed()) { params->util.mothurRemove(ucVsearchFile); params->util.mothurRemove(logfile);  params->util.mothurRemove(vsearchFastafile); return ""; }
1094 
1095         string thisOutputDir = params->outputDir;
1096         if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
1097         params->tag = params->method;
1098         string listFileName = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile)) + params->tag + ".list";
1099 
1100         //Convert outputted *.uc file into a list file
1101         map<string, int> counts;
1102         ListVector list = vParse->createListFile(ucVsearchFile, vParse->getNumBins(logfile), toString(params->cutoff), counts);
1103 
1104         ofstream out;
1105         params->util.openOutputFile(listFileName,	out);
1106 
1107         list.DataVector::printHeaders(out);
1108 
1109         if (params->useCount) { list.print(out, counts); }
1110         else { list.print(out); } delete vParse;
1111 
1112         //remove temp files
1113         params->util.mothurRemove(ucVsearchFile); params->util.mothurRemove(logfile);  params->util.mothurRemove(vsearchFastafile);
1114 
1115         if (params->deleteFiles) {
1116             params->util.mothurRemove(thisDistFile);
1117             params->util.mothurRemove(thisNamefile);
1118         }
1119         params->labels.insert(toString(params->cutoff));
1120 
1121         return listFileName;
1122     }
1123     catch(exception& e) {
1124         params->m->errorOut(e, "ClusterSplitCommand", "runVsearchCluster");
1125         exit(1);
1126     }
1127 }
1128 //**********************************************************************************************************************
clusterFile(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)1129 string clusterFile(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
1130     try {
1131         string listFileName = "";
1132 
1133         if ((params->method == "agc") || (params->method == "dgc")) {  listFileName = runVsearchCluster(thisDistFile, thisNamefile, smallestCutoff, params);  }
1134         else if (params->method == "opti")                          {  listFileName = runOptiCluster(thisDistFile, thisNamefile, smallestCutoff, params);     }
1135         else {
1136 
1137             Cluster* cluster = NULL;
1138             SparseDistanceMatrix* matrix = NULL;
1139             ListVector* list = NULL;
1140             ListVector oldList;
1141             RAbundVector* rabund = NULL;
1142 
1143             if (params->m->getControl_pressed()) { return listFileName; }
1144 
1145             params->m->mothurOut("\nReading " + thisDistFile + "\n");
1146 
1147             ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1148             read->setCutoff(params->cutoff);
1149 
1150             NameAssignment* nameMap = NULL;
1151             CountTable* ct = NULL;
1152             if(params->useName){
1153                 nameMap = new NameAssignment(thisNamefile);
1154                 nameMap->readMap();
1155                 read->read(nameMap);
1156             }else if (params->useCount) {
1157                 ct = new CountTable();
1158                 ct->readTable(thisNamefile, false, false);
1159                 read->read(ct);
1160             }else { read->read(nameMap); }
1161 
1162             list = read->getListVector();
1163             matrix = read->getDMatrix();
1164 
1165             if(params->useCount) {
1166                 rabund = new RAbundVector();
1167                 createRabund(ct, list, rabund, params); //creates an rabund that includes the counts for the unique list
1168                 delete ct;
1169             }else { rabund = new RAbundVector(list->getRAbundVector()); }
1170 
1171             delete read;  read = NULL;
1172             if (params->useName) { delete nameMap; nameMap = NULL; }
1173 
1174             params->m->mothurOut("\nClustering " + thisDistFile + "\n");
1175 
1176             //create cluster
1177             float adjust = -1.0;
1178             if (params->method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix, params->cutoff, params->method, adjust); }
1179             else if(params->method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix, params->cutoff, params->method, adjust); }
1180             else if(params->method == "average"){	cluster = new AverageLinkage(rabund, list, matrix, params->cutoff, params->method, adjust);	}
1181             params->tag = cluster->getTag();
1182 
1183             string thisOutputDir = params->outputDir;
1184             if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
1185             string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
1186             listFileName = fileroot+ params->tag + ".list";
1187 
1188             ofstream listFile;
1189             params->util.openOutputFile(listFileName,	listFile);
1190 
1191             float previousDist = 0.00000;
1192             float rndPreviousDist = 0.00000;
1193             bool printHeaders = true;
1194 
1195             oldList = *list;
1196 
1197             double saveCutoff = params->cutoff;
1198 
1199             while (matrix->getSmallDist() < params->cutoff && matrix->getNNodes() > 0){
1200 
1201                 if (params->m->getControl_pressed()) { //clean up
1202                     delete matrix; delete list;	delete cluster; delete rabund;
1203                     listFile.close();
1204                     params->util.mothurRemove(listFileName);
1205                     return listFileName;
1206                 }
1207 
1208                 cluster->update(saveCutoff);
1209 
1210                 float dist = matrix->getSmallDist();
1211                 float rndDist = params->util.ceilDist(dist, params->precision);
1212 
1213                 if(previousDist <= 0.0000 && !params->util.isEqual(dist, previousDist)){
1214                     oldList.setLabel("unique");
1215                     oldList.setPrintedLabels(printHeaders);
1216                     oldList.print(listFile); printHeaders = false;
1217                     if (params->labels.count("unique") == 0) {  params->labels.insert("unique");  }
1218                 }
1219                 else if(!params->util.isEqual(rndDist, rndPreviousDist)){
1220                     oldList.setPrintedLabels(printHeaders);
1221                     oldList.setLabel(toString(rndPreviousDist,  params->length-1));
1222                     oldList.setPrintedLabels(printHeaders);
1223                     oldList.print(listFile); printHeaders = false;
1224                     if (params->labels.count(toString(rndPreviousDist,  params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist,  params->length-1)); }
1225                 }
1226 
1227                 previousDist = dist;
1228                 rndPreviousDist = rndDist;
1229                 oldList = *list;
1230             }
1231 
1232 
1233             if(previousDist <= 0.0000){
1234                 oldList.setLabel("unique");
1235                 oldList.setPrintedLabels(printHeaders);
1236                 oldList.print(listFile); printHeaders = false;
1237                 if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
1238             }
1239             else if(rndPreviousDist<params->cutoff){
1240                 oldList.setLabel(toString(rndPreviousDist,  params->length-1));
1241                 oldList.setPrintedLabels(printHeaders);
1242                 oldList.print(listFile); printHeaders = false;
1243                 if (params->labels.count(toString(rndPreviousDist,  params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist,  params->length-1)); }
1244             }
1245 
1246             delete matrix; delete list;	delete cluster; delete rabund;
1247             matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1248             listFile.close();
1249 
1250             if (params->m->getControl_pressed()) { //clean up
1251                 params->util.mothurRemove(listFileName);
1252                 return listFileName;
1253             }
1254 
1255             if (params->deleteFiles) {
1256                 params->util.mothurRemove(thisDistFile);
1257                 params->util.mothurRemove(thisNamefile);
1258             }
1259 
1260             if (!params->util.isEqual(saveCutoff, params->cutoff)) {
1261                 saveCutoff = params->util.ceilDist(saveCutoff, params->precision);
1262                 params->m->mothurOut("Cutoff was " + toString(params->cutoff) + " changed cutoff to " + toString(saveCutoff) + "\n");
1263             }
1264 
1265             if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
1266         }
1267         return listFileName;
1268     }
1269     catch(exception& e) {
1270         params->m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1271         exit(1);
1272     }
1273 }
1274 //**********************************************************************************************************************
cluster(clusterData * params)1275 void cluster(clusterData* params){
1276     try {
1277         vector<string> listFileNames;
1278         double smallestCutoff = params->cutoff;
1279 
1280         //cluster each distance file
1281         for (int i = 0; i < params->distNames.size(); i++) {
1282 
1283             string thisNamefile = params->distNames[i].begin()->second;
1284             string thisDistFile = params->distNames[i].begin()->first;
1285 
1286             params->setNamesCount(thisNamefile);
1287 
1288             string listFileName = "";
1289             if (params->classic)    {  listFileName = clusterClassicFile(thisDistFile, thisNamefile, smallestCutoff, params);   }
1290             else                    {  listFileName = clusterFile(thisDistFile, thisNamefile, smallestCutoff, params);          }
1291 
1292             if (params->m->getControl_pressed()) { //clean up
1293                 for (int i = 0; i < listFileNames.size(); i++) {	params->util.mothurRemove(listFileNames[i]); 	}
1294                 params->listFileNames.clear(); break;
1295             }
1296             params->listFileNames.push_back(listFileName);
1297         }
1298         params->cutoff = smallestCutoff;
1299     }
1300     catch(exception& e) {
1301         params->m->errorOut(e, "ClusterSplitCommand", "cluster");
1302         exit(1);
1303     }
1304 
1305 
1306 }
1307 //**********************************************************************************************************************
printData(ListVector * oldList,clusterData * params)1308 void printData(ListVector* oldList, clusterData* params){
1309 	try {
1310 		string label = oldList->getLabel();
1311 		RAbundVector oldRAbund = oldList->getRAbundVector();
1312 
1313 		oldRAbund.setLabel(label);
1314 		if (params->showabund) {
1315 			oldRAbund.getSAbundVector().print(cout);
1316 		}
1317 		oldRAbund.print(params->outRabund);
1318 		oldRAbund.getSAbundVector().print(params->outSabund);
1319 
1320 		oldList->print(params->outList, true);
1321 	}
1322 	catch(exception& e) {
1323 		params->m->errorOut(e, "ClusterSplitCommand", "printData");
1324 		exit(1);
1325 	}
1326 }
1327 //**********************************************************************************************************************
createProcesses(vector<map<string,string>> distName,set<string> & labels)1328 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
1329 	try {
1330         //sanity check
1331         if (processors > distName.size()) { processors = distName.size(); }
1332         deleteFiles = false; //so if we need to recalc the processors the files are still there
1333         vector<string> listFiles;
1334         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
1335         dividedNames.resize(processors);
1336 
1337         //for each file group figure out which process will complete it
1338         //want to divide the load intelligently so the big files are spread between processes
1339         for (int i = 0; i < distName.size(); i++) {
1340             int processToAssign = (i+1) % processors;
1341             if (processToAssign == 0) { processToAssign = processors; }
1342 
1343             dividedNames[(processToAssign-1)].push_back(distName[i]);
1344             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
1345         }
1346 
1347         //now lets reverse the order of ever other process, so we balance big files running with little ones
1348         for (int i = 0; i < processors; i++) {
1349             int remainder = ((i+1) % processors);
1350             if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
1351         }
1352 
1353         if (m->getControl_pressed()) { return listFiles; }
1354 
1355 
1356         //create array of worker threads
1357         vector<std::thread*> workerThreads;
1358         vector<clusterData*> data;
1359 
1360         //Lauch worker threads
1361         for (int i = 0; i < processors-1; i++) {
1362             clusterData* dataBundle = new clusterData(showabund, classic, deleteFiles, dividedNames[i+1], cutoffNotSet, cutoff, precision, length, method, outputdir, vsearchLocation, type);
1363             dataBundle->setOptiOptions(metricName, stableMetric, initialize, maxIters);
1364             data.push_back(dataBundle);
1365 
1366             workerThreads.push_back(new std::thread(cluster, dataBundle));
1367         }
1368 
1369 
1370         clusterData* dataBundle = new clusterData(showabund, classic, deleteFiles, dividedNames[0], cutoffNotSet, cutoff, precision, length, method, outputdir, vsearchLocation, type);
1371         dataBundle->setOptiOptions(metricName, stableMetric, initialize, maxIters);
1372         cluster(dataBundle);
1373         listFiles = dataBundle->listFileNames;
1374         tag = dataBundle->tag;
1375         cutoff = dataBundle->cutoff;
1376         labels = dataBundle->labels;
1377 
1378 
1379         for (int i = 0; i < processors-1; i++) {
1380             workerThreads[i]->join();
1381 
1382             listFiles.insert(listFiles.end(), data[i]->listFileNames.begin(), data[i]->listFileNames.end());
1383             labels.insert(data[i]->labels.begin(), data[i]->labels.end());
1384             if (data[i]->cutoff < cutoff) { cutoff = data[i]->cutoff; }
1385 
1386             delete data[i];
1387             delete workerThreads[i];
1388         }
1389         delete dataBundle;
1390         deleteFiles = true;
1391 
1392         return listFiles;
1393 	}
1394 	catch(exception& e) {
1395 		m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1396 		exit(1);
1397 	}
1398 }
1399 //**********************************************************************************************************************
1400 
createMergedDistanceFile(vector<map<string,string>> distNames)1401 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1402 	try{
1403 		string thisOutputDir = outputdir;
1404 		if (outputdir == "") { thisOutputDir = util.hasPath(fastafile); }
1405         map<string, string> variables;
1406         variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(fastafile));
1407 		string outputFileName = getOutputFileName("column", variables);
1408 		util.mothurRemove(outputFileName);
1409 
1410 
1411 		for (int i = 0; i < distNames.size(); i++) {
1412 			if (m->getControl_pressed()) {  return 0; }
1413 
1414 			string thisDistFile = distNames[i].begin()->first;
1415 
1416 			util.appendFiles(thisDistFile, outputFileName);
1417 		}
1418 
1419 		outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1420 
1421 		return 0;
1422 	}
1423 	catch(exception& e) {
1424 		m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1425 		exit(1);
1426 	}
1427 }
1428 //**********************************************************************************************************************
1429 
runSensSpec()1430 int ClusterSplitCommand::runSensSpec() {
1431     try{
1432         string listFile = "";
1433         itTypes = outputTypes.find("list");
1434         if (itTypes != outputTypes.end()) {
1435             if ((itTypes->second).size() != 0) { listFile = (itTypes->second)[0];  }
1436         }
1437 
1438         string columnFile = "";
1439         if (makeDist) {
1440             itTypes = outputTypes.find("column");
1441             if (itTypes != outputTypes.end()) {
1442                 if ((itTypes->second).size() != 0) { columnFile = (itTypes->second)[0];  }
1443             }
1444         }
1445 
1446         string inputString = "cutoff=" + toString(cutoff) + ", list=" + listFile;
1447         if (columnFile != "") { inputString += ", column=" + columnFile;  }
1448         else { m->mothurOut("[WARNING]: Cannot run sens.spec analysis without a column file, skipping."); return 0;  }
1449 
1450         if (namefile != "")         {  inputString += ", name=" + namefile; }
1451         else if (countfile != "")   { inputString += ", count=" + countfile; }
1452         else { m->mothurOut("[WARNING]: Cannot run sens.spec analysis without a name or count file, skipping."); return 0;  }
1453 
1454         m->mothurOut("/******************************************/\n");
1455         m->mothurOut("Running command: sens.spec(" + inputString + ")\n");
1456         current->setMothurCalling(true);
1457 
1458         Command* sensspecCommand = new SensSpecCommand(inputString);
1459         sensspecCommand->execute();
1460 
1461         map<string, vector<string> > filenames = sensspecCommand->getOutputFiles();
1462 
1463         delete sensspecCommand;
1464          current->setMothurCalling(false);
1465 
1466         string outputFileName = filenames["sensspec"][0];
1467 
1468         outputTypes["sensspec"].push_back(outputFileName);  outputNames.push_back(outputFileName);
1469 
1470         m->mothurOut("/******************************************/\n");
1471         m->mothurOut("Done.\n\n\n");
1472 
1473         ifstream in;
1474         util.openInputFile(outputFileName, in);
1475 
1476         while(!in.eof()){
1477             if (m->getControl_pressed()) { break; }
1478 
1479             m->mothurOut(util.getline(in)+"\n"); util.gobble(in);
1480         }
1481         in.close();
1482 
1483         return 0;
1484     }
1485     catch(exception& e) {
1486         m->errorOut(e, "ClusterSplitCommand", "runSensSpec");
1487         exit(1);
1488     }
1489 }
1490 //**********************************************************************************************************************
printFile(string singleton,vector<map<string,string>> & distName)1491 string ClusterSplitCommand::printFile(string singleton, vector< map<string, string> >& distName){
1492     try {
1493         ofstream out;
1494         map<string, string> variables;
1495         string thisOutputDir = outputdir;
1496 		if (outputdir == "") { thisOutputDir = util.hasPath(distfile); }
1497         variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(distfile));
1498 		string outputFileName = getOutputFileName("file", variables);
1499         util.openOutputFile(outputFileName, out);
1500         outputTypes["file"].push_back(outputFileName); outputNames.push_back(outputFileName);
1501         current->setFileFile(outputFileName);
1502 
1503         out << singleton << endl;
1504         if (namefile != "") { out << "name" << endl; }
1505         else if (countfile != "") { out << "count" << endl; }
1506         else { out << "unknown" << endl; }
1507 
1508         for (int i = 0; i < distName.size(); i++) {    out << distName[i].begin()->first << '\t' << distName[i].begin()->second << endl;	}
1509         out.close();
1510 
1511         return outputFileName;
1512     }
1513     catch(exception& e) {
1514 		m->errorOut(e, "ClusterSplitCommand", "printFile");
1515 		exit(1);
1516 	}
1517 
1518 }
1519 //**********************************************************************************************************************
readFile(vector<map<string,string>> & distName)1520 string ClusterSplitCommand::readFile(vector< map<string, string> >& distName){
1521     try {
1522 
1523         string singleton, thiscolumn, thisname;
1524 
1525         ifstream in;
1526         util.openInputFile(file, in);
1527 
1528         in >> singleton; util.gobble(in);
1529 
1530         string path = util.hasPath(singleton);
1531         if (path == "") {  singleton = inputDir + singleton;  }
1532 
1533         in >> type; util.gobble(in);
1534 
1535         if (type == "name") { }
1536         else if (type == "count") { }
1537         else {  m->mothurOut("[ERROR]: unknown file type. Are the files in column 2 of the file name files or count files? Please change unknown to name or count.\n"); m->setControl_pressed(true); }
1538 
1539         if (isList) {
1540 
1541             vector<string> listFileNames;
1542             string thisListFileName = "";
1543             set<string> listLabels;
1544 
1545             while(!in.eof()) {
1546                 if (m->getControl_pressed()) { break; }
1547 
1548                 in >> thisListFileName; util.gobble(in);
1549 
1550                 string path = util.hasPath(thisListFileName);
1551                 if (path == "") {  thisListFileName = inputDir + thisListFileName;  }
1552 
1553                 getLabels(thisListFileName, listLabels);
1554                 listFileNames.push_back(thisListFileName);
1555             }
1556 
1557             ListVector* listSingle;
1558             map<double, int> labelBins = completeListFile(listFileNames, singleton, listLabels, listSingle);
1559 
1560             mergeLists(listFileNames, labelBins, listSingle);
1561 
1562         }else {
1563 
1564             while(!in.eof()) {
1565                 if (m->getControl_pressed()) { break; }
1566 
1567                 in >> thiscolumn; util.gobble(in);
1568                 in >> thisname; util.gobble(in);
1569 
1570                 map<string, string> temp;
1571                 temp[thiscolumn] = thisname;
1572                 distName.push_back(temp);
1573             }
1574         }
1575 
1576         in.close();
1577 
1578         return singleton;
1579     }
1580     catch(exception& e) {
1581 		m->errorOut(e, "ClusterSplitCommand", "readFile");
1582 		exit(1);
1583 	}
1584 
1585 }
1586 //**********************************************************************************************************************
getLabels(string file,set<string> & listLabels)1587 int ClusterSplitCommand::getLabels(string file, set<string>& listLabels){
1588     try {
1589         ifstream in;
1590         util.openInputFile(file, in);
1591 
1592         //read headers
1593         util.getline(in); util.gobble(in);
1594 
1595         string label;
1596         while(!in.eof()) {
1597             if (m->getControl_pressed()) { break; }
1598 
1599             in >> label; util.getline(in); util.gobble(in);
1600 
1601             listLabels.insert(label);
1602         }
1603 
1604         in.close();
1605 
1606         return 0;
1607     }
1608     catch(exception& e) {
1609         m->errorOut(e, "ClusterSplitCommand", "getLabels");
1610         exit(1);
1611     }
1612 
1613 }
1614 //**********************************************************************************************************************
1615