1 /*
2 * clustersplitcommand.cpp
3 * Mothur
4 *
5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "clustersplitcommand.h"
11 #include "systemcommand.h"
12 #include "sensspeccommand.h"
13 #include "mcc.hpp"
14 #include "sensitivity.hpp"
15 #include "specificity.hpp"
16 #include "fdr.hpp"
17 #include "npv.hpp"
18 #include "ppv.hpp"
19 #include "f1score.hpp"
20 #include "tp.hpp"
21 #include "fp.hpp"
22 #include "fpfn.hpp"
23 #include "tptn.hpp"
24 #include "tn.hpp"
25 #include "fn.hpp"
26 #include "accuracy.hpp"
27
28 //**********************************************************************************************************************
setParameters()29 vector<string> ClusterSplitCommand::setParameters(){
30 try {
31 CommandParameter pfile("file", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","",false,false,true); parameters.push_back(pfile);
32 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
33 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
34 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
35 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
36 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
37 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
38 CommandParameter prunspenspec("runsensspec", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prunspenspec);
39 CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
40 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
41 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
42 CommandParameter pcutoff("cutoff", "Number", "", "0.03", "", "", "","",false,false,true); parameters.push_back(pcutoff);
43 CommandParameter pmetriccutoff("delta", "Number", "", "0.0001", "", "", "","",false,false,true); parameters.push_back(pmetriccutoff);
44 CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(piters);
45 CommandParameter pinitialize("initialize", "Multiple", "oneotu-singleton", "singleton", "", "", "","",false,false,true); parameters.push_back(pinitialize);
46 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
47 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted-agc-dgc-opti", "opti", "", "", "","",false,false,true); parameters.push_back(pmethod);
48 CommandParameter pmetric("metric", "Multiple", "mcc-sens-spec-tptn-fpfn-tp-tn-fp-fn-f1score-accuracy-ppv-npv-fdr", "mcc", "", "", "","",false,false,true); parameters.push_back(pmetric);
49 CommandParameter pdist("dist", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdist);
50 CommandParameter pislist("islist", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pislist);
51 CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
52 CommandParameter pvsearchlocation("vsearch", "String", "", "", "", "", "","",false,false); parameters.push_back(pvsearchlocation);
53 CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
54 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
55 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
56
57 abort = false; calledHelp = false;
58
59 vector<string> tempOutNames;
60 outputTypes["list"] = tempOutNames;
61 outputTypes["rabund"] = tempOutNames;
62 outputTypes["sabund"] = tempOutNames;
63 outputTypes["column"] = tempOutNames;
64 outputTypes["name"] = tempOutNames;
65 outputTypes["file"] = tempOutNames;
66 outputTypes["sensspec"] = tempOutNames;
67
68 vector<string> myArray;
69 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
70 return myArray;
71 }
72 catch(exception& e) {
73 m->errorOut(e, "ClusterSplitCommand", "setParameters");
74 exit(1);
75 }
76 }
77 //**********************************************************************************************************************
getHelpString()78 string ClusterSplitCommand::getHelpString(){
79 try {
80 string helpString = "";
81 helpString += "The cluster.split command parameter options are file, fasta, name, count, cutoff, precision, method, taxonomy, taxlevel, showabund, timing, cluster, iters, delta, initialize, dist, processors, runsensspec. Fasta or file are required.\n";
82 helpString += "The cluster.split command splits your files by classification using a fasta file to generate distance matrices for each taxonomic group. \n";
83 helpString += "The file option allows you to enter your file containing your list of column and names/count files as well as the singleton file. This file is mothur generated, when you run cluster.split() with the cluster=f parameter. This can be helpful when you have a large dataset that you may be able to use all your processors for the splitting step, but have to reduce them for the cluster step due to RAM constraints. For example: cluster.split(fasta=yourFasta, taxonomy=yourTax, count=yourCount, taxlevel=3, cluster=f, processors=8) then cluster.split(file=yourFile, processors=4). This allows your to maximize your processors during the splitting step. Also, if you are unsure if the cluster step will have RAM issue with multiple processors, you can avoid running the first part of the command multiple times.\n";
84 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
85 helpString += "The name parameter allows you to enter your name file. \n";
86 helpString += "The count parameter allows you to enter your count file.\n";
87 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences. This is required unless you are running the command with the file option. \n";
88 helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the dataset into taxanomic matrices, default=t";
89 helpString += "The dist parameter allows you to indicate whether you want a column formatted distance matrix outputted along with the list file. Default=F.";
90 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.03. \n";
91 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
92 helpString += "The iters parameter allow you to set the maxiters for the opticluster method. \n";
93 helpString += "The metric parameter allows to select the metric in the opticluster method. Options are Matthews correlation coefficient (mcc), sensitivity (sens), specificity (spec), true positives + true negatives (tptn), false positives + false negatives (fpfn), true positives (tp), true negative (tn), false positive (fp), false negative (fn), f1score (f1score), accuracy (accuracy), positive predictive value (ppv), negative predictive value (npv), false discovery rate (fdr). Default=mcc.\n";
94 helpString += "The delta parameter allows to set the stable value for the metric in the opticluster method. Default=0.0001\n";
95 helpString += "The initialize parameter allows to select the initial randomization for the opticluster method. Options are singleton, meaning each sequence is randomly assigned to its own OTU, or oneotu meaning all sequences are assigned to one otu. Default=singleton.\n";
96 helpString += "The runsensspec parameter allows to run the sens.spec command on the completed list file. Default=true.\n";
97 helpString += "The method parameter allows you to enter your clustering mothod. Options are furthest, nearest, average, weighted, agc, dgc and opti. Default=opti. The agc and dgc methods require a fasta file.";
98 helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the dataset, default=3.\n";
99 helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. Default=f.\n";
100 helpString += "The processors parameter allows you to specify the number of processors to use. The default is all available.\n";
101 helpString += "The vsearch parameter allows you to specify the name and location of your vsearch executable if using agc or dgc clustering methods. By default mothur will look in your path, mothur's executable and mothur tools locations. You can set the vsearch location as follows, vsearch=/usr/bin/vsearch.\n";
102 helpString += "The cluster.split command should be in the following format: \n";
103 helpString += "cluster.split(fasta=yourFastaFile, count=yourCountFile, method=yourMethod, cutoff=yourCutoff, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n";
104 return helpString;
105 }
106 catch(exception& e) {
107 m->errorOut(e, "ClusterSplitCommand", "getHelpString");
108 exit(1);
109 }
110 }
111 //**********************************************************************************************************************
getOutputPattern(string type)112 string ClusterSplitCommand::getOutputPattern(string type) {
113 try {
114 string pattern = "";
115
116 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
117 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
118 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
119 else if (type == "sensspec") { pattern = "[filename],[clustertag],sensspec"; }
120 else if (type == "column") { pattern = "[filename],dist"; }
121 else if (type == "file") { pattern = "[filename],file"; }
122 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
123
124 return pattern;
125 }
126 catch(exception& e) {
127 m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
128 exit(1);
129 }
130 }
131 //**********************************************************************************************************************
getCommonQuestions()132 string ClusterSplitCommand::getCommonQuestions(){
133 try {
134 vector<string> questions, issues, qanswers, ianswers, howtos, hanswers;
135
136 string issue = "Cluster.split crashes after merging individual list files. What do I do?"; issues.push_back(issue);
137 string ianswer = "\tAfter merging the split list files, mothur runs the sens.spec command on the entire dataset. The entire dataset's distance matrix may be too large to fit in memory, which causes the crash. You can skip this step by setting the runsensspec parameter to false. Skipping the sens.spec analysis does not effect the OTU assignment, and you can run the sens.spec analysis separately using the sens.spec command. \n"; ianswers.push_back(ianswer);
138
139 issue = "Cluster.split crashes while reading the split distance matrices. What should I do?"; issues.push_back(issue);
140 ianswer = "\tThe command is crashing because the distance matrices are too large to fit into memory. Why do I have such a large distance matrix? This is most often caused by poor overlap of your reads. When reads have poor overlap, it greatly increases your error rate. Also, sequences that should cluster together don't because the errors appear to be genetic differences when in fact they are not. The quality of the data you are processing can not be overstressed. Error filled reads produce error filled results. To take a step back, if you look through our MiSeq SOP, you’ll see that we go to great pains to only work with the unique sequences to limit the number of sequences we have to align, screen for chimeras, classify, etc. We all know that 20 million reads will never make it through the pipeline without setting your computer on fire. Returning to the question at hand, you can imagine that if the reads do not fully overlap then any error in the 5’ end of the first read will be uncorrected by the 3’ end of the second read. If we assume for now that the errors are random, then every error will generate a new unique sequence. Granted, this happens less than 1% of the time, but multiply that by 20 million reads at whatever length you choose and you’ve got a big number. Viola, a bunch of unique reads and a ginormous distance matrix. \n"; ianswers.push_back(ianswer);
141
142 string howto = "How do I cluster my sequences into OTUs at distance 0.03?"; howtos.push_back(howto);
143 string hanswer = "\tBy default the cluster.split command will use the opti method to cluster to 0.03. To find OTUs at a different distance set the cutoff parameter. ie. cutoff=0.01 will assemble OTUs for distance 0.01.\n"; hanswers.push_back(hanswer);
144
145 string commonQuestions = util.getFormattedHelp(questions, qanswers, issues, ianswers, howtos, hanswers);
146
147 return commonQuestions;
148 }
149 catch(exception& e) {
150 m->errorOut(e, "ClusterSplitCommand", "getCommonQuestions");
151 exit(1);
152 }
153 }
154 //**********************************************************************************************************************
155 //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen.
ClusterSplitCommand(string option)156 ClusterSplitCommand::ClusterSplitCommand(string option) : Command() {
157 try{
158 format = "";
159
160 //allow user to run help
161 if(option == "help") { help(); abort = true; calledHelp = true; }
162 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
163 else if(option == "category") { abort = true; calledHelp = true; }
164
165 else {
166 OptionParser parser(option, setParameters());
167 map<string,string> parameters = parser.getParameters();
168
169 ValidParameters validParameter;
170
171 //check for required parameters
172 file = validParameter.validFile(parameters, "file");
173 if (file == "not open") { file = ""; abort = true; }
174 else if (file == "not found") { file = ""; }
175 else { distfile = file; type = ""; }
176
177 namefile = validParameter.validFile(parameters, "name");
178 if (namefile == "not open") { abort = true; namefile = "";}
179 else if (namefile == "not found") { namefile = ""; }
180 else { current->setNameFile(namefile); type = "name"; }
181
182 countfile = validParameter.validFile(parameters, "count");
183 if (countfile == "not open") { abort = true; countfile = "";}
184 else if (countfile == "not found") { countfile = ""; }
185 else { current->setCountFile(countfile); type = "count"; }
186
187 fastafile = validParameter.validFile(parameters, "fasta");
188 if (fastafile == "not open") { abort = true; }
189 else if (fastafile == "not found") { fastafile = ""; }
190 else { distfile = fastafile; current->setFastaFile(fastafile); }
191
192 taxFile = validParameter.validFile(parameters, "taxonomy");
193 if (taxFile == "not open") { taxFile = ""; abort = true; }
194 else if (taxFile == "not found") { taxFile = ""; }
195 else { current->setTaxonomyFile(taxFile); }
196
197 if ((fastafile == "") && (file == "")) {
198 fastafile = current->getFastaFile();
199 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter.\n"); }
200 else {
201 m->mothurOut("[ERROR]: No valid current files. When executing a cluster.split command you must enter a file file or fastafile.\n"); abort = true;
202 }
203 }
204 else if ((fastafile != "") && (file != "")) { m->mothurOut("[ERROR]: When executing a cluster.split command you must enter ONLY ONE of the following: file or fasta.\n"); abort = true; }
205
206 if ((countfile != "") && (namefile != "")) { m->mothurOut("[ERROR]: When executing a cluster.split command you must enter ONLY ONE of the following: count or name.\n"); abort = true; }
207
208 if (file != "") {
209 if ((namefile == "") && (countfile == "")) {
210 m->mothurOut("\n[WARNING]: When using the file option, it is recommended you include the name or count file. Doing so will ensure the OTUs are printed by OTU size reflecting the redundant reads, instead of just the unique reads.\n");
211 }
212 }
213
214 if (fastafile != "") {
215 if (taxFile == "") {
216 taxFile = current->getTaxonomyFile();
217 if (taxFile != "") { m->mothurOut("Using " + taxFile + " as input file for the taxonomy parameter.\n"); }
218 else {
219 m->mothurOut("[ERROR]: You need to provide a taxonomy file if you are if you are using a fasta file to generate the split.\n");
220 abort = true;
221 }
222 }
223
224 if ((namefile == "") && (countfile == "")) {
225 namefile = current->getNameFile();
226 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter.\n"); type = "name"; }
227 else {
228 countfile = current->getCountFile();
229 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter.\n"); type = "count"; }
230 else { m->mothurOut("[ERROR]: You need to provide a namefile or countfile.\n"); abort = true;
231 }
232 }
233 }
234 }
235
236
237 string temp;
238 temp = validParameter.valid(parameters, "precision");
239 if (temp == "not found") { temp = "100"; }
240 //saves precision legnth for formatting below
241 length = temp.length();
242 util.mothurConvert(temp, precision);
243
244 temp = validParameter.valid(parameters, "processors"); if (temp == "not found"){ temp = current->getProcessors(); }
245 processors = current->setProcessors(temp);
246
247 temp = validParameter.valid(parameters, "classic"); if (temp == "not found") { temp = "F"; }
248 classic = util.isTrue(temp);
249
250 temp = validParameter.valid(parameters, "runsensspec"); if (temp == "not found") { temp = "T"; }
251 runsensSpec = util.isTrue(temp);
252
253 temp = validParameter.valid(parameters, "taxlevel"); if (temp == "not found") { temp = "3"; }
254 util.mothurConvert(temp, taxLevelCutoff);
255
256 temp = validParameter.valid(parameters, "iters"); if (temp == "not found") { temp = "100"; }
257 util.mothurConvert(temp, maxIters);
258
259 temp = validParameter.valid(parameters, "delta"); if (temp == "not found") { temp = "0.0001"; }
260 util.mothurConvert(temp, stableMetric);
261
262 metricName = validParameter.valid(parameters, "metric"); if (metricName == "not found") { metricName = "mcc"; }
263
264 if ((metricName == "mcc") || (metricName == "sens") || (metricName == "spec") || (metricName == "tptn") || (metricName == "tp") || (metricName == "tn") || (metricName == "fp") || (metricName == "fn") || (metricName == "f1score") || (metricName == "accuracy") || (metricName == "ppv") || (metricName == "npv") || (metricName == "fdr") || (metricName == "fpfn") ){ }
265 else { m->mothurOut("[ERROR]: Not a valid metric. Valid metrics are mcc, sens, spec, tp, tn, fp, fn, tptn, fpfn, f1score, accuracy, ppv, npv, fdr.\n"); abort = true; }
266
267 initialize = validParameter.valid(parameters, "initialize"); if (initialize == "not found") { initialize = "singleton"; }
268
269 if ((initialize == "singleton") || (initialize == "oneotu")){ }
270 else { m->mothurOut("[ERROR]: Not a valid initialization. Valid initializations are singleton and oneotu.\n"); abort = true; }
271
272 method = validParameter.valid(parameters, "method"); if (method == "not found") { method = "opti"; }
273
274 vector<string> versionOutputs;
275 bool foundTool = false;
276 string path = current->getProgramPath();
277 string programName = "vsearch"; programName += EXECUTABLE_EXT;
278
279 vsearchLocation = validParameter.validPath(parameters, "vsearch");
280 if (vsearchLocation == "not found") {
281 vsearchLocation = "";
282 if ((method == "agc") || (method == "dgc")) {
283 foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
284 }
285 }
286 else {
287 if ((method == "agc") || (method == "dgc")) {
288 //test to make sure vsearch exists
289 ifstream in;
290 vsearchLocation = util.getFullPathName(vsearchLocation);
291 bool ableToOpen = util.openInputFile(vsearchLocation, in, "no error"); in.close();
292 if(!ableToOpen) {
293 m->mothurOut(vsearchLocation + " file does not exist or cannot be opened, ignoring.\n"); vsearchLocation = "";
294 programName = util.getSimpleName(vsearchLocation); vsearchLocation = "";
295 foundTool = util.findTool(programName, vsearchLocation, path, versionOutputs, current->getLocations());
296 }
297 }
298 }
299
300 if ((method == "furthest") || (method == "nearest") || (method == "average") || (method == "weighted") || (method == "agc") || (method == "dgc") || (method == "opti")) { }
301 else { m->mothurOut("[ERROR]: Not a valid clustering method. Valid clustering algorithms are furthest, nearest, average, weighted, agc, dgc and opti.\n"); abort = true; }
302
303 if ((method == "agc") || (method == "dgc")) {
304 if (fastafile == "") { m->mothurOut("[ERROR]: You must provide a fasta file when using the agc or dgc clustering methods, aborting\n."); abort = true;}
305 if (classic) { m->mothurOut("[ERROR]: You cannot use cluster.classic with the agc or dgc clustering methods, aborting\n."); abort = true; }
306 if (!foundTool) { abort = true; }
307 }
308
309 cutoffNotSet = false;
310 temp = validParameter.valid(parameters, "cutoff");
311 if (temp == "not found") { cutoffNotSet = true; if ((method == "opti") || (method == "agc") || (method == "dgc")) { temp = "0.03"; }else { temp = "0.15"; } }
312 util.mothurConvert(temp, cutoff);
313
314 temp = validParameter.valid(parameters, "showabund");
315 if (temp == "not found") { temp = "T"; }
316 showabund = util.isTrue(temp);
317
318 temp = validParameter.valid(parameters, "cluster"); if (temp == "not found") { temp = "T"; }
319 runCluster = util.isTrue(temp);
320
321 temp = validParameter.valid(parameters, "islist"); if (temp == "not found") { temp = "F"; }
322 isList = util.isTrue(temp);
323
324 temp = validParameter.valid(parameters, "dist"); if (temp == "not found") { temp = "F"; }
325 makeDist = util.isTrue(temp);
326 if (method == "opti") { makeDist = runsensSpec; }
327
328 if (classic && makeDist) { m->mothurOut("[ERROR]: You cannot use the dist parameter with the classic parameter. Mothur will ignore the dist parameter.\n"); makeDist = false; }
329
330 timing = validParameter.valid(parameters, "timing");
331 if (timing == "not found") { timing = "F"; }
332 }
333 }
334 catch(exception& e) {
335 m->errorOut(e, "ClusterSplitCommand", "ClusterSplitCommand");
336 exit(1);
337 }
338 }
339
340 //**********************************************************************************************************************
341
execute()342 int ClusterSplitCommand::execute(){
343 try {
344
345 if (abort) { if (calledHelp) { return 0; } return 2; }
346
347 time_t estart;
348 vector<string> listFileNames;
349 vector< map<string, string> > distName;
350 set<string> labels;
351 string singletonName = "";
352
353 double saveCutoff = cutoff;
354
355 if (file != "") {
356 deleteFiles = false; estart = time(NULL);
357 singletonName = readFile(distName);
358
359 if (isList) {
360
361 //set list file as new current listfile
362 string currentName = "";
363 itTypes = outputTypes.find("list");
364 if (itTypes != outputTypes.end()) {
365 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
366 }
367
368 m->mothurOut("\nOutput File Names: \n");
369 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] + "\n"); } m->mothurOutEndLine();
370
371 return 0;
372 }
373
374 }else {
375 //splitting
376 estart = time(NULL); bool usingVsearchToCLuster = false;
377 if ((method == "agc") || (method == "dgc")) { usingVsearchToCLuster = true; if (cutoffNotSet) { m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); cutoff = 0.03; } }
378
379 m->mothurOut("Splitting the file...\n");
380 current->setMothurCalling(true);
381
382 //split matrix into non-overlapping groups
383 SplitMatrix* split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, processors, classic, outputdir, usingVsearchToCLuster);
384
385 if (fastafile != "") { current->setFastaFile(fastafile); }
386
387 if (m->getControl_pressed()) { delete split; return 0; }
388
389 singletonName = split->getSingletonNames();
390 distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
391 delete split;
392 current->setMothurCalling(false);
393
394 if (m->getDebug()) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
395
396 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file.\n");
397
398 //output a merged distance file
399 if (makeDist) { createMergedDistanceFile(distName); }
400
401 if (m->getControl_pressed()) { return 0; }
402
403 estart = time(NULL);
404
405 if (!runCluster) {
406 string filename = printFile(singletonName, distName);
407
408 m->mothurOutEndLine();
409 m->mothurOut("Output File Names:\n\n"); m->mothurOut(filename); m->mothurOutEndLine();
410 for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
411 m->mothurOutEndLine();
412
413 return 0;
414 }
415 deleteFiles = true;
416 }
417 //****************** break up files between processes and cluster each file set ******************************//
418
419 listFileNames = createProcesses(distName, labels);
420
421 if (deleteFiles) {
422 //delete the temp files now that we are done
423 for (int i = 0; i < distName.size(); i++) {
424 string thisNamefile = distName[i].begin()->second;
425 string thisDistFile = distName[i].begin()->first;
426 util.mothurRemove(thisNamefile);
427 util.mothurRemove(thisDistFile);
428 }
429 }
430
431 if (m->getControl_pressed()) { for (int i = 0; i < listFileNames.size(); i++) { util.mothurRemove(listFileNames[i]); } return 0; }
432
433 if (!util.isEqual(saveCutoff, cutoff)) { m->mothurOut("\nCutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
434
435 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster\n");
436
437 //****************** merge list file and create rabund and sabund files ******************************//
438 estart = time(NULL);
439 m->mothurOut("Merging the clustered files...\n");
440
441 ListVector* listSingle;
442 map<double, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
443
444 if (m->getControl_pressed()) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
445
446 mergeLists(listFileNames, labelBins, listSingle);
447
448 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
449
450 //delete after all are complete incase a crash happens
451 if (!deleteFiles) { for (int i = 0; i < distName.size(); i++) { util.mothurRemove(distName[i].begin()->first); util.mothurRemove(distName[i].begin()->second); } }
452
453 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge.\n");
454
455 if ((method == "opti") && (runsensSpec)) { runSensSpec(); }
456
457 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
458
459 //set list file as new current listfile
460 string currentName = "";
461 itTypes = outputTypes.find("list");
462 if (itTypes != outputTypes.end()) {
463 if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setListFile(currentName); }
464 }
465
466 //set rabund file as new current rabundfile
467 itTypes = outputTypes.find("rabund");
468 if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setRabundFile(currentName); } }
469
470 //set sabund file as new current sabundfile
471 itTypes = outputTypes.find("sabund");
472 if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setSabundFile(currentName); } }
473
474 //set sabund file as new current sabundfile
475 itTypes = outputTypes.find("column");
476 if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentName = (itTypes->second)[0]; current->setColumnFile(currentName); } }
477
478 m->mothurOut("\nOutput File Names: \n");
479 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
480
481 return 0;
482 }
483 catch(exception& e) {
484 m->errorOut(e, "ClusterSplitCommand", "execute");
485 exit(1);
486 }
487 }
488 //**********************************************************************************************************************
completeListFile(vector<string> listNames,string singleton,set<string> & userLabels,ListVector * & listSingle)489 map<double, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
490 try {
491 map<double, int> labelBin;
492 vector<double> orderFloat;
493 int numSingleBins;
494
495 //read in singletons
496 if (singleton != "none") {
497
498 ifstream in;
499 util.openInputFile(singleton, in);
500
501 string firstCol, secondCol;
502 listSingle = new ListVector();
503
504 if (type == "count") { util.getline(in); util.gobble(in); }
505
506 while (!in.eof()) {
507 in >> firstCol >> secondCol;
508 util.getline(in);
509 if (type == "name") { listSingle->push_back(secondCol); }
510 else { listSingle->push_back(firstCol); }
511 util.gobble(in);
512 }
513
514 in.close();
515 util.mothurRemove(singleton);
516
517 numSingleBins = listSingle->getNumBins();
518 }else{ listSingle = NULL; numSingleBins = 0; }
519
520 //go through users set and make them floats so we can sort them
521 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
522 double temp = -10.0;
523
524 if ((*it != "unique") && (convertTestFloat(*it, temp) )) { util.mothurConvert(*it, temp); }
525 else if (*it == "unique") { temp = -1.0; }
526
527 if ((temp < cutoff) || util.isEqual(cutoff, temp)) {
528 orderFloat.push_back(temp);
529 labelBin[temp] = numSingleBins; //initialize numbins
530 }
531 }
532
533 //sort order
534 sort(orderFloat.begin(), orderFloat.end());
535 userLabels.clear();
536
537 //get the list info from each file
538 for (int k = 0; k < listNames.size(); k++) {
539
540 if (m->getControl_pressed()) {
541 if (listSingle != NULL) { delete listSingle; listSingle = NULL; util.mothurRemove(singleton); }
542 for (int i = 0; i < listNames.size(); i++) { util.mothurRemove(listNames[i]); }
543 return labelBin;
544 }
545
546 InputData* input = new InputData(listNames[k], "list", nullVector);
547 ListVector* list = input->getListVector();
548 string lastLabel = list->getLabel();
549
550 string filledInList = listNames[k] + "filledInTemp";
551 ofstream outFilled;
552 util.openOutputFile(filledInList, outFilled);
553 bool printHeaders = true;
554
555
556 //for each label needed
557 for(int l = 0; l < orderFloat.size(); l++){
558
559 string thisLabel;
560 if (util.isEqual(orderFloat[l],-1)) { thisLabel = "unique"; }
561 else { thisLabel = toString(orderFloat[l], length-1); }
562
563 //this file has reached the end
564 if (list == NULL) {
565 list = input->getListVector(lastLabel, true);
566 }else{ //do you have the distance, or do you need to fill in
567
568 float labelFloat;
569 if (list->getLabel() == "unique") { labelFloat = -1.0; }
570 else { convert(list->getLabel(), labelFloat); }
571
572 //check for missing labels
573 if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
574 //if its bigger get last label, otherwise keep it
575 delete list;
576 list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
577 }
578 lastLabel = list->getLabel();
579 }
580
581 //print to new file
582 list->setLabel(thisLabel);
583 list->setPrintedLabels(printHeaders);
584 list->print(outFilled, true); printHeaders = false;
585
586 //update labelBin
587 labelBin[orderFloat[l]] += list->getNumBins();
588
589 delete list;
590
591 list = input->getListVector();
592 }
593
594 if (list != NULL) { delete list; }
595 delete input;
596
597 outFilled.close();
598 util.mothurRemove(listNames[k]);
599 rename(filledInList.c_str(), listNames[k].c_str());
600 }
601
602 return labelBin;
603 }
604 catch(exception& e) {
605 m->errorOut(e, "ClusterSplitCommand", "completeListFile");
606 exit(1);
607 }
608 }
609 //**********************************************************************************************************************
mergeLists(vector<string> listNames,map<double,int> userLabels,ListVector * listSingle)610 int ClusterSplitCommand::mergeLists(vector<string> listNames, map<double, int> userLabels, ListVector* listSingle){
611 try {
612 if (outputdir == "") { outputdir += util.hasPath(distfile); }
613 fileroot = outputdir + util.getRootName(util.getSimpleName(distfile));
614
615 map<string, string> variables;
616 variables["[filename]"] = fileroot;
617 variables["[clustertag]"] = tag;
618 string sabundFileName = getOutputFileName("sabund", variables);
619 string rabundFileName = getOutputFileName("rabund", variables);
620 string listFileName = getOutputFileName("list", variables);
621
622 map<string, int> counts;
623 ofstream outList, outRabund, outSabund;
624 if (countfile == "") {
625 util.openOutputFile(sabundFileName, outSabund);
626 util.openOutputFile(rabundFileName, outRabund);
627 outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
628 outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
629
630 }else {
631 CountTable ct;
632 ct.readTable(countfile, false, false);
633 counts = ct.getNameMap();
634 }
635
636 util.openOutputFile(listFileName, outList);
637 outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
638 bool printHeaders = true;
639
640 //for each label needed
641 for(map<double, int>::iterator itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
642
643 string thisLabel;
644 if (util.isEqual(itLabel->first,-1)) { thisLabel = "unique"; }
645 else { thisLabel = toString(itLabel->first, length-1); }
646
647 //outList << thisLabel << '\t' << itLabel->second << '\t';
648
649 RAbundVector* rabund = NULL;
650 ListVector completeList;
651 completeList.setLabel(thisLabel);
652
653 if (countfile == "") {
654 rabund = new RAbundVector();
655 rabund->setLabel(thisLabel);
656 }
657
658 //add in singletons
659 if (listSingle != NULL) {
660 for (int j = 0; j < listSingle->getNumBins(); j++) {
661 //outList << listSingle->get(j) << '\t';
662 completeList.push_back(listSingle->get(j));
663 if (countfile == "") { rabund->push_back(util.getNumNames(listSingle->get(j))); }
664 }
665 }
666
667 //get the list info from each file
668 for (int k = 0; k < listNames.size(); k++) {
669
670 if (m->getControl_pressed()) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { util.mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; }
671
672 InputData* input = new InputData(listNames[k], "list", nullVector);
673 ListVector* list = input->getListVector(thisLabel);
674
675 //this file has reached the end
676 if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
677 else {
678 for (int j = 0; j < list->getNumBins(); j++) {
679 completeList.push_back(list->get(j));
680 if (countfile == "") { rabund->push_back(util.getNumNames(list->get(j))); }
681 }
682 delete list;
683 }
684 delete input;
685 }
686
687 if (countfile == "") {
688 SAbundVector sabund = rabund->getSAbundVector();
689 sabund.print(outSabund);
690 rabund->print(outRabund);
691 }
692
693 completeList.setPrintedLabels(printHeaders);
694 if (countfile != "") { completeList.print(outList, counts); printHeaders = false; }
695 else { completeList.print(outList); printHeaders = false; }
696
697 if (rabund != NULL) { delete rabund; }
698 }
699
700 outList.close();
701 if (countfile == "") {
702 outRabund.close();
703 outSabund.close();
704 }
705 if (listSingle != NULL) { delete listSingle; }
706
707 for (int i = 0; i < listNames.size(); i++) { util.mothurRemove(listNames[i]); }
708
709 return 0;
710 }
711 catch(exception& e) {
712 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
713 exit(1);
714 }
715 }
716 /**************************************************************************************************/
717 struct clusterData {
718 MothurOut* m;
719 Utils util;
720 int count, precision, length, maxIters; //numSingletons,
721 bool showabund, classic, useName, useCount, deleteFiles, cutoffNotSet;
722 double cutoff, stableMetric;
723 ofstream outList, outRabund, outSabund;
724 string tag, method, vsearchLocation, metricName, initialize, outputDir, type;
725 vector< map<string, string> > distNames;
726 set<string> labels;
727 vector<string> listFileNames;
728
clusterDataclusterData729 clusterData(){}
clusterDataclusterData730 clusterData(bool showab, bool cla, bool df, vector< map<string, string> > dN, bool cns, double cu, int prec, int len, string meth, string opd, string vl, string ty) {
731 showabund = showab;
732 distNames = dN;
733 cutoff = cu;
734 classic = cla;
735 method = meth;
736 precision = prec;
737 length = len;
738 outputDir = opd;
739 vsearchLocation = vl;
740 deleteFiles = df;
741 cutoffNotSet = cns;
742 m = MothurOut::getInstance();
743 count = 0;
744 type = ty;
745 useName = false;
746 useCount = false;
747 //numSingletons = 0;
748 }
setOptiOptionsclusterData749 void setOptiOptions(string metn, double stabMet, string init, int mxi ) {
750 metricName = metn;
751 stableMetric = stabMet;
752 maxIters = mxi;
753 initialize = init;
754 }
setNamesCountclusterData755 void setNamesCount(string cnf) {
756 useName = false;
757 useCount = false;
758 if (type == "name") { useName = true; }
759 if (type == "count") { useCount = true; }
760 }
761 };
762 //**********************************************************************************************************************
createRabund(CountTable * & ct,ListVector * & list,RAbundVector * & rabund,clusterData * params)763 int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund, clusterData* params){
764 try {
765 rabund->setLabel(list->getLabel());
766 for(int i = 0; i < list->getNumBins(); i++) {
767 if (params->m->getControl_pressed()) { break; }
768 vector<string> binNames;
769 string bin = list->get(i);
770 params->util.splitAtComma(bin, binNames);
771 int total = 0;
772 for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); }
773 rabund->push_back(total);
774 }
775 return 0;
776 }
777 catch(exception& e) {
778 params->m->errorOut(e, "ClusterSplitCommand", "createRabund");
779 exit(1);
780 }
781
782 }
783 //**********************************************************************************************************************
clusterClassicFile(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)784 string clusterClassicFile(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
785 try {
786 string listFileName = "";
787
788 ListVector* list = NULL;
789 ListVector oldList;
790 RAbundVector* rabund = NULL;
791
792 params->m->mothurOut("\nReading " + thisDistFile + "\n");
793
794 //reads phylip file storing data in 2D vector, also fills list and rabund
795 bool sim = false;
796 ClusterClassic cluster(params->cutoff, params->method, sim);
797
798 NameAssignment* nameMap = NULL;
799 CountTable* ct = NULL;
800 if(params->useName){
801 nameMap = new NameAssignment(thisNamefile);
802 nameMap->readMap();
803 cluster.readPhylipFile(thisDistFile, nameMap);
804 }else if (params->useCount) {
805 ct = new CountTable();
806 ct->readTable(thisNamefile, false, false);
807 cluster.readPhylipFile(thisDistFile, ct);
808 }
809 params->tag = cluster.getTag();
810
811 if (params->m->getControl_pressed()) { if(params->useName){ delete nameMap; }
812 else if (params->useCount) { delete ct; } return listFileName; }
813
814 list = cluster.getListVector();
815 rabund = cluster.getRAbundVector();
816
817 string thisOutputDir = params->outputDir;
818 if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
819 string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
820 listFileName = fileroot+ params->tag + ".list";
821
822 ofstream listFile;
823 params->util.openOutputFile(fileroot+ params->tag + ".list", listFile);
824
825 float previousDist = 0.00000;
826 float rndPreviousDist = 0.00000;
827 bool printHeaders = true;
828 oldList = *list;
829
830 params->m->mothurOut("\nClustering " + thisDistFile + "\n");
831
832 while ((cluster.getSmallDist() < params->cutoff) && (cluster.getNSeqs() > 1)){
833 if (params->m->getControl_pressed()) { delete list; delete rabund; listFile.close(); if(params->useName){ delete nameMap; }
834 else if (params->useCount) { delete ct; } return listFileName; }
835
836 cluster.update(params->cutoff);
837
838 float dist = cluster.getSmallDist();
839 float rndDist = params->util.ceilDist(dist, params->precision);
840
841 if(previousDist <= 0.0000 && !params->util.isEqual(dist, previousDist)){
842 oldList.setLabel("unique");
843 oldList.setPrintedLabels(printHeaders);
844 oldList.print(listFile); printHeaders = false;
845 if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
846 }
847 else if(!params->util.isEqual(rndDist, rndPreviousDist)){
848 oldList.setLabel(toString(rndPreviousDist, params->length-1));
849 oldList.setPrintedLabels(printHeaders);
850 oldList.print(listFile); printHeaders = false;
851 if (params->labels.count(toString(rndPreviousDist, params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist, params->length-1)); }
852 }
853
854
855 previousDist = dist;
856 rndPreviousDist = rndDist;
857 oldList = *list;
858 }
859
860 if(previousDist <= 0.0000){
861 oldList.setLabel("unique");
862 oldList.setPrintedLabels(printHeaders);
863 oldList.print(listFile); printHeaders = false;
864 if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
865 }
866 else if(rndPreviousDist<params->cutoff){
867 oldList.setLabel(toString(rndPreviousDist, params->length-1));
868 oldList.setPrintedLabels(printHeaders);
869 oldList.print(listFile); printHeaders = false;
870 if (params->labels.count(toString(rndPreviousDist, params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist, params->length-1)); }
871 }
872 listFile.close();
873
874 delete list; delete rabund;
875 if(params->useName) { delete nameMap; }
876 else if (params->useCount) { delete ct; }
877
878 if (params->deleteFiles) {
879 params->util.mothurRemove(thisDistFile);
880 params->util.mothurRemove(thisNamefile);
881 }
882 return listFileName;
883 }
884 catch(exception& e) {
885 params->m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
886 exit(1);
887 }
888 }
889 //**********************************************************************************************************************
runOptiCluster(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)890 string runOptiCluster(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
891 try {
892 if (params->cutoffNotSet) { params->m->mothurOut("\nYou did not set a cutoff, using 0.03.\n"); params->cutoff = 0.03; }
893
894 string nameOrCount = params->type;
895
896 OptiMatrix matrix(thisDistFile, thisNamefile, nameOrCount, "column", params->cutoff, false);
897
898 ClusterMetric* metric = NULL;
899 if (params->metricName == "mcc") { metric = new MCC(); }
900 else if (params->metricName == "sens") { metric = new Sensitivity(); }
901 else if (params->metricName == "spec") { metric = new Specificity(); }
902 else if (params->metricName == "tptn") { metric = new TPTN(); }
903 else if (params->metricName == "tp") { metric = new TP(); }
904 else if (params->metricName == "tn") { metric = new TN(); }
905 else if (params->metricName == "fp") { metric = new FP(); }
906 else if (params->metricName == "fn") { metric = new FN(); }
907 else if (params->metricName == "f1score") { metric = new F1Score(); }
908 else if (params->metricName == "accuracy") { metric = new Accuracy(); }
909 else if (params->metricName == "ppv") { metric = new PPV(); }
910 else if (params->metricName == "npv") { metric = new NPV(); }
911 else if (params->metricName == "fdr") { metric = new FDR(); }
912 else if (params->metricName == "fpfn") { metric = new FPFN(); }
913
914 OptiCluster cluster(&matrix, metric, 0);
915 params->tag = cluster.getTag();
916
917 params->m->mothurOut("\nClustering " + thisDistFile + "\n");
918
919 string thisOutputDir = params->outputDir;
920 if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
921 string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
922 string listFileName = fileroot+ params->tag + ".list";
923
924 int iters = 0;
925 double listVectorMetric = 0; //worst state
926 double delta = 1;
927
928 cluster.initialize(listVectorMetric, true, params->initialize);
929
930 while ((delta > params->stableMetric) && (iters < params->maxIters)) {
931
932 if (params->m->getControl_pressed()) { if (params->deleteFiles) { params->util.mothurRemove(thisDistFile); params->util.mothurRemove(thisNamefile); } return listFileName; }
933 double oldMetric = listVectorMetric;
934 cluster.update(listVectorMetric);
935
936 delta = abs(oldMetric - listVectorMetric);
937 iters++;
938 }
939
940 if (params->m->getControl_pressed()) { delete metric; metric = NULL; return 0; }
941
942 ListVector* list = cluster.getList();
943 list->setLabel(toString(smallestCutoff));
944 //params->cutoff = params->util.ceilDist(params->cutoff, params->precision);
945 params->labels.insert(toString(smallestCutoff));
946
947 ofstream listFile;
948 params->util.openOutputFile(listFileName, listFile);
949 list->print(listFile);
950 listFile.close();
951
952 if (params->deleteFiles) {
953 params->util.mothurRemove(thisDistFile);
954 params->util.mothurRemove(thisNamefile);
955 }
956
957 double tp, tn, fp, fn;
958 params->m->mothurOut("\ntp\ttn\tfp\tfn\tsensitivity\tspecificity\tppv\tnpv\tfdr\taccuracy\tmcc\tf1score\n");
959 vector<double> results = cluster.getStats(tp, tn, fp, fn);
960 params->m->mothurOut(toString(tp) + "\t" + toString(tn) + "\t" + toString(fp) + "\t" + toString(fn) + "\t");
961 for (int i = 0; i < results.size(); i++) { params->m->mothurOut(toString(results[i]) + "\t"); }
962 params->m->mothurOut("\n\n");
963 return listFileName;
964
965 }
966 catch(exception& e) {
967 params->m->errorOut(e, "ClusterSplitCommand", "runOptiCluster");
968 exit(1);
969 }
970 }
971 //**********************************************************************************************************************
972
vsearchDriver(string inputFile,string ucClusteredFile,string logfile,double cutoff,clusterData * params)973 int vsearchDriver(string inputFile, string ucClusteredFile, string logfile, double cutoff, clusterData* params){
974 try {
975
976 //vsearch --maxaccepts 16 --usersort --id 0.97 --minseqlength 30 --wordlength 8 --uc $ROOT.clustered.uc --cluster_smallmem $ROOT.sorted.fna --maxrejects 64 --strand both --log $ROOT.clustered.log --sizeorder
977
978
979 ucClusteredFile = params->util.getFullPathName(ucClusteredFile);
980 inputFile = params->util.getFullPathName(inputFile);
981 logfile = params->util.getFullPathName(logfile);
982
983 //to allow for spaces in the path
984 ucClusteredFile = "\"" + ucClusteredFile + "\"";
985 inputFile = "\"" + inputFile + "\"";
986 logfile = "\"" + logfile + "\"";
987
988 vector<char*> cPara;
989
990 string vsearchCommand = params->vsearchLocation;
991 vsearchCommand = "\"" + vsearchCommand + "\" ";
992
993 vector<char*> vsearchParameters;
994 vsearchParameters.push_back(params->util.mothurConvert(vsearchCommand));
995
996 //--maxaccepts=16
997 vsearchParameters.push_back(params->util.mothurConvert("--maxaccepts=16"));
998
999 //--threads=1
1000 string processorsString = "--threads=1";
1001 vsearchParameters.push_back(params->util.mothurConvert(processorsString));
1002
1003 //--usersort
1004 vsearchParameters.push_back(params->util.mothurConvert("--usersort"));
1005
1006 //--id=0.97
1007 cutoff = abs(1.0 - cutoff); string cutoffString = toString(cutoff);
1008 if (cutoffString.length() > 4) { cutoffString = cutoffString.substr(0, 4); }
1009 else if (cutoffString.length() < 4) { for (int i = cutoffString.length(); i < 4; i++) { cutoffString += "0"; } }
1010
1011 cutoffString = "--id=" + cutoffString;
1012 vsearchParameters.push_back(params->util.mothurConvert(cutoffString));
1013
1014 //--minseqlength=30
1015 vsearchParameters.push_back(params->util.mothurConvert("--minseqlength=30"));
1016
1017 //--wordlength=8
1018 vsearchParameters.push_back(params->util.mothurConvert("--wordlength=8"));
1019
1020 //--uc=$ROOT.clustered.uc
1021 string tempIn = "--uc=" + ucClusteredFile;
1022 vsearchParameters.push_back(params->util.mothurConvert(tempIn));
1023
1024 //--cluster_smallmem $ROOT.sorted.fna
1025 string tempSorted = "--cluster_smallmem=" + inputFile;
1026 vsearchParameters.push_back(params->util.mothurConvert(tempSorted));
1027
1028 //--maxrejects=64
1029 vsearchParameters.push_back(params->util.mothurConvert("--maxrejects=64"));
1030
1031 //--strand=both
1032 vsearchParameters.push_back(params->util.mothurConvert("--strand=both"));
1033
1034 //--log=$ROOT.clustered.log
1035 string tempLog = "--log=" + logfile;
1036 vsearchParameters.push_back(params->util.mothurConvert(tempLog));
1037
1038 if (params->method == "agc") { //--sizeorder
1039 vsearchParameters.push_back(params->util.mothurConvert("--sizeorder"));
1040 }
1041
1042 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: "); for(int i = 0; i < vsearchParameters.size(); i++) { params->m->mothurOut(toString(vsearchParameters[i]) + "\t"); } params->m->mothurOut("\n"); }
1043
1044 string commandString = "";
1045 for (int i = 0; i < vsearchParameters.size(); i++) { commandString += toString(vsearchParameters[i]) + " "; }
1046
1047 #if defined NON_WINDOWS
1048 #else
1049 commandString = "\"" + commandString + "\"";
1050 #endif
1051 if (params->m->getDebug()) { params->m->mothurOut("[DEBUG]: vsearch cluster command = " + commandString + ".\n"); }
1052
1053 system(commandString.c_str());
1054
1055 //free memory
1056 for(int i = 0; i < vsearchParameters.size(); i++) { delete vsearchParameters[i]; }
1057
1058 //remove "" from filenames
1059 ucClusteredFile = ucClusteredFile.substr(1, ucClusteredFile.length()-2);
1060 inputFile = inputFile.substr(1, inputFile.length()-2);
1061 logfile = logfile.substr(1, logfile.length()-2);
1062
1063 return 0;
1064
1065 }
1066 catch(exception& e) {
1067 params->m->errorOut(e, "ClusterSplitCommand", "vsearchDriver");
1068 exit(1);
1069 }
1070 }
1071 //**********************************************************************************************************************
runVsearchCluster(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)1072 string runVsearchCluster(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
1073 try {
1074
1075 params->m->mothurOut("\nClustering " + thisDistFile + "\n");
1076
1077 string vsearchFastafile = ""; VsearchFileParser* vParse;
1078 if (params->useName) { vParse = new VsearchFileParser(thisDistFile, thisNamefile, "name"); }
1079 else if (params->useCount) { vParse = new VsearchFileParser(thisDistFile, thisNamefile, "count"); }
1080 else { params->m->mothurOut("[ERROR]: Opps, should never get here. ClusterSplitCommand::runVsearchCluster() \n"); params->m->setControl_pressed(true); return ""; }
1081
1082 if (params->m->getControl_pressed()) { delete vParse; return ""; }
1083
1084 vsearchFastafile = vParse->getVsearchFile();
1085
1086 if (params->cutoff > 1.0) { params->m->mothurOut("You did not set a cutoff, using 0.03.\n"); params->cutoff = 0.03; }
1087
1088 //Run vsearch
1089 string ucVsearchFile = params->util.getSimpleName(vsearchFastafile) + ".clustered.uc";
1090 string logfile = params->util.getSimpleName(vsearchFastafile) + ".clustered.log";
1091 vsearchDriver(vsearchFastafile, ucVsearchFile, logfile, smallestCutoff, params);
1092
1093 if (params->m->getControl_pressed()) { params->util.mothurRemove(ucVsearchFile); params->util.mothurRemove(logfile); params->util.mothurRemove(vsearchFastafile); return ""; }
1094
1095 string thisOutputDir = params->outputDir;
1096 if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
1097 params->tag = params->method;
1098 string listFileName = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile)) + params->tag + ".list";
1099
1100 //Convert outputted *.uc file into a list file
1101 map<string, int> counts;
1102 ListVector list = vParse->createListFile(ucVsearchFile, vParse->getNumBins(logfile), toString(params->cutoff), counts);
1103
1104 ofstream out;
1105 params->util.openOutputFile(listFileName, out);
1106
1107 list.DataVector::printHeaders(out);
1108
1109 if (params->useCount) { list.print(out, counts); }
1110 else { list.print(out); } delete vParse;
1111
1112 //remove temp files
1113 params->util.mothurRemove(ucVsearchFile); params->util.mothurRemove(logfile); params->util.mothurRemove(vsearchFastafile);
1114
1115 if (params->deleteFiles) {
1116 params->util.mothurRemove(thisDistFile);
1117 params->util.mothurRemove(thisNamefile);
1118 }
1119 params->labels.insert(toString(params->cutoff));
1120
1121 return listFileName;
1122 }
1123 catch(exception& e) {
1124 params->m->errorOut(e, "ClusterSplitCommand", "runVsearchCluster");
1125 exit(1);
1126 }
1127 }
1128 //**********************************************************************************************************************
clusterFile(string thisDistFile,string thisNamefile,double & smallestCutoff,clusterData * params)1129 string clusterFile(string thisDistFile, string thisNamefile, double& smallestCutoff, clusterData* params){
1130 try {
1131 string listFileName = "";
1132
1133 if ((params->method == "agc") || (params->method == "dgc")) { listFileName = runVsearchCluster(thisDistFile, thisNamefile, smallestCutoff, params); }
1134 else if (params->method == "opti") { listFileName = runOptiCluster(thisDistFile, thisNamefile, smallestCutoff, params); }
1135 else {
1136
1137 Cluster* cluster = NULL;
1138 SparseDistanceMatrix* matrix = NULL;
1139 ListVector* list = NULL;
1140 ListVector oldList;
1141 RAbundVector* rabund = NULL;
1142
1143 if (params->m->getControl_pressed()) { return listFileName; }
1144
1145 params->m->mothurOut("\nReading " + thisDistFile + "\n");
1146
1147 ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
1148 read->setCutoff(params->cutoff);
1149
1150 NameAssignment* nameMap = NULL;
1151 CountTable* ct = NULL;
1152 if(params->useName){
1153 nameMap = new NameAssignment(thisNamefile);
1154 nameMap->readMap();
1155 read->read(nameMap);
1156 }else if (params->useCount) {
1157 ct = new CountTable();
1158 ct->readTable(thisNamefile, false, false);
1159 read->read(ct);
1160 }else { read->read(nameMap); }
1161
1162 list = read->getListVector();
1163 matrix = read->getDMatrix();
1164
1165 if(params->useCount) {
1166 rabund = new RAbundVector();
1167 createRabund(ct, list, rabund, params); //creates an rabund that includes the counts for the unique list
1168 delete ct;
1169 }else { rabund = new RAbundVector(list->getRAbundVector()); }
1170
1171 delete read; read = NULL;
1172 if (params->useName) { delete nameMap; nameMap = NULL; }
1173
1174 params->m->mothurOut("\nClustering " + thisDistFile + "\n");
1175
1176 //create cluster
1177 float adjust = -1.0;
1178 if (params->method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, params->cutoff, params->method, adjust); }
1179 else if(params->method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, params->cutoff, params->method, adjust); }
1180 else if(params->method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, params->cutoff, params->method, adjust); }
1181 params->tag = cluster->getTag();
1182
1183 string thisOutputDir = params->outputDir;
1184 if (params->outputDir == "") { thisOutputDir += params->util.hasPath(thisDistFile); }
1185 string fileroot = thisOutputDir + params->util.getRootName(params->util.getSimpleName(thisDistFile));
1186 listFileName = fileroot+ params->tag + ".list";
1187
1188 ofstream listFile;
1189 params->util.openOutputFile(listFileName, listFile);
1190
1191 float previousDist = 0.00000;
1192 float rndPreviousDist = 0.00000;
1193 bool printHeaders = true;
1194
1195 oldList = *list;
1196
1197 double saveCutoff = params->cutoff;
1198
1199 while (matrix->getSmallDist() < params->cutoff && matrix->getNNodes() > 0){
1200
1201 if (params->m->getControl_pressed()) { //clean up
1202 delete matrix; delete list; delete cluster; delete rabund;
1203 listFile.close();
1204 params->util.mothurRemove(listFileName);
1205 return listFileName;
1206 }
1207
1208 cluster->update(saveCutoff);
1209
1210 float dist = matrix->getSmallDist();
1211 float rndDist = params->util.ceilDist(dist, params->precision);
1212
1213 if(previousDist <= 0.0000 && !params->util.isEqual(dist, previousDist)){
1214 oldList.setLabel("unique");
1215 oldList.setPrintedLabels(printHeaders);
1216 oldList.print(listFile); printHeaders = false;
1217 if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
1218 }
1219 else if(!params->util.isEqual(rndDist, rndPreviousDist)){
1220 oldList.setPrintedLabels(printHeaders);
1221 oldList.setLabel(toString(rndPreviousDist, params->length-1));
1222 oldList.setPrintedLabels(printHeaders);
1223 oldList.print(listFile); printHeaders = false;
1224 if (params->labels.count(toString(rndPreviousDist, params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist, params->length-1)); }
1225 }
1226
1227 previousDist = dist;
1228 rndPreviousDist = rndDist;
1229 oldList = *list;
1230 }
1231
1232
1233 if(previousDist <= 0.0000){
1234 oldList.setLabel("unique");
1235 oldList.setPrintedLabels(printHeaders);
1236 oldList.print(listFile); printHeaders = false;
1237 if (params->labels.count("unique") == 0) { params->labels.insert("unique"); }
1238 }
1239 else if(rndPreviousDist<params->cutoff){
1240 oldList.setLabel(toString(rndPreviousDist, params->length-1));
1241 oldList.setPrintedLabels(printHeaders);
1242 oldList.print(listFile); printHeaders = false;
1243 if (params->labels.count(toString(rndPreviousDist, params->length-1)) == 0) { params->labels.insert(toString(rndPreviousDist, params->length-1)); }
1244 }
1245
1246 delete matrix; delete list; delete cluster; delete rabund;
1247 matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
1248 listFile.close();
1249
1250 if (params->m->getControl_pressed()) { //clean up
1251 params->util.mothurRemove(listFileName);
1252 return listFileName;
1253 }
1254
1255 if (params->deleteFiles) {
1256 params->util.mothurRemove(thisDistFile);
1257 params->util.mothurRemove(thisNamefile);
1258 }
1259
1260 if (!params->util.isEqual(saveCutoff, params->cutoff)) {
1261 saveCutoff = params->util.ceilDist(saveCutoff, params->precision);
1262 params->m->mothurOut("Cutoff was " + toString(params->cutoff) + " changed cutoff to " + toString(saveCutoff) + "\n");
1263 }
1264
1265 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
1266 }
1267 return listFileName;
1268 }
1269 catch(exception& e) {
1270 params->m->errorOut(e, "ClusterSplitCommand", "clusterFile");
1271 exit(1);
1272 }
1273 }
1274 //**********************************************************************************************************************
cluster(clusterData * params)1275 void cluster(clusterData* params){
1276 try {
1277 vector<string> listFileNames;
1278 double smallestCutoff = params->cutoff;
1279
1280 //cluster each distance file
1281 for (int i = 0; i < params->distNames.size(); i++) {
1282
1283 string thisNamefile = params->distNames[i].begin()->second;
1284 string thisDistFile = params->distNames[i].begin()->first;
1285
1286 params->setNamesCount(thisNamefile);
1287
1288 string listFileName = "";
1289 if (params->classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, smallestCutoff, params); }
1290 else { listFileName = clusterFile(thisDistFile, thisNamefile, smallestCutoff, params); }
1291
1292 if (params->m->getControl_pressed()) { //clean up
1293 for (int i = 0; i < listFileNames.size(); i++) { params->util.mothurRemove(listFileNames[i]); }
1294 params->listFileNames.clear(); break;
1295 }
1296 params->listFileNames.push_back(listFileName);
1297 }
1298 params->cutoff = smallestCutoff;
1299 }
1300 catch(exception& e) {
1301 params->m->errorOut(e, "ClusterSplitCommand", "cluster");
1302 exit(1);
1303 }
1304
1305
1306 }
1307 //**********************************************************************************************************************
printData(ListVector * oldList,clusterData * params)1308 void printData(ListVector* oldList, clusterData* params){
1309 try {
1310 string label = oldList->getLabel();
1311 RAbundVector oldRAbund = oldList->getRAbundVector();
1312
1313 oldRAbund.setLabel(label);
1314 if (params->showabund) {
1315 oldRAbund.getSAbundVector().print(cout);
1316 }
1317 oldRAbund.print(params->outRabund);
1318 oldRAbund.getSAbundVector().print(params->outSabund);
1319
1320 oldList->print(params->outList, true);
1321 }
1322 catch(exception& e) {
1323 params->m->errorOut(e, "ClusterSplitCommand", "printData");
1324 exit(1);
1325 }
1326 }
1327 //**********************************************************************************************************************
createProcesses(vector<map<string,string>> distName,set<string> & labels)1328 vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
1329 try {
1330 //sanity check
1331 if (processors > distName.size()) { processors = distName.size(); }
1332 deleteFiles = false; //so if we need to recalc the processors the files are still there
1333 vector<string> listFiles;
1334 vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
1335 dividedNames.resize(processors);
1336
1337 //for each file group figure out which process will complete it
1338 //want to divide the load intelligently so the big files are spread between processes
1339 for (int i = 0; i < distName.size(); i++) {
1340 int processToAssign = (i+1) % processors;
1341 if (processToAssign == 0) { processToAssign = processors; }
1342
1343 dividedNames[(processToAssign-1)].push_back(distName[i]);
1344 if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
1345 }
1346
1347 //now lets reverse the order of ever other process, so we balance big files running with little ones
1348 for (int i = 0; i < processors; i++) {
1349 int remainder = ((i+1) % processors);
1350 if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
1351 }
1352
1353 if (m->getControl_pressed()) { return listFiles; }
1354
1355
1356 //create array of worker threads
1357 vector<std::thread*> workerThreads;
1358 vector<clusterData*> data;
1359
1360 //Lauch worker threads
1361 for (int i = 0; i < processors-1; i++) {
1362 clusterData* dataBundle = new clusterData(showabund, classic, deleteFiles, dividedNames[i+1], cutoffNotSet, cutoff, precision, length, method, outputdir, vsearchLocation, type);
1363 dataBundle->setOptiOptions(metricName, stableMetric, initialize, maxIters);
1364 data.push_back(dataBundle);
1365
1366 workerThreads.push_back(new std::thread(cluster, dataBundle));
1367 }
1368
1369
1370 clusterData* dataBundle = new clusterData(showabund, classic, deleteFiles, dividedNames[0], cutoffNotSet, cutoff, precision, length, method, outputdir, vsearchLocation, type);
1371 dataBundle->setOptiOptions(metricName, stableMetric, initialize, maxIters);
1372 cluster(dataBundle);
1373 listFiles = dataBundle->listFileNames;
1374 tag = dataBundle->tag;
1375 cutoff = dataBundle->cutoff;
1376 labels = dataBundle->labels;
1377
1378
1379 for (int i = 0; i < processors-1; i++) {
1380 workerThreads[i]->join();
1381
1382 listFiles.insert(listFiles.end(), data[i]->listFileNames.begin(), data[i]->listFileNames.end());
1383 labels.insert(data[i]->labels.begin(), data[i]->labels.end());
1384 if (data[i]->cutoff < cutoff) { cutoff = data[i]->cutoff; }
1385
1386 delete data[i];
1387 delete workerThreads[i];
1388 }
1389 delete dataBundle;
1390 deleteFiles = true;
1391
1392 return listFiles;
1393 }
1394 catch(exception& e) {
1395 m->errorOut(e, "ClusterSplitCommand", "createProcesses");
1396 exit(1);
1397 }
1398 }
1399 //**********************************************************************************************************************
1400
createMergedDistanceFile(vector<map<string,string>> distNames)1401 int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
1402 try{
1403 string thisOutputDir = outputdir;
1404 if (outputdir == "") { thisOutputDir = util.hasPath(fastafile); }
1405 map<string, string> variables;
1406 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(fastafile));
1407 string outputFileName = getOutputFileName("column", variables);
1408 util.mothurRemove(outputFileName);
1409
1410
1411 for (int i = 0; i < distNames.size(); i++) {
1412 if (m->getControl_pressed()) { return 0; }
1413
1414 string thisDistFile = distNames[i].begin()->first;
1415
1416 util.appendFiles(thisDistFile, outputFileName);
1417 }
1418
1419 outputTypes["column"].push_back(outputFileName); outputNames.push_back(outputFileName);
1420
1421 return 0;
1422 }
1423 catch(exception& e) {
1424 m->errorOut(e, "ClusterSplitCommand", "createMergedDistanceFile");
1425 exit(1);
1426 }
1427 }
1428 //**********************************************************************************************************************
1429
runSensSpec()1430 int ClusterSplitCommand::runSensSpec() {
1431 try{
1432 string listFile = "";
1433 itTypes = outputTypes.find("list");
1434 if (itTypes != outputTypes.end()) {
1435 if ((itTypes->second).size() != 0) { listFile = (itTypes->second)[0]; }
1436 }
1437
1438 string columnFile = "";
1439 if (makeDist) {
1440 itTypes = outputTypes.find("column");
1441 if (itTypes != outputTypes.end()) {
1442 if ((itTypes->second).size() != 0) { columnFile = (itTypes->second)[0]; }
1443 }
1444 }
1445
1446 string inputString = "cutoff=" + toString(cutoff) + ", list=" + listFile;
1447 if (columnFile != "") { inputString += ", column=" + columnFile; }
1448 else { m->mothurOut("[WARNING]: Cannot run sens.spec analysis without a column file, skipping."); return 0; }
1449
1450 if (namefile != "") { inputString += ", name=" + namefile; }
1451 else if (countfile != "") { inputString += ", count=" + countfile; }
1452 else { m->mothurOut("[WARNING]: Cannot run sens.spec analysis without a name or count file, skipping."); return 0; }
1453
1454 m->mothurOut("/******************************************/\n");
1455 m->mothurOut("Running command: sens.spec(" + inputString + ")\n");
1456 current->setMothurCalling(true);
1457
1458 Command* sensspecCommand = new SensSpecCommand(inputString);
1459 sensspecCommand->execute();
1460
1461 map<string, vector<string> > filenames = sensspecCommand->getOutputFiles();
1462
1463 delete sensspecCommand;
1464 current->setMothurCalling(false);
1465
1466 string outputFileName = filenames["sensspec"][0];
1467
1468 outputTypes["sensspec"].push_back(outputFileName); outputNames.push_back(outputFileName);
1469
1470 m->mothurOut("/******************************************/\n");
1471 m->mothurOut("Done.\n\n\n");
1472
1473 ifstream in;
1474 util.openInputFile(outputFileName, in);
1475
1476 while(!in.eof()){
1477 if (m->getControl_pressed()) { break; }
1478
1479 m->mothurOut(util.getline(in)+"\n"); util.gobble(in);
1480 }
1481 in.close();
1482
1483 return 0;
1484 }
1485 catch(exception& e) {
1486 m->errorOut(e, "ClusterSplitCommand", "runSensSpec");
1487 exit(1);
1488 }
1489 }
1490 //**********************************************************************************************************************
printFile(string singleton,vector<map<string,string>> & distName)1491 string ClusterSplitCommand::printFile(string singleton, vector< map<string, string> >& distName){
1492 try {
1493 ofstream out;
1494 map<string, string> variables;
1495 string thisOutputDir = outputdir;
1496 if (outputdir == "") { thisOutputDir = util.hasPath(distfile); }
1497 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(distfile));
1498 string outputFileName = getOutputFileName("file", variables);
1499 util.openOutputFile(outputFileName, out);
1500 outputTypes["file"].push_back(outputFileName); outputNames.push_back(outputFileName);
1501 current->setFileFile(outputFileName);
1502
1503 out << singleton << endl;
1504 if (namefile != "") { out << "name" << endl; }
1505 else if (countfile != "") { out << "count" << endl; }
1506 else { out << "unknown" << endl; }
1507
1508 for (int i = 0; i < distName.size(); i++) { out << distName[i].begin()->first << '\t' << distName[i].begin()->second << endl; }
1509 out.close();
1510
1511 return outputFileName;
1512 }
1513 catch(exception& e) {
1514 m->errorOut(e, "ClusterSplitCommand", "printFile");
1515 exit(1);
1516 }
1517
1518 }
1519 //**********************************************************************************************************************
readFile(vector<map<string,string>> & distName)1520 string ClusterSplitCommand::readFile(vector< map<string, string> >& distName){
1521 try {
1522
1523 string singleton, thiscolumn, thisname;
1524
1525 ifstream in;
1526 util.openInputFile(file, in);
1527
1528 in >> singleton; util.gobble(in);
1529
1530 string path = util.hasPath(singleton);
1531 if (path == "") { singleton = inputDir + singleton; }
1532
1533 in >> type; util.gobble(in);
1534
1535 if (type == "name") { }
1536 else if (type == "count") { }
1537 else { m->mothurOut("[ERROR]: unknown file type. Are the files in column 2 of the file name files or count files? Please change unknown to name or count.\n"); m->setControl_pressed(true); }
1538
1539 if (isList) {
1540
1541 vector<string> listFileNames;
1542 string thisListFileName = "";
1543 set<string> listLabels;
1544
1545 while(!in.eof()) {
1546 if (m->getControl_pressed()) { break; }
1547
1548 in >> thisListFileName; util.gobble(in);
1549
1550 string path = util.hasPath(thisListFileName);
1551 if (path == "") { thisListFileName = inputDir + thisListFileName; }
1552
1553 getLabels(thisListFileName, listLabels);
1554 listFileNames.push_back(thisListFileName);
1555 }
1556
1557 ListVector* listSingle;
1558 map<double, int> labelBins = completeListFile(listFileNames, singleton, listLabels, listSingle);
1559
1560 mergeLists(listFileNames, labelBins, listSingle);
1561
1562 }else {
1563
1564 while(!in.eof()) {
1565 if (m->getControl_pressed()) { break; }
1566
1567 in >> thiscolumn; util.gobble(in);
1568 in >> thisname; util.gobble(in);
1569
1570 map<string, string> temp;
1571 temp[thiscolumn] = thisname;
1572 distName.push_back(temp);
1573 }
1574 }
1575
1576 in.close();
1577
1578 return singleton;
1579 }
1580 catch(exception& e) {
1581 m->errorOut(e, "ClusterSplitCommand", "readFile");
1582 exit(1);
1583 }
1584
1585 }
1586 //**********************************************************************************************************************
getLabels(string file,set<string> & listLabels)1587 int ClusterSplitCommand::getLabels(string file, set<string>& listLabels){
1588 try {
1589 ifstream in;
1590 util.openInputFile(file, in);
1591
1592 //read headers
1593 util.getline(in); util.gobble(in);
1594
1595 string label;
1596 while(!in.eof()) {
1597 if (m->getControl_pressed()) { break; }
1598
1599 in >> label; util.getline(in); util.gobble(in);
1600
1601 listLabels.insert(label);
1602 }
1603
1604 in.close();
1605
1606 return 0;
1607 }
1608 catch(exception& e) {
1609 m->errorOut(e, "ClusterSplitCommand", "getLabels");
1610 exit(1);
1611 }
1612
1613 }
1614 //**********************************************************************************************************************
1615